summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/arm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
commit26a029d407be480d791972afb5975cf62c9360a6 (patch)
treef435a8308119effd964b339f76abb83a57c29483 /third_party/dav1d/src/arm
parentInitial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/arm')
-rw-r--r--third_party/dav1d/src/arm/32/cdef.S540
-rw-r--r--third_party/dav1d/src/arm/32/cdef16.S233
-rw-r--r--third_party/dav1d/src/arm/32/cdef_tmpl.S515
-rw-r--r--third_party/dav1d/src/arm/32/filmgrain.S2039
-rw-r--r--third_party/dav1d/src/arm/32/filmgrain16.S2137
-rw-r--r--third_party/dav1d/src/arm/32/ipred.S2958
-rw-r--r--third_party/dav1d/src/arm/32/ipred16.S3276
-rw-r--r--third_party/dav1d/src/arm/32/itx.S3343
-rw-r--r--third_party/dav1d/src/arm/32/itx16.S3625
-rw-r--r--third_party/dav1d/src/arm/32/loopfilter.S868
-rw-r--r--third_party/dav1d/src/arm/32/loopfilter16.S859
-rw-r--r--third_party/dav1d/src/arm/32/looprestoration.S791
-rw-r--r--third_party/dav1d/src/arm/32/looprestoration16.S801
-rw-r--r--third_party/dav1d/src/arm/32/looprestoration_common.S453
-rw-r--r--third_party/dav1d/src/arm/32/looprestoration_tmpl.S600
-rw-r--r--third_party/dav1d/src/arm/32/mc.S3340
-rw-r--r--third_party/dav1d/src/arm/32/mc16.S3658
-rw-r--r--third_party/dav1d/src/arm/32/msac.S575
-rw-r--r--third_party/dav1d/src/arm/32/refmvs.S303
-rw-r--r--third_party/dav1d/src/arm/32/util.S184
-rw-r--r--third_party/dav1d/src/arm/64/cdef.S520
-rw-r--r--third_party/dav1d/src/arm/64/cdef16.S229
-rw-r--r--third_party/dav1d/src/arm/64/cdef_tmpl.S511
-rw-r--r--third_party/dav1d/src/arm/64/filmgrain.S2010
-rw-r--r--third_party/dav1d/src/arm/64/filmgrain16.S1997
-rw-r--r--third_party/dav1d/src/arm/64/ipred.S5294
-rw-r--r--third_party/dav1d/src/arm/64/ipred16.S5674
-rw-r--r--third_party/dav1d/src/arm/64/itx.S3270
-rw-r--r--third_party/dav1d/src/arm/64/itx16.S3648
-rw-r--r--third_party/dav1d/src/arm/64/loopfilter.S1129
-rw-r--r--third_party/dav1d/src/arm/64/loopfilter16.S925
-rw-r--r--third_party/dav1d/src/arm/64/looprestoration.S1303
-rw-r--r--third_party/dav1d/src/arm/64/looprestoration16.S1388
-rw-r--r--third_party/dav1d/src/arm/64/looprestoration_common.S272
-rw-r--r--third_party/dav1d/src/arm/64/looprestoration_tmpl.S751
-rw-r--r--third_party/dav1d/src/arm/64/mc.S3310
-rw-r--r--third_party/dav1d/src/arm/64/mc16.S3611
-rw-r--r--third_party/dav1d/src/arm/64/msac.S480
-rw-r--r--third_party/dav1d/src/arm/64/refmvs.S292
-rw-r--r--third_party/dav1d/src/arm/64/util.S229
-rw-r--r--third_party/dav1d/src/arm/asm-offsets.h43
-rw-r--r--third_party/dav1d/src/arm/asm.S291
-rw-r--r--third_party/dav1d/src/arm/cdef.h88
-rw-r--r--third_party/dav1d/src/arm/cpu.c99
-rw-r--r--third_party/dav1d/src/arm/cpu.h37
-rw-r--r--third_party/dav1d/src/arm/filmgrain.h204
-rw-r--r--third_party/dav1d/src/arm/ipred.h326
-rw-r--r--third_party/dav1d/src/arm/itx.h141
-rw-r--r--third_party/dav1d/src/arm/loopfilter.h45
-rw-r--r--third_party/dav1d/src/arm/looprestoration.h1113
-rw-r--r--third_party/dav1d/src/arm/mc.h114
-rw-r--r--third_party/dav1d/src/arm/msac.h52
-rw-r--r--third_party/dav1d/src/arm/refmvs.h41
53 files changed, 70535 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/cdef.S b/third_party/dav1d/src/arm/32/cdef.S
new file mode 100644
index 0000000000..4a0df6eac8
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef.S
@@ -0,0 +1,540 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret // widen two 8-bit rows at \s1 and \s2 into the 16-bit tmp buffer at r0, adding 2 px of left/right border per row
+ tst r7, #1 // CDEF_HAVE_LEFT (r7 = edge flags)
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldrh r12, [\s1, #-2] // 2 left border px of row 1
+ vldr \n1, [\s1] // row 1 pixels
+ vdup.16 d4, r12
+ ldrh r12, [\s1, #\w] // 2 right border px of row 1
+ vmov.16 d4[1], r12
+ ldrh r12, [\s2, #-2] // 2 left border px of row 2
+ vldr \n2, [\s2] // row 2 pixels
+ vmov.16 d4[2], r12
+ ldrh r12, [\s2, #\w] // 2 right border px of row 2
+ vmovl.u8 q0, d0 // widen row 1 to 16 bit
+ vmov.16 d4[3], r12
+ vmovl.u8 q1, d2 // widen row 2 to 16 bit
+ vmovl.u8 q2, d4 // widen borders: s8/s9 = left/right of row 1, s10/s11 = left/right of row 2
+ vstr s8, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride // next tmp row (\stride is in 16-bit elements)
+ vstr s10, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldrh r12, [\s1, #-2] // 2 left border px of row 1
+ vldr \n1, [\s1]
+ vdup.16 d4, r12
+ ldrh r12, [\s2, #-2] // 2 left border px of row 2
+ vldr \n2, [\s2]
+ vmovl.u8 q0, d0
+ vmov.16 d4[1], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4 // s8 = left of row 1, s9 = left of row 2
+ vstr s8, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w] // missing right border: s12 is the low word of q3, which the padding functions set to 0x8000 per lane
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr \n1, [\s1]
+ ldrh r12, [\s1, #\w] // 2 right border px of row 1
+ vldr \n2, [\s2]
+ vdup.16 d4, r12
+ ldrh r12, [\s2, #\w] // 2 right border px of row 2
+ vmovl.u8 q0, d0
+ vmov.16 d4[1], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4 // s8 = right of row 1, s9 = right of row 2
+ vstr s12, [r0, #-4] // missing left border: fill value from q3
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr \n1, [\s1]
+ vldr \n2, [\s2]
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s12, [r0, #-4] // both borders missing: fill value on each side
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride // last case falls through to 3: without a branch
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w // load one row of \w 8-bit pixels into \dst and post-increment \src by \incr
+.if \w == 4
+ vld1.32 {\dst\()[0]}, [\src, :32], \incr // 4 bytes into lane 0 only
+.else
+ vld1.8 {\dst\()}, [\src, :64], \incr // 8 bytes filling the whole d register
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro padding_func w, stride, n1, w1, n2, w2, align // emits cdef_padding\w\()_8bpc_neon: builds the 16-bit tmp block (2 rows above, h rows, 2 rows below, 2 px borders) from 8-bit sources
+function cdef_padding\w\()_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24] // r4 = top, r5 = bottom (stack args sit above the 24 bytes just pushed)
+ ldrd r6, r7, [sp, #32] // r6 = h, r7 = edges
+ cmp r7, #0xf // fully edged
+ beq cdef_padding\w\()_edged_8bpc_neon
+ vmov.i16 q3, #0x8000 // fill value for unavailable pixels; s12 is read from here below
+ tst r7, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2) // two rows up and 2 px left of tmp
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]! // 16 fill elements per store
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r8, r4, r2 // second top row = top + src_stride
+ sub r0, r0, #2*(2*\stride) // step tmp back by two rows
+ pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+
+ // Middle section
+3:
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.16 {d2[]}, [r3, :16]! // 2 left px of this row from the left array, replicated to all lanes
+ ldrh r12, [r1, #\w] // 2 right px read past the end of the row
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1 // one row done; flags are consumed by bgt below
+ vmov.16 d2[1], r12
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2 // s4 = widened left px, s5 = widened right px
+ vstr s4, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {d2[]}, [r3, :16]!
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2 // s4 = widened left px
+ vstr s4, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w] // right border filled from q3
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldrh r12, [r1, #\w] // 2 right px, read before r1 is advanced
+ load_n_incr d0, r1, r2, \w
+ vdup.16 d2, r12
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2 // s4 = widened right px
+ vstr s12, [r0, #-4] // left border filled from q3
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r7, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4 // start 2 px left of the first row below the block
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]! // fill the two bottom rows, same pattern as the top
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r8,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r8, r5, r2 // second bottom row = bottom + src_stride
+ pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+endfunc
+.endm
+
+padding_func 8, 16, d0, q0, d2, q1, 128 // cdef_padding8_8bpc_neon: 8 px wide, tmp stride 16 elements
+padding_func 4, 8, s0, d0, s4, d2, 64 // cdef_padding4_8bpc_neon: 4 px wide, tmp stride 8 elements
+
+// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg, align // emits cdef_padding\w\()_edged_8bpc_neon: all-edges case, tmp stays 8-bit with \stride bytes per row; \align is not referenced in this body
+function cdef_padding\w\()_edged_8bpc_neon
+ sub r0, r0, #(2*\stride) // two rows up; reached by branch from cdef_padding\w\()_8bpc_neon with r4-r7 loaded and r4-r8,lr pushed
+
+ ldrh r12, [r4, #-2] // top row 1: 2 left px
+ vldr \reg, [r4] // top row 1: pixels
+ add r8, r4, r2 // r8 = top row 2
+ strh r12, [r0, #-2]
+ ldrh r12, [r4, #\w] // top row 1: 2 right px
+ vstr \reg, [r0]
+ strh r12, [r0, #\w]
+
+ ldrh r12, [r8, #-2] // top row 2, stored one tmp row further down
+ vldr \reg, [r8]
+ strh r12, [r0, #\stride-2]
+ ldrh r12, [r8, #\w]
+ vstr \reg, [r0, #\stride]
+ strh r12, [r0, #\stride+\w]
+ add r0, r0, #2*\stride
+
+0:
+ ldrh r12, [r3], #2 // 2 left px of this row from the left array, post-incremented
+ vldr \reg, [r1]
+ str r12, [r0, #-2] // NOTE(review): word store (strh is used for the top/bottom rows); the upper half is zero and lands on [r0, r0+1], which the vstr below overwrites
+ ldrh r12, [r1, #\w] // 2 right px
+ add r1, r1, r2
+ subs r6, r6, #1 // row counter; flags are consumed by bgt below
+ vstr \reg, [r0]
+ str r12, [r0, #\w] // NOTE(review): word store also writes 2 zero bytes past the right border - confirm that region of tmp is never read
+ add r0, r0, #\stride
+ bgt 0b
+
+ ldrh r12, [r5, #-2] // bottom row 1: 2 left px
+ vldr \reg, [r5]
+ add r8, r5, r2 // r8 = bottom row 2
+ strh r12, [r0, #-2]
+ ldrh r12, [r5, #\w]
+ vstr \reg, [r0]
+ strh r12, [r0, #\w]
+
+ ldrh r12, [r8, #-2] // bottom row 2
+ vldr \reg, [r8]
+ strh r12, [r0, #\stride-2]
+ ldrh r12, [r8, #\w]
+ vstr \reg, [r0, #\stride]
+ strh r12, [r0, #\stride+\w]
+
+ pop {r4-r8,pc} // undoes the push done in cdef_padding\w\()_8bpc_neon
+endfunc
+.endm
+
+padding_func_edged 8, 16, d0, 64 // cdef_padding8_edged_8bpc_neon: 8 byte rows, 16 byte tmp stride
+padding_func_edged 4, 8, s0, 32 // cdef_padding4_edged_8bpc_neon: 4 byte rows, 8 byte tmp stride
+
+tables // directions8, directions4 and pri_taps (macro from cdef_tmpl.S)
+
+filter 8, 8 // 8 px wide, 8 bpc filter entry point and variants
+filter 4, 8 // 4 px wide, 8 bpc filter entry point and variants
+
+find_dir 8 // cdef_find_dir_8bpc_neon
+
+.macro load_px_8 d11, d12, d21, d22, w // load 16 tap pixels at +off (p0) and at -off (p1) from the 8-bit tmp buffer; r2 = tmp, r9 = off; clobbers r6 and r9
+.if \w == 8
+ add r6, r2, r9 // x + off
+ sub r9, r2, r9 // x - off
+ vld1.8 {\d11}, [r6] // p0
+ add r6, r6, #16 // += stride
+ vld1.8 {\d21}, [r9] // p1
+ add r9, r9, #16 // += stride
+ vld1.8 {\d12}, [r6] // p0
+ vld1.8 {\d22}, [r9] // p1
+.else
+ add r6, r2, r9 // x + off
+ sub r9, r2, r9 // x - off
+ vld1.32 {\d11[0]}, [r6] // p0, row 0 (4 px per 32-bit lane)
+ add r6, r6, #8 // += stride
+ vld1.32 {\d21[0]}, [r9] // p1, row 0
+ add r9, r9, #8 // += stride
+ vld1.32 {\d11[1]}, [r6] // p0, row 1
+ add r6, r6, #8 // += stride
+ vld1.32 {\d21[1]}, [r9] // p1, row 1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d12[0]}, [r6] // p0, row 2
+ add r6, r6, #8 // += stride
+ vld1.32 {\d22[0]}, [r9] // p1, row 2
+ add r9, r9, #8 // += stride
+ vld1.32 {\d12[1]}, [r6] // p0, row 3
+ vld1.32 {\d22[1]}, [r9] // p1, row 3
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min // accumulate tap * constrain(p - px) for p0 (\s1) and p1 (\s2) into q1/q2; q0 = px; clobbers q8-q13
+.if \min
+ vmin.u8 q3, q3, \s1 // q3 = running min
+ vmax.u8 q4, q4, \s1 // q4 = running max
+ vmin.u8 q3, q3, \s2
+ vmax.u8 q4, q4, \s2
+.endif
+ vabd.u8 q8, q0, \s1 // abs(diff)
+ vabd.u8 q11, q0, \s2 // abs(diff)
+ vshl.u8 q9, q8, \shift // abs(diff) >> shift (\shift holds -shift, so vshl shifts right)
+ vshl.u8 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vcgt.u8 q10, q0, \s1 // px > p0, i.e. mask of lanes where diff is negative
+ vcgt.u8 q13, q0, \s2 // px > p1
+ vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
+ vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
+ vneg.s8 q8, q9 // -imin()
+ vneg.s8 q11, q12 // -imin()
+ vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip); picks -imin() where px > p0
+ vdup.8 d18, \tap // taps[k]; reuses d18 now that q9 has been consumed
+ vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
+ vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func_8 w, pri, sec, min, suffix // emits cdef_filter\w\suffix\()_edged_neon: 8-bit tmp variant, 16 pixels per outer iteration
+function cdef_filter\w\suffix\()_edged_neon
+.if \pri
+ movrel_local r8, pri_taps
+ and r9, r3, #1 // pri_strength & 1
+ add r8, r8, r9, lsl #1 // select the 2-entry tap pair
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1 // &directions[dir], 2 bytes per direction
+ vmov.u8 d17, #7
+ vdup.8 d16, r6 // damping
+
+ vmov.8 d8[0], r3 // pri_strength
+ vmov.8 d8[1], r4 // sec_strength
+ vclz.i8 d8, d8 // clz(threshold)
+ vsub.i8 d8, d17, d8 // ulog2(threshold)
+ vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s8 d8, d8 // -shift
+.if \sec
+ vdup.8 q6, d8[1] // -shift for the secondary taps
+.endif
+.if \pri
+ vdup.8 q5, d8[0] // -shift for the primary taps
+.endif
+
+1:
+.if \w == 8
+ add r12, r2, #16 // second row of tmp (16 byte stride)
+ vld1.8 {d0}, [r2, :64] // px
+ vld1.8 {d1}, [r12, :64] // px
+.else
+ add r12, r2, #8 // row 1 (8 byte stride)
+ vld1.32 {d0[0]}, [r2, :32] // px
+ add r9, r2, #2*8 // row 2
+ vld1.32 {d0[1]}, [r12, :32] // px
+ add r12, r12, #2*8 // row 3
+ vld1.32 {d1[0]}, [r9, :32] // px
+ vld1.32 {d1[1]}, [r12, :32] // px
+.endif
+
+ vmov.u8 q1, #0 // sum
+ vmov.u8 q2, #0 // sum
+.if \min
+ vmov.u16 q3, q0 // min
+ vmov.u16 q4, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px_8 d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+ vdup.8 q7, r3 // threshold
+
+ handle_pixel_8 q14, q15, q7, q5, r12, \min
+.endif
+
+.if \sec
+ load_px_8 d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ vdup.8 q7, r4 // threshold
+
+ handle_pixel_8 q14, q15, q7, q6, lr, \min
+
+ load_px_8 d28, d29, d30, d31, \w
+
+ handle_pixel_8 q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vshr.s16 q15, q2, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vadd.i16 q2, q2, q15 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
+ vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
+ vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
+ vqmovun.s16 d0, q1 // saturating narrow back to 8 bit
+ vqmovun.s16 d1, q2
+.if \min
+ vmin.u8 q0, q0, q4
+ vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
+.endif
+.if \w == 8
+ vst1.8 {d0}, [r0, :64], r1
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+ vst1.8 {d1}, [r0, :64], r1
+.else
+ vst1.32 {d0[0]}, [r0, :32], r1
+ add r2, r2, #4*8 // tmp += 4*tmp_stride
+ vst1.32 {d0[1]}, [r0, :32], r1
+ subs r7, r7, #4 // h -= 4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2 // undo the two +1 steps of the inner loop
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b // flags still come from the subs on r7 above
+ vpop {q4-q7} // matches the vpush/push in the cdef_filter\w entry point of cdef_tmpl.S
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter_8 w // instantiate the primary-only, secondary-only and combined edged filters for width \w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8 // cdef_filter8_{pri,sec,pri_sec}_edged_neon
+filter_8 4 // cdef_filter4_{pri,sec,pri_sec}_edged_neon
diff --git a/third_party/dav1d/src/arm/32/cdef16.S b/third_party/dav1d/src/arm/32/cdef16.S
new file mode 100644
index 0000000000..d14525d720
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef16.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret // copy two 16-bit rows at \s1 and \s2 into tmp at r0 with 2 px of left/right border per row
+ tst r7, #1 // CDEF_HAVE_LEFT (r7 = edge flags)
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4] // 2 left px of row 1
+ vld1.16 {\r1}, [\s1, :\align] // row 1 pixels
+ vldr s9, [\s1, #2*\w] // 2 right px of row 1
+ vldr s10, [\s2, #-4] // 2 left px of row 2
+ vld1.16 {\r2}, [\s2, :\align] // row 2 pixels
+ vldr s11, [\s2, #2*\w] // 2 right px of row 2
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride // next tmp row (\stride is in 16-bit elements)
+ vstr s10, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4] // 2 left px of row 1
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s2, #-4] // 2 left px of row 2
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w] // missing right border: s12 is the low word of q3, set to 0x8000 per lane by padding_func_16
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s8, [\s1, #2*\w] // 2 right px of row 1
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s9, [\s2, #2*\w] // 2 right px of row 2
+ vstr s12, [r0, #-4] // missing left border: fill value from q3
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s12, [r0, #-4] // both borders missing: fill value on each side
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride // last case falls through to 3: without a branch
+.endif
+3:
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro padding_func_16 w, stride, r1, r2, align // emits cdef_padding\w\()_16bpc_neon: builds the 16-bit tmp block (2 rows above, h rows, 2 rows below, 2 px borders) from 16-bit sources
+function cdef_padding\w\()_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24] // r4 = top, r5 = bottom (stack args sit above the 24 bytes just pushed)
+ ldrd r6, r7, [sp, #32] // r6 = h, r7 = edges
+ vmov.i16 q3, #0x8000 // fill value for unavailable pixels; s12 is read from here below
+ tst r7, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2) // two rows up and 2 px left of tmp
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]! // 16 fill elements per store
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r8, r4, r2 // second top row = top + src_stride
+ sub r0, r0, #2*(2*\stride) // step tmp back by two rows
+ pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0
+
+ // Middle section
+3:
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.32 {d2[]}, [r3, :32]! // 2 left px of this row into s4 (and s5), post-incrementing the left array
+ vldr s5, [r1, #2*\w] // 2 right px, read before r1 is advanced
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1 // one row done; flags are consumed by bgt below
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.32 {d2[]}, [r3, :32]!
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w] // right border filled from q3
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vldr s4, [r1, #2*\w] // 2 right px
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s12, [r0, #-4] // left border filled from q3
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r7, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4 // start 2 px left of the first row below the block
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]! // fill the two bottom rows, same pattern as the top
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r8,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r8, r5, r2 // second bottom row = bottom + src_stride
+ pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q0, q1, 128 // cdef_padding8_16bpc_neon: 8 px wide, tmp stride 16 elements
+padding_func_16 4, 8, d0, d2, 64 // cdef_padding4_16bpc_neon: 4 px wide, tmp stride 8 elements
+
+tables // directions8, directions4 and pri_taps (macro from cdef_tmpl.S)
+
+filter 8, 16 // 8 px wide, 16 bpc filter entry point and variants
+filter 4, 16 // 4 px wide, 16 bpc filter entry point and variants
+
+find_dir 16 // cdef_find_dir_16bpc_neon
diff --git a/third_party/dav1d/src/arm/32/cdef_tmpl.S b/third_party/dav1d/src/arm/32/cdef_tmpl.S
new file mode 100644
index 0000000000..33ff9e5816
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef_tmpl.S
@@ -0,0 +1,515 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro dir_table w, stride // emits directions\w: two signed byte offsets (row * \stride + column, in tmp elements) per direction
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7 (the filters index up to dir + 6 for the secondary taps)
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+.macro tables // emits the constant tables shared by the filter functions
+dir_table 8, 16 // directions8: tmp stride of 16 elements
+dir_table 4, 8 // directions4: tmp stride of 8 elements
+
+const pri_taps
+ .byte 4, 2, 3, 3 // two tap pairs; the filters pick the pair at byte offset 2 * (strength bit)
+endconst
+.endm
+
+.macro load_px d11, d12, d21, d22, w // load 8 tap pixels at +off (p0) and at -off (p1) from the 16-bit tmp buffer; r2 = tmp, r9 = off in elements; clobbers r6 and r9
+.if \w == 8
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11,\d12}, [r6] // p0
+ vld1.16 {\d21,\d22}, [r9] // p1
+.else
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11}, [r6] // p0, row 0 (4 px)
+ add r6, r6, #2*8 // += stride
+ vld1.16 {\d21}, [r9] // p1, row 0
+ add r9, r9, #2*8 // += stride
+ vld1.16 {\d12}, [r6] // p0, row 1
+ vld1.16 {\d22}, [r9] // p1, row 1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min // accumulate tap * constrain(p - px) for p0 (\s1) and p1 (\s2) into q1; q0 = px; clobbers q8-q13
+.if \min
+ vmin.u16 q2, q2, \s1 // q2 = running min (unsigned compare)
+ vmax.s16 q3, q3, \s1 // q3 = running max (signed compare; presumably so the 0x8000 fill written by the padding functions never wins - confirm)
+ vmin.u16 q2, q2, \s2
+ vmax.s16 q3, q3, \s2
+.endif
+ vabd.u16 q8, q0, \s1 // abs(diff)
+ vabd.u16 q11, q0, \s2 // abs(diff)
+ vshl.u16 q9, q8, \shift // abs(diff) >> shift (\shift holds -shift, so vshl shifts right)
+ vshl.u16 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vsub.i16 q10, \s1, q0 // diff = p0 - px
+ vsub.i16 q13, \s2, q0 // diff = p1 - px
+ vneg.s16 q8, q9 // -clip
+ vneg.s16 q11, q12 // -clip
+ vmin.s16 q10, q10, q9 // imin(diff, clip)
+ vmin.s16 q13, q13, q12 // imin(diff, clip)
+ vdup.16 q9, \tap // taps[k]; q9 is free again after the vmin above
+ vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
+ vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
+ vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
+ vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix // emits cdef_filter\w\suffix\()_\bpc\()bpc_neon: 16-bit tmp variant, 8 pixels per outer iteration
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ cmp r8, #0xf // r8 = edges; all four edges available
+ beq cdef_filter\w\suffix\()_edged_neon
+.endif
+.if \pri
+.if \bpc == 16
+ clz r9, r9 // r9 is loaded by the cdef_filter\w entry point; presumably bitdepth_max - confirm
+ sub r9, r9, #24 // -bitdepth_min_8
+ neg r9, r9 // bitdepth_min_8
+.endif
+ movrel_local r8, pri_taps
+.if \bpc == 16
+ lsr r9, r3, r9 // pri_strength >> bitdepth_min_8
+ and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and r9, r3, #1 // pri_strength & 1
+.endif
+ add r8, r8, r9, lsl #1 // select the 2-entry tap pair
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1 // &directions[dir], 2 bytes per direction
+ vmov.u16 d17, #15
+ vdup.16 d16, r6 // damping
+
+.if \pri
+ vdup.16 q5, r3 // threshold
+.endif
+.if \sec
+ vdup.16 q7, r4 // threshold
+.endif
+ vmov.16 d8[0], r3 // pri_strength
+ vmov.16 d8[1], r4 // sec_strength
+ vclz.i16 d8, d8 // clz(threshold)
+ vsub.i16 d8, d17, d8 // ulog2(threshold)
+ vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s16 d8, d8 // -shift
+.if \sec
+ vdup.16 q6, d8[1] // -shift for the secondary taps
+.endif
+.if \pri
+ vdup.16 q4, d8[0] // -shift for the primary taps
+.endif
+
+1:
+.if \w == 8
+ vld1.16 {q0}, [r2, :128] // px
+.else
+ add r12, r2, #2*8 // second row of tmp (8 element stride)
+ vld1.16 {d0}, [r2, :64] // px
+ vld1.16 {d1}, [r12, :64] // px
+.endif
+
+ vmov.u16 q1, #0 // sum
+.if \min
+ vmov.u16 q2, q0 // min
+ vmov.u16 q3, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+
+ handle_pixel q14, q15, q5, q4, r12, \min
+.endif
+
+.if \sec
+ load_px d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ load_px d28, d29, d30, d31, \w
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
+.if \min
+ vmin.s16 q0, q0, q3
+ vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ vmovn.u16 d0, q0 // narrow the result to 8-bit pixels
+.endif
+.if \w == 8
+ add r2, r2, #2*16 // tmp += tmp_stride
+ subs r7, r7, #1 // h--
+.if \bpc == 8
+ vst1.8 {d0}, [r0, :64], r1
+.else
+ vst1.16 {q0}, [r0, :128], r1
+.endif
+.else
+.if \bpc == 8
+ vst1.32 {d0[0]}, [r0, :32], r1
+.else
+ vst1.16 {d0}, [r0, :64], r1
+.endif
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+.if \bpc == 8
+ vst1.32 {d0[1]}, [r0, :32], r1
+.else
+ vst1.16 {d1}, [r0, :64], r1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2 // undo the two +1 steps of the inner loop
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b // flags still come from the subs on r7 above
+ vpop {q4-q7} // matches the vpush/push in the cdef_filter\w entry point
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w, bpc // emits the three filter variants plus the exported cdef_filter\w\()_\bpc\()bpc_neon dispatcher
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #92] // r4 = sec_strength, r5 = dir (92 = 28 bytes of core regs + 64 bytes of q4-q7)
+ ldrd r6, r7, [sp, #100] // r6 = damping, r7 = h
+.if \bpc == 16
+ ldrd r8, r9, [sp, #108] // r8 = edges; r9 is the following stack argument (presumably bitdepth_max - confirm)
+.else
+ ldr r8, [sp, #108] // r8 = edges
+.endif
+ cmp r3, #0 // pri_strength
+ bne 1f
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cmp r4, #0 // sec_strength
+ bne 1f
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+const div_table, align=4
+ .short 840, 420, 280, 210, 168, 140, 120, 105 // 840 / n for n = 1..8
+endconst
+
+const alt_fact, align=4
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 // 840 / 2, / 4, / 6, / 8, mirrored at the end, with a trailing 0
+endconst
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6 // \dest = two costs: weighted sums of squares of \s1-\s3 and of \s4-\s6; weights are read from q13-q15; clobbers q1-q3, q5, q6, q12
+ vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q2, \s2, \s2
+ vmull.s16 q3, \s3, \s3
+ vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q12, \s5, \s5
+ vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
+ vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
+ vmla.i32 q1, q2, q14
+ vmla.i32 q1, q3, q15
+ vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
+ vmla.i32 q5, q12, q14
+ vmla.i32 q5, q6, q15
+ vadd.i32 d2, d2, d3 // fold the first cost to 2 lanes
+ vadd.i32 d3, d10, d11 // fold the second cost to 2 lanes
+ vpadd.i32 \dest, d2, d3 // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3 // update best_dir (r0) / best_cost (r1) from the cost in r12, index n in r3; \s1 is not referenced in this body
+.ifnb \s2
+ vmov.32 lr, \s2 // fetch the next cost early
+.endif
+ cmp r12, r1 // cost[n] > best_cost
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, r12 // best_cost = cost[n]
+.ifnb \s2
+ add r3, r3, #1 // n++
+ cmp lr, r1 // cost[n] > best_cost
+ vmov.32 r12, \s3 // preload the cost for the following find_best expansion
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, lr // best_cost = cost[n]
+ add r3, r3, #1 // n++
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
+.macro find_dir bpc // Emits cdef_find_dir_<bpc>bpc_neon; \bpc selects the 8 or 16 bit pixel path
+function cdef_find_dir_\bpc\()bpc_neon, export=1 // r0 = img, r1 = stride, r2 = var; returns best_dir in r0
+ push {lr}
+ vpush {q4-q7}
+.if \bpc == 16
+ clz r3, r3 // clz(bitdepth_max)
+ sub lr, r3, #24 // -bitdepth_min_8
+.endif
+ sub sp, sp, #32 // cost
+ mov r3, #8 // NOTE(review): r3 is not read again before it is reloaded with n below
+ vmov.u16 q1, #0 // q0-q1 sum_diag[0]
+ vmov.u16 q3, #0 // q2-q3 sum_diag[1]
+ vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
+ vmov.u16 q8, #0 // q6,d16 sum_alt[0]
+ // q7,d17 sum_alt[1]
+ vmov.u16 q9, #0 // q9,d22 sum_alt[2]
+ vmov.u16 q11, #0
+ vmov.u16 q10, #0 // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567 // unrolled once per row; \i is the row index
+.if \bpc == 8
+ vld1.8 {d30}, [r0, :64], r1 // load 8 pixels, advance by stride
+ vmov.u8 d31, #128
+ vsubl.u8 q15, d30, d31 // img[x] - 128
+.else
+ vld1.16 {q15}, [r0, :128], r1 // load 8 pixels, advance by stride
+ vdup.16 q14, lr // -bitdepth_min_8
+ vshl.u16 q15, q15, q14 // negative shift count: scales down to 8 bit range
+ vmov.u16 q14, #128
+ vsub.i16 q15, q15, q14 // img[x] - 128
+.endif
+ vmov.u16 q14, #0 // zero vector used as filler for the vext shifts below
+
+.if \i == 0
+ vmov q0, q15 // sum_diag[0]
+.else
+ vext.8 q12, q14, q15, #(16-2*\i) // row shifted up by \i lanes, zero filled
+ vext.8 q13, q15, q14, #(16-2*\i) // lanes that spilled into the upper half
+ vadd.i16 q0, q0, q12 // sum_diag[0]
+ vadd.i16 q1, q1, q13 // sum_diag[0]
+.endif
+ vrev64.16 q13, q15
+ vswp d26, d27 // [-x]
+.if \i == 0
+ vmov q2, q13 // sum_diag[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q2, q2, q12 // sum_diag[1]
+ vadd.i16 q3, q3, q13 // sum_diag[1]
+.endif
+
+ vpadd.u16 d26, d30, d31 // [(x >> 1)]
+ vmov.u16 d27, #0
+ vpadd.u16 d24, d26, d28 // d28 is still zero here
+ vpadd.u16 d24, d24, d28 // [y]
+ vmov.u16 r12, d24[0] // sum of the whole row
+ vadd.i16 q5, q5, q15 // sum_hv[1]
+.if \i < 4
+ vmov.16 d8[\i], r12 // sum_hv[0]
+.else
+ vmov.16 d9[\i-4], r12 // sum_hv[0]
+.endif
+
+.if \i == 0
+ vmov.u16 q6, q13 // sum_alt[0]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q14, q13, q14, #(16-2*\i) // overwrites the zero vector, restored below
+ vadd.i16 q6, q6, q12 // sum_alt[0]
+ vadd.i16 d16, d16, d28 // sum_alt[0]
+.endif
+ vrev64.16 d26, d26 // [-(x >> 1)]
+ vmov.u16 q14, #0 // zero vector again
+.if \i == 0
+ vmov q7, q13 // sum_alt[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q7, q7, q12 // sum_alt[1]
+ vadd.i16 d17, d17, d26 // sum_alt[1]
+.endif
+
+.if \i < 6
+ vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
+ vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
+ vadd.i16 q9, q9, q12 // sum_alt[2]
+ vadd.i16 d22, d22, d26 // sum_alt[2]
+.else
+ vadd.i16 q9, q9, q15 // sum_alt[2]
+.endif
+.if \i == 0
+ vmov q10, q15 // sum_alt[3]
+.elseif \i == 1
+ vadd.i16 q10, q10, q15 // sum_alt[3]
+.else
+ vext.8 q12, q14, q15, #(16-2*(\i/2))
+ vext.8 q13, q15, q14, #(16-2*(\i/2))
+ vadd.i16 q10, q10, q12 // sum_alt[3]
+ vadd.i16 d23, d23, d26 // sum_alt[3]
+.endif
+.endr
+
+ vmov.u32 q15, #105
+
+ vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
+ vmlal.s16 q12, d9, d9
+ vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
+ vmlal.s16 q13, d11, d11
+ vadd.s32 d8, d24, d25
+ vadd.s32 d9, d26, d27
+ vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
+ vmul.i32 d8, d8, d30 // cost[2,6] *= 105
+
+ vrev64.16 q1, q1
+ vrev64.16 q3, q3
+ vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
+ vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
+
+ vstr s16, [sp, #2*4] // cost[2]
+ vstr s17, [sp, #6*4] // cost[6]
+
+ movrel_local r12, div_table
+ vld1.16 {q14}, [r12, :128] // all 8 div_table entries
+
+ vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
+ vmull.s16 q12, d1, d1
+ vmlal.s16 q5, d2, d2
+ vmlal.s16 q12, d3, d3
+ vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
+ vmull.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmovl.u16 q13, d28 // div_table
+ vmovl.u16 q14, d29
+ vmul.i32 q5, q5, q13 // cost[0]
+ vmla.i32 q5, q12, q14
+ vmul.i32 q0, q0, q13 // cost[4]
+ vmla.i32 q0, q1, q14
+ vadd.i32 d10, d10, d11
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
+
+ movrel_local r12, alt_fact
+ vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+ vstr s0, [sp, #0*4] // cost[0]
+ vstr s1, [sp, #4*4] // cost[4]
+
+ vmovl.u16 q13, d29 // div_table[2*m+1] + 105
+ vmovl.u16 q14, d30
+ vmovl.u16 q15, d31
+
+ cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+ cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+ vstr s28, [sp, #1*4] // cost[1]
+ vstr s29, [sp, #3*4] // cost[3]
+
+ mov r0, #0 // best_dir
+ vmov.32 r1, d0[0] // best_cost
+ mov r3, #1 // n
+
+ vstr s30, [sp, #5*4] // cost[5]
+ vstr s31, [sp, #7*4] // cost[7]
+
+ vmov.32 r12, d14[0] // cost[1], the first candidate for find_best
+
+ find_best d14[0], d8[0], d14[1] // tests cost[1] and cost[2], preloads cost[3]
+ find_best d14[1], d0[1], d15[0] // tests cost[3] and cost[4], preloads cost[5]
+ find_best d15[0], d8[1], d15[1] // tests cost[5] and cost[6], preloads cost[7]
+ find_best d15[1] // tests cost[7]
+
+ eor r3, r0, #4 // best_dir ^4
+ ldr r12, [sp, r3, lsl #2] // cost[best_dir ^ 4] from the stack array
+ sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
+ lsr r1, r1, #10
+ str r1, [r2] // *var
+
+ add sp, sp, #32 // release cost[]
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S
new file mode 100644
index 0000000000..9d59d5d5ed
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain.S
@@ -0,0 +1,2039 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1 // Computes \steps new random bits from the state in r2 and merges them in; clobbers r11, r12 and lr
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps // make room at the top of the 16 bit state
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif // with shift=0 the new bits sit above bit 15 and the state itself is left unshifted
+.endm
+
+.macro read_rand dest, bits, age // \dest = \bits bits of r2 starting at bit (16 - \bits - \age)
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits // \dest = \bits bits of r2 starting at bit (17 - \bits), then the state moves down one bit
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+function get_gaussian_neon // Advances the seed by two groups of 4 bits and gathers 8 halfword table entries into q0
+ push {r5-r6,lr} // lr is saved because increment_seed uses it as scratch
+ increment_seed 4 // random bits for lanes 0-3
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1 // table address = r3 + 2 * index
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0 // must be read before the seed advances again
+ increment_seed 4 // random bits for lanes 4-7
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 // Produces 10 * 8 + 2 = 82 grain bytes into the given d registers; expects the (negative) shift in q15; bare r3 below is the core register holding the table
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15 // rounding shift by the per lane count in q15
+ vmovn.i16 \r0, q0 // narrow to 8 bit
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r5, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r6, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r7, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r8, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r9, q0
+ increment_seed 2 // the last two entries of the row
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r10, q0 // only the first two bytes of \r10 are fresh
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 // Stores 32 + 32 + 16 + 2 = 82 bytes; bare r0 is the core register output pointer, \r0 the first d register argument
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5, \r6, \r7}, [r0]!
+ vst1.16 {\r8, \r9}, [r0]!
+ vst1.16 {\r10[0]}, [r0]! // one halfword lane = the last two bytes
+.endm
+
+.macro get_grain_row_44 r0, r1, r2, r3, r4, r5 // Produces 5 * 8 + 4 = 44 grain bytes into the given d registers; expects the (negative) shift in q15
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ increment_seed 4 // the last four entries of the row
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[]}, [r11] // all-lanes load; lanes 1-3 are overwritten below
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ read_rand r12, 11, 0
+ vld1.16 {d0[2]}, [r11]
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r5, q0 // only the first four bytes of \r5 are fresh
+.endm
+
+.macro store_grain_row_44 r0, r1, r2, r3, r4, r5 // Writes 48 bytes at the core register r0 and advances it by GRAIN_WIDTH in total
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]! // 32 bytes, with writeback
+ vst1.16 {\r4, \r5}, [r0] // 16 bytes, no writeback
+ add r0, r0, #GRAIN_WIDTH-32 // remainder of the row stride
+.endm
+
+function get_grain_2_neon // Produces two grain bytes in the low lanes of d0; r12 is clobbered
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1 // table address = r3 + 2 * index
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30 // d30 is the low half of the shift vector q15
+ vmovn.i16 d0, q0
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst // Calls get_grain_2_neon and leaves the result in \dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0 // the copy is skipped when \dst already is d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n // Emits output_lag<n>_neon: the serial, one entry at a time part of the filter for lag \n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mov lr, #-128 // lower clamp bound
+.else
+ mov r0, #1 // r0 and lr hold the two rounding constants here,
+ mov lr, #1 // since r8 and r10 carry other values for lag 2 and 3
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7 // 1 << (ar_coeff_shift - 1)
+ lsl lr, lr, r9 // 1 << (4 + grain_scale_shift - 1)
+ add r7, r7, #1 // restore r7
+ add r9, r9, #1 // restore r9
+.endif
+1:
+ read_shift_rand r12, 11 // random table index
+ vmov.32 r11, d2[0] // sum from the rows above for this entry
+ lsl r12, r12, #1 // index to byte offset
+ vext.8 q0, q0, q0, #1 // rotate the output vector by one byte
+ ldrsh r12, [r3, r12] // signed table entry
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6 // the newest output becomes the older one
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr} // keep the rounding constant while lr holds the clamp bound
+ cmp r6, r5
+ mov lr, #-128
+.else
+ push {r1-r3} // borrowed as scratch for the unpacked coefficients
+ sbfx r1, r4, #0, #8 // r4 packs three signed 8 bit coefficients
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // += *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8 // age the previous outputs
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr} // keep the rounding constant while lr holds the clamp bound
+ cmp r6, r5
+ mov lr, #-128
+.endif
+ it gt
+ movgt r6, r5 // clamp to the upper bound in r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr // clamp to the lower bound of -128
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1 // one entry done
+ vext.8 q1, q1, q1, #4 // rotate the next 32 bit sum into lane 0
+ vmov.8 d1[7], r6 // insert the new entry as the top byte of q0
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon // In: q0, q3, q1 = three 16 byte inputs, weighted by d27, d28, d29; out: q2-q5 = 32 bit sums for entries 0-3, 4-7, 8-11, 12-15
+ vmull.s8 q2, d6, d28 // q3 input, low half
+ vmull.s8 q3, d7, d28 // q3 input, high half
+ vmull.s8 q4, d0, d27 // q0 input, low half
+ vmull.s8 q5, d1, d27 // q0 input, high half
+
+ vaddl.s16 q0, d4, d8 // entries 0-3
+ vaddl.s16 q2, d5, d9 // entries 4-7
+ vaddl.s16 q4, d6, d10 // entries 8-11
+ vaddl.s16 q5, d7, d11 // entries 12-15
+
+ vmull.s8 q3, d3, d29 // q1 input, high half
+ vmull.s8 q1, d2, d29 // q1 input, low half
+
+ vaddw.s16 q4, q4, d6
+ vaddw.s16 q5, q5, d7
+ vaddw.s16 q3, q2, d3 // entries 4-7 move to q3
+ vaddw.s16 q2, q0, d2 // entries 0-3 move to q2
+ bx lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff // Shared body of the sum_<type>_<lag>_<edge>_neon functions; the caller has already pushed {r1, lr}
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon // leaves the 32 bit sums from the rows above in q2-q5
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH // second of the two rows read through r11
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpaddl.s8 q0, q0 // add horizontal pairs
+ vpaddl.s8 q1, q1
+ vpaddl.s8 q6, q6
+ vpaddl.s8 q7, q7
+ vadd.i16 q0, q0, q6 // add the two rows
+ vadd.i16 q1, q1, q7
+ vpop {q6-q7}
+ vrshrn.s16 d0, q0, #2 // rounded average of 2x2 entries
+ vrshrn.s16 d1, q1, #2
+.endif
+.ifc \type, uv_422
+ vld1.8 {q0, q1}, [r11]!
+ vpaddl.s8 q0, q0 // add horizontal pairs
+ vpaddl.s8 q1, q1
+ vrshrn.s16 d0, q0, #1 // rounded average of 2 entries
+ vrshrn.s16 d1, q1, #1
+.endif
+.ifc \type, uv_444
+ vld1.8 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff // when \uv_coeff is blank, d13 must already hold the coefficient
+.endif
+ vmull.s8 q1, d0, d13
+ vmull.s8 q0, d1, d13
+ vaddw.s16 q2, q2, d2 // add the weighted r11 input to the sums
+ vaddw.s16 q3, q3, d3
+ vaddw.s16 q4, q4, d0
+ vaddw.s16 q5, q5, d1
+.endif
+.if \uv_layout && \elems == 16 // the remaining code is shared: jump into the matching variant
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vmovn.i16 d1, q0 // the three new entries end up in bytes 5-7 of d1
+ vext.8 q2, q2, q2, #12 // bring the fourth 32 bit sum into lane 0
+.ifc \lag, lag3
+ vmov.s8 r10, d1[5]
+.endif
+.ifnc \lag, lag1
+ vmov.s8 r8, d1[6]
+.endif
+ vmov.s8 r6, d1[7] // seed the previous output registers for output_lag
+
+ vmov q1, q2
+ mov r1, #1 // only one entry of the first group is filtered at the left edge
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q4
+.if \elems == 9
+ mov r1, #1 // ninth and last filtered entry
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vmovn.i16 d2, q1
+ vext.8 q0, q0, q1, #7 // append bytes taken directly from the table, without filtering
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q5
+
+.ifc \edge, right
+ mov r1, #3 // three filtered entries, then one taken directly from the table
+ bl output_\lag\()_neon
+ read_shift_rand r11, 11
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #1
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ vst1.8 {q0}, [r0]! // write the 16 byte result and advance the output pointer
+.endif
+ pop {r11}
+ pop {r1, pc} // matches the push {r1, lr} done by the function that expands this macro
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16 // Emits sum_<type>_lag1_<edge>_neon; the result stays in q0 (store=0)
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr} // popped again at the end of sum_lag_n_body
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid // Sets up q3, q0, q1 from three neighbouring 16 byte vectors, calls the lag1 function and copies q0 to \dst
+ vmov q3, \mid
+ vext.8 q0, \left, \mid, #15 // \mid shifted by one byte, filled from \left
+ vext.8 q1, \mid, \right, #1 // \mid shifted the other way, filled from \right
+ bl sum_\type\()_lag1_\edge\()_neon
+ vmov \dst, q0
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid // sum_lag1 for type y
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid // sum_lag1 for type uv_444
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid // sum_lag1 for type uv_422
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid // sum_lag1 for type uv_420
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon // In: q8/q9 and q11/q12 = previous and current 16 bytes of the two rows above, d28-d29 = coefficients; out: q2-q5 = 32 bit sums
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH - 16 // two rows up, next 16 bytes
+ sub lr, r0, #1*GRAIN_WIDTH - 16 // one row up, next 16 bytes
+ vld1.8 {q10}, [r12] // load top right
+ vld1.8 {q13}, [lr]
+
+ vext.8 q6, q8, q9, #14 // top left, top mid
+ vdup.8 d14, d28[0]
+ vext.8 q8, q8, q9, #15
+ vdup.8 d15, d28[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q2, d0, d12 // first two taps start the accumulators q2-q5
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d16
+ vaddl.s16 q5, d3, d17
+
+ vext.8 q6, q9, q10, #1 // top mid, top right
+ vdup.8 d14, d28[3]
+ vext.8 q8, q9, q10, #2
+ vdup.8 d15, d28[4]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q11, q12, #14 // top left, top mid
+ vdup.8 d14, d28[5]
+ vext.8 q8, q11, q12, #15
+ vdup.8 d15, d28[6]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q12, q13, #1 // top mid, top right
+ vdup.8 d14, d29[0]
+ vext.8 q8, q12, q13, #2
+ vdup.8 d15, d29[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vdup.8 d14, d28[2] // taps for the unshifted bytes of both rows
+ vdup.8 d15, d28[7]
+
+ vmull.s8 q0, d18, d14
+ vmull.s8 q1, d19, d14
+ vmull.s8 q6, d24, d15
+ vmull.s8 q8, d25, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vmov q8, q9 // slide the window of the upper row for the next call
+ vmov q9, q10
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vmov q11, q12 // slide the window of the lower row for the next call
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16 // Emits sum_<type>_lag2_<edge>_neon; the result is stored through r0 (store=1)
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr} // popped again at the end of sum_lag_n_body
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH
+ sub lr, r0, #1*GRAIN_WIDTH
+ vld1.8 {q9}, [r12] // load the previous block right above
+ vld1.8 {q12}, [lr] // the other edges reuse the window kept by sum_lag2_above_neon
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_left_above_neon // Left edge entry into sum_lag3_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH // three rows up, without the 3 byte offset used by the regular path
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #13 // rotate by 13 bytes so the data lines up as if loaded 3 bytes earlier
+ vext.8 q11, q11, q11, #13
+ b sum_lag3_above_start // continue in the shared code
+endfunc
+
+function sum_lag3_above_neon // In: r0 = output position, d26-d28 = coefficients; out: q2-q5 = 32 bit sums over the three rows above
+ sub r12, r0, #3*GRAIN_WIDTH + 3 // three rows up, three bytes to the left
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d20, d26[0]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d26[1]
+
+ vmull.s8 q0, d22, d20 // three rows up, byte offsets 0 and 1
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vext.8 q8, q11, q12, #2
+ vdup.8 d20, d26[2]
+ vext.8 q9, q11, q12, #3
+ vdup.8 d21, d26[3]
+
+ vaddl.s16 q2, d0, d12 // first two taps start the accumulators q2-q5
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d14
+ vaddl.s16 q5, d3, d15
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #4
+ vdup.8 d20, d26[4]
+ vext.8 q7, q11, q12, #5
+ vdup.8 d21, d26[5]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ sub r12, r0, #2*GRAIN_WIDTH + 3 // two rows up, three bytes to the left
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #6 // last byte offset of the row three up, taken before q11/q12 are reloaded
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d20, d26[6]
+ vdup.8 d21, d26[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d22, d21 // first tap of the row two up
+ vmull.s8 q7, d23, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #1
+ vdup.8 d20, d27[0]
+ vext.8 q7, q11, q12, #2
+ vdup.8 d21, d27[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #3
+ vdup.8 d20, d27[2]
+ vext.8 q9, q11, q12, #4
+ vdup.8 d21, d27[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ sub r12, r0, #1*GRAIN_WIDTH + 3 // one row up, three bytes to the left
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #5
+ vdup.8 d20, d27[4]
+ vext.8 q7, q11, q12, #6
+ vdup.8 d21, d27[5]
+
+ vld1.8 {q11, q12}, [r12] // the row two up has been fully extracted into q6/q7 by now
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vdup.8 d20, d27[6]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d27[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d22, d20 // one row up, byte offsets 0 and 1
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #2
+ vdup.8 d20, d28[0]
+ vext.8 q7, q11, q12, #3
+ vdup.8 d21, d28[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #4
+ vdup.8 d20, d28[2]
+ vext.8 q9, q11, q12, #5
+ vdup.8 d21, d28[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #6 // final tap from the rows above
+ vdup.8 d20, d28[4]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+
+ vaddw.s16 q2, q2, d0
+ vaddw.s16 q3, q3, d1
+ vaddw.s16 q4, q4, d2
+ vaddw.s16 q5, q5, d3
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16 // Emits sum_<type>_lag3_<edge>_neon; the result is stored through r0 (store=1)
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr} // popped again at the end of sum_lag_n_body
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon // Generates and stores r1 rows of 82 grain bytes at r0
+ push {r11,lr}
+1:
+ get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ subs r1, r1, #1 // the flags survive the stores below
+ store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon // Generates and stores r1 rows of 44 grain bytes at r0, one GRAIN_WIDTH stride per row
+ push {r11,lr}
+1:
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ subs r1, r1, #1 // the flags survive the stores and the add below
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon // Produces 16 output bytes at r0 from 16 new grain values plus a scaled copy of 16 bytes read at r11; q1 = byte mask, d22 = coefficient, q12 = shift
+ vld1.8 {q3}, [r11]! // input bytes, pointer advanced before it is saved
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q8, q0, q15
+ bl get_gaussian_neon
+ vrshl.s16 q9, q0, q15
+ vqmovn.s16 d0, q8 // 16 new grain values, saturated to 8 bit
+ vqmovn.s16 d1, q9
+
+ vand q3, q3, q1 // bytes cleared by the mask contribute nothing
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12 // rounding shift by the per lane count in q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0 // add the new grain
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2 // saturate back to 8 bit
+ vqmovn.s16 d5, q3
+ vst1.8 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+function get_grain_row_44_neon // Callable wrapper around get_grain_row_44; the row is returned in d16-d21
+ push {r11,lr}
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ pop {r11,pc}
+endfunc
+
+function add_uv_420_coeff_lag0_neon // Averages 2x2 bytes from the rows at r11 and r12 into q2, then continues at add_coeff_lag0_start
+ vld1.16 {q2, q3}, [r11]!
+ vld1.16 {q4, q5}, [r12]!
+ vpaddl.s8 q2, q2 // add horizontal pairs
+ vpaddl.s8 q3, q3
+ vpaddl.s8 q4, q4
+ vpaddl.s8 q5, q5
+ vadd.i16 q2, q2, q4 // add the two rows
+ vadd.i16 q3, q3, q5
+ vrshrn.s16 d4, q2, #2 // rounded average of 4 entries
+ vrshrn.s16 d5, q3, #2
+ b add_coeff_lag0_start // shared code inside add_uv_422_coeff_lag0_neon
+endfunc
+
+function add_uv_422_coeff_lag0_neon // Averages horizontal byte pairs from the row at r11, then scales them and adds the grain in q0; result in q2
+ vld1.16 {q2, q3}, [r11]!
+ vpaddl.s8 q2, q2 // add horizontal pairs
+ vpaddl.s8 q3, q3
+ vrshrn.s16 d4, q2, #1 // rounded average of 2 entries
+ vrshrn.s16 d5, q3, #1
+
+add_coeff_lag0_start: // also entered from add_uv_420_coeff_lag0_neon
+ vand q3, q2, q1 // q1 = byte mask
+ vmull.s8 q2, d6, d22 // d22 = coefficient
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12 // rounding shift by the per lane count in q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0 // add the grain supplied in q0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2 // saturate back to 8 bit
+ vqmovn.s16 d5, q3
+ bx lr
+endfunc
+
+.macro gen_grain_82 type // Emits generate_grain_<type>_8bpc_neon for the 82 byte wide layouts (y and uv_444)
+function generate_grain_\type\()_8bpc_neon, export=1 // r0 = output buffer; y: r1 = data; uv_444: r1 = second buffer read via r11, r2 = data, r3 = uv index
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH // input pointer, starting at the fourth row
+ mov r1, r2 // from here on r1 = data, as in the y variant
+ mul r12, r12, lr // uv * 28, added to the coefficient pointer below
+.endif
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry selected by the lag
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6 // entries are offsets relative to the table
+ vneg.s16 q15, q15 // negated so that vrshl shifts right
+
+.ifc \type, uv_444
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // 0xb524 when uv is 0, otherwise 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // mix the constant into the seed
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5 // jump to the code for the selected lag
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT // lag 0 for y: every row is plain random grain
+ bl generate_grain_rows_neon
+.else
+
+ mov r1, #3 // the first three rows are plain random grain
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13 // mask with the first 3 bytes cleared
+ vext.8 q14, q1, q0, #1 // mask with the last byte cleared
+ vneg.s16 q12, q12 // negated so that vrshl shifts right
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 16
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 64
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #2 // 5 * 16 + 2 = 82 bytes consumed per row
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp bound used by output_lag1_neon
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon // leaves the third row in q8-q12 and d26
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q9, q9, q10, q11
+ sum_\type\()_lag1 q10, q10, q11, q12
+ sum_\type\()_lag1 q12, q11, q12, q13, right
+ get_grain_2 d26
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
+ vmov q11, q10 // move the new row back into q8-q12 as the row above for the next pass
+ vmov q10, q9
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp bound used by output_lag2_neon
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // the two coefficients applied to previous outputs
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp bound used by output_lag3_neon
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8 // pack the three coefficients into r4,
+ orr r4, r4, r12, lsl #16 // unpacked again with sbfx in output_lag3_neon
+
+ mov r1, #3
+ vpush {d26} // get_grain_row overwrites d26
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type // \dst = number of rows left to generate after the first three
+.ifnc \type, uv_420
+ mov \dst, #GRAIN_HEIGHT-3 // every layout except uv_420
+.else
+ mov \dst, #SUB_GRAIN_HEIGHT-3 // uv_420 uses the reduced height
+.endif
+.endm
+
+.macro increment_y_ptr reg, type // Adjusts \reg after 3*32 bytes were consumed from the current row
+.ifnc \type, uv_420
+ sub \reg, \reg, #3*32-GRAIN_WIDTH // net movement: one GRAIN_WIDTH row
+.else
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32) // net movement: two GRAIN_WIDTH rows
+.endif
+.endm
+// gen_grain_44 type: emits generate_grain_<type>_8bpc_neon. The prologue prepares seed, shifts and rounding constants, then jumps through a table to the lag0..lag3 variant chosen by ar_coeff_lag.
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+ mov r12, r3 // r12 = 4th argument; scaled below into an offset within ar_coeffs_uv
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH-3 // r11 = 2nd argument + 3 rows - 3 bytes (presumably the luma grain buffer - confirm against caller)
+ mov r1, r2 // r1 = 3rd argument, used as base for the FGD_* field offsets
+ mul r12, r12, lr // r12 *= 28
+
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ adr r5, L(gen_grain_\type\()_tbl) // r5 = base of the jump table below
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry for this lag (offset relative to the table)
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = address of the selected lag variant
+ vneg.s16 q15, q15 // negated: a negative vrshl count shifts right
+
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // seed ^= 0xb524 when r12 == 0, else seed ^= 0x49d8
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5 // dispatch to L(generate_grain_<type>_lagN)
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5}
+.endif
+ mov r1, #3 // the first 3 rows are produced by the helper below
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.16 q12, r7 // r7 = ar_coeff_shift
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13 // q13 = 3 zero bytes followed by 13 bytes of 0xff (mask for the first chunk)
+ vext.8 q14, q1, q0, #7 // q14 = 9 bytes of 0xff followed by 7 zero bytes (mask for the last chunk)
+ vneg.s16 q12, q12 // -ar_coeff_shift
+
+1: // one iteration per remaining row
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ add r12, r11, #GRAIN_WIDTH
+.endif
+ vmov q1, q13 // first chunk: q1 = mask, q0 = grain from q8
+ vmov q0, q8
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, #255 // middle chunk: mask of all ones
+ vmov q0, q9
+ vmov q8, q2 // keep the first chunk's result (returned in q2)
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, q14 // last chunk: mask from q14
+ vmov q0, q10
+ vmov q9, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov q10, q2
+ subs r1, r1, #1 // flags consumed by the bgt below
+ increment_y_ptr r11, \type
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127 // presumably the maximum grain value for 8 bpc, used by helpers outside this chunk
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2 // r4 now points at ar_coeffs_uv[4]
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1: // one iteration per remaining row
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q10, q9, q10, q11, right
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d14, d15, d16, d17, d20, d21
+ vmov q9, q8 // carry the middle result over for the next iteration
+ vmov q8, q7 // carry the left result over for the next iteration
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // sign-extended coefficient 10
+ vmov.s8 r10, d29[3] // sign-extended coefficient 11
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1: // one iteration per remaining row
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48 // presumably the helpers advanced r0 by 48 bytes; step to the start of the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficients 21..23 are packed into the low three bytes of r4
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1: // one iteration per remaining row
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48 // same row step as in the lag2 variant
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+// gather_interleaved: for byte lanes off, off+2, off+4, off+6 load dst[lane] = byte at (r3 + src[lane]); the src1/dst1 and src2/dst2 streams are interleaved. Clobbers r11, r12, lr.
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ vmov.u8 r11, \src1[0+\off] // extract the index byte
+ vmov.u8 r12, \src2[0+\off]
+ add r11, r11, r3 // address = table base (r3) + index
+ vmov.u8 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11] // load the looked-up byte into the same lane
+ vmov.u8 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u8 r12, \src1[4+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u8 lr, \src2[4+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u8 r11, \src1[6+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u8 r12, \src2[6+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+// gather: full 8-lane table lookup for four d registers; each pair is handled in two passes (even lanes with off=0, odd lanes with off=1).
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
+ gather_interleaved \dst1, \dst3, \src1, \src3, 0
+ gather_interleaved \dst1, \dst3, \src1, \src3, 1
+ gather_interleaved \dst2, \dst4, \src2, \src4, 0
+ gather_interleaved \dst2, \dst4, \src2, \src4, 1
+.endm
+// gather32_neon: 32 byte lookups, d8-d11[i] = byte at (r3 + d0-d3[i]); all core registers are preserved.
+function gather32_neon
+ push {r11-r12,lr} // the gather macro uses r11, r12 and lr as scratch
+ gather d8, d9, d10, d11, d0, d1, d2, d3
+ pop {r11-r12,pc}
+endfunc
+// gather16_neon: 16 byte lookups, d8-d9[i] = byte at (r3 + d0-d1[i]); all core registers are preserved.
+function gather16_neon
+ push {r11-r12,lr} // gather_interleaved uses r11, r12 and lr as scratch
+ gather_interleaved d8, d9, d0, d1, 0 // even lanes
+ gather_interleaved d8, d9, d0, d1, 1 // odd lanes
+ pop {r11-r12,pc}
+endfunc
+// Horizontal overlap weights: the first 8 bytes multiply the grain of the neighbouring (old) block, the second 8 bytes multiply the current block; _0 is used with sx=0, _1 with sx=1.
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0 // old block: only the first two columns contribute
+ .byte 17, 27, 32, 32, 32, 32, 32, 32 // current block: 32 leaves a column unchanged after the rounding shift by 5
+endconst
+
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0 // old block: only the first column contributes
+ .byte 22, 32, 32, 32, 32, 32, 32, 32 // current block
+endconst
+// calc_offset: split a random value into offy (low 4 bits) and offx (remaining bits), each doubled when its subsampling flag is 0. Callers pass offx == src, so offy must be extracted first.
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+// add_offset: dst = src + stride * offy + offx (dst may be the same register as offy or src, as the callers in this file do).
+.macro add_offset dst, offx, offy, src, stride
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx // grain_lut += offx
+.endm
+// Entry stub for luma grain application: loads the stack arguments, sets up clip bounds and grain pointers, then jumps into one of the row loops of fgy_loop_neon (which also performs the return).
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
+function fgy_32x32_8bpc_neon, export=1
+ push {r4-r11,lr} // 9 registers = 36 bytes
+ vpush {q4-q7} // + 64 bytes, so the first stack argument is at sp + 100
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH // grain_lut stride
+
+ neg r4, r4 // negated: a negative vrshl count shifts right
+ vdup.16 q13, r4 // -scaling_shift
+ cmp r8, #0 // clip == 0? (flags consumed by the beq below)
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i8 q14, #16
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #9 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32 // grain_lut += FG_BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl) // jump table at the start of fgy_loop_neon
+
+ tst r10, #1 // bit 0 of type = y overlap (flags consumed by the beq below)
+ ldr r10, [r11, r10, lsl #2] // table entry for this type (offset relative to the table)
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r8, r8, #32 // grain_lut += FG_BLOCK_SIZE * bx
+
+ add r11, r11, r10 // r11 = address of the selected loop variant
+
+ beq 1f
+ // y overlap
+ vdup.8 d14, d24[0] // 27: weight of the top row for the first overlapped row
+ vdup.8 d15, d24[1] // 17: weight of the current row for the first overlapped row
+ mov r10, r7 // backup actual h
+ mov r7, #2 // the y-overlap loop only runs for 2 rows
+1:
+ bx r11 // r4 = old, r5 = current, r6 = top, r8 = top old grain pointers
+endfunc
+// Row loops entered via bx from fgy_32x32_8bpc_neon; table index = 2*ox + oy. Each variant restores q4-q7/r4-r11 and returns to the original caller.
+function fgy_loop_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1: // one iteration per row of 32 pixels
+.if \ox
+ vld1.8 {d8}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q2, q3}, [r6], r9 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r8], r9 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+ vld1.8 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24 // old * old-block weights
+ vmlal.s8 q4, d20, d25 // + current * current-block weights
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24 // same horizontal blend for the top row
+ vmlal.s8 q5, d4, d25
+ vqrshrn.s16 d20, q4, #5 // rounding shift by 5 with saturation to s8
+ vqrshrn.s16 d4, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d15 // vertical blend: current * d15
+ vmull.s8 q5, d21, d15
+ vmull.s8 q8, d22, d15
+ vmull.s8 q9, d23, d15
+ vmlal.s8 q4, d4, d14 // + top * d14
+ vmlal.s8 q5, d5, d14
+ vmlal.s8 q8, d6, d14
+ vmlal.s8 q9, d7, d14
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q8, #5
+ vqrshrn.s16 d23, q9, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+
+ bl gather32_neon // d8-d11 = scaling[src], using r3 as table base and q0-q1 as indices
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q2, d8 // scaling
+ vmovl.u8 q3, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q2 // scaling * grain
+ vmul.i16 q9, q9, q3
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8 // saturate back to unsigned 8 bit
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14 // clamp to the lower bound in q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15 // clamp to the upper bound in q15
+ vmin.u8 q1, q1, q15
+
+ subs r7, r7, #1 // flags consumed by the bgt below (NEON ops in between leave them alone)
+.if \oy
+ vdup.8 d14, d25[0] // weights for the second overlapped row: top 17
+ vdup.8 d15, d25[1] // current 27
+.endif
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2 // any rows left after the 2 overlapped ones?
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0) // continue in the variant without y overlap
+.endif
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+// fguv layout, sx, sy: emits the entry stub fguv_32x32_<layout>_8bpc_neon, which loads arguments and constants and then jumps into a row loop of fguv_loop_sx<sx>_neon (which also performs the return).
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ push {r4-r11,lr} // 36 bytes
+ vpush {q4-q7} // + 64 bytes, so the first stack argument is at sp + 100
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+ ldrd r8, r9, [sp, #116] // offsets, h
+ ldrd r10, r11, [sp, #124] // uv, is_id
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ add r10, r10, #FGD_UV_OFFSET
+ vld1.16 {d4[]}, [r12] // uv_luma_mult
+ vld1.16 {d4[2]}, [r10] // uv_offset
+ vld1.16 {d4[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg lr, lr // -scaling_shift
+
+ cmp r12, #0 // clipping requested? (flags consumed by the beq below)
+ vdup.16 q13, lr // -scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0 // is_id == 0?
+ vmov.i8 q14, #16
+ vmov.i8 q15, #240
+ beq 2f
+ // is_id
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ mov r10, #GRAIN_WIDTH // grain_lut stride
+
+ add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ add r4, r4, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1 // bit 0 of type = y overlap (flags consumed by the beq below)
+ ldr lr, [r12, lr, lsl #2] // table entry for this type (offset relative to the table)
+
+ add r12, r12, lr // r12 = address of the selected loop variant
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy) // number of overlapped rows: 2, or 1 when vertically subsampled
+
+1:
+
+.if \sy
+ vmov.i8 d6, #23 // y-overlap weight applied to the top row
+ vmov.i8 d7, #22 // y-overlap weight applied to the current row
+.else
+ vmov.i8 d6, #27 // first overlapped row: top weight
+ vmov.i8 d7, #17 // first overlapped row: current weight
+.endif
+
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+
+ bx r12 // r4 = old, r5 = current, r8 = top, r11 = top old grain pointers
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+// Row loops for sx=0 (32 chroma pixels per row), entered via bx from the fguv stubs; table index = 4*csfl + 2*ox + oy. All variants end at the shared epilogue at label 9.
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr // keep the remaining row count out of lr, which the bl below overwrites
+.endif
+1: // one iteration per row
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8, q9}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24 // old * old-block weights
+ vmlal.s8 q4, d20, d25 // + current * current-block weights
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24 // same horizontal blend for the top row
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5 // rounding shift by 5 with saturation to s8
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7 // vertical blend: current * d7
+ vmull.s8 q5, d21, d7
+ vmull.s8 q6, d22, d7
+ vmull.s8 q7, d23, d7
+ vmlal.s8 q4, d16, d6 // + top * d6
+ vmlal.s8 q5, d17, d6
+ vmlal.s8 q6, d18, d6
+ vmlal.s8 q7, d19, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q6, #5
+ vqrshrn.s16 d23, q7, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if !\csfl
+ vld1.8 {q8, q9}, [r1, :128] // src
+ vmovl.u8 q4, d0 // luma widened to 16 bit
+ vmovl.u8 q5, d1
+ vmovl.u8 q6, d2
+ vmovl.u8 q7, d3
+ vmovl.u8 q0, d16 // src widened to 16 bit
+ vmovl.u8 q1, d17
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmul.i16 q4, q4, d4[0] // luma * uv_luma_mult
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q6, q6, d4[0]
+ vmul.i16 q7, q7, d4[0]
+ vmul.i16 q0, q0, d4[1] // src * uv_mult
+ vmul.i16 q1, q1, d4[1]
+ vmul.i16 q8, q8, d4[1]
+ vmul.i16 q9, q9, d4[1]
+ vqadd.s16 q4, q4, q0 // saturating sum of both products
+ vqadd.s16 q5, q5, q1
+ vqadd.s16 q6, q6, q8
+ vqadd.s16 q7, q7, q9
+ vdup.16 q0, d4[2] // uv_offset in every lane
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vshr.s16 q6, q6, #6
+ vshr.s16 q7, q7, #6
+ vadd.i16 q4, q4, q0 // + uv_offset
+ vadd.i16 q5, q5, q0
+ vadd.i16 q6, q6, q0
+ vadd.i16 q7, q7, q0
+ vqmovun.s16 d0, q4 // saturate to u8: q0-q1 become the scaling indices
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6
+ vqmovun.s16 d3, q7
+.endif
+
+ bl gather32_neon // d8-d11 = scaling lookups (table base r3) indexed by q0-q1
+
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8 // saturate back to unsigned 8 bit
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14 // clamp to the lower bound in q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15 // clamp to the upper bound in q15
+ vmin.u8 q1, q1, q15
+
+ subs r9, r9, #1 // flags consumed by the bgt below
+.if \oy
+ vdup.8 d6, d25[0] // weights for the second overlapped row: top
+ vdup.8 d7, d25[1] // current
+.endif
+
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0 // any rows left after the overlapped ones?
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) // continue in the variant without y overlap
+.endif
+ b 9f // shared epilogue
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9: // undo the push/vpush done by the fguv entry stub and return to its caller
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+// Row loops for sx=1 (16 chroma pixels per row, 32 luma pixels read), entered via bx from the fguv stubs; table index = 4*csfl + 2*ox + oy. All variants end at the shared epilogue at label 9.
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr // keep the remaining row count out of lr, which the bl below overwrites
+.endif
+1: // one iteration per row
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10}, [r5], r10 // grain_lut
+ vld1.8 {q11}, [r1, :128], r2 // src
+
+.if \ox
+ vmull.s8 q4, d8, d24 // old * old-block weights
+ vmlal.s8 q4, d20, d25 // + current * current-block weights
+.endif
+
+ vpaddl.u8 q0, q0 // sum horizontally adjacent luma pairs into 16-bit lanes
+ vpaddl.u8 q1, q1
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24 // same horizontal blend for the top row
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5 // rounding shift by 5 with saturation to s8
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7 // vertical blend: current * d7
+ vmull.s8 q5, d21, d7
+ vmlal.s8 q4, d16, d6 // + top * d6
+ vmlal.s8 q5, d17, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if \csfl
+ vrshrn.u16 d0, q0, #1 // rounded average of each luma pair, narrowed to u8: the scaling indices
+ vrshrn.u16 d1, q1, #1
+.else
+ vrshr.u16 q4, q0, #1 // rounded average of each luma pair, kept at 16 bit
+ vrshr.u16 q5, q1, #1
+ vmovl.u8 q0, d22 // src widened to 16 bit
+ vmovl.u8 q1, d23
+ vmul.i16 q4, q4, d4[0] // luma * uv_luma_mult
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q0, q0, d4[1] // src * uv_mult
+ vmul.i16 q1, q1, d4[1]
+ vqadd.s16 q4, q4, q0 // saturating sum of both products
+ vqadd.s16 q5, q5, q1
+ vdup.16 q0, d4[2] // uv_offset in every lane
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vadd.i16 q4, q4, q0 // + uv_offset
+ vadd.i16 q5, q5, q0
+ vqmovun.s16 d0, q4 // saturate to u8: q0 becomes the scaling indices
+ vqmovun.s16 d1, q5
+.endif
+
+ bl gather16_neon // d8-d9 = scaling lookups (table base r3) indexed by q0
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+
+ vaddw.u8 q8, q8, d22 // *src + noise
+ vaddw.u8 q9, q9, d23
+
+ vqmovun.s16 d0, q8 // saturate back to unsigned 8 bit
+ vqmovun.s16 d1, q9
+
+ vmax.u8 q0, q0, q14 // clamp to the lower bound in q14
+ vmin.u8 q0, q0, q15 // clamp to the upper bound in q15
+
+ subs r9, r9, #1 // flags consumed by the bgt below
+.if \oy
+ vswp d6, d7 // exchange top/current weights for the next overlapped row
+.endif
+ vst1.8 {q0}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0 // any rows left after the overlapped ones?
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // continue in the variant without y overlap
+.endif
+
+ b 9f // shared epilogue
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9: // undo the push/vpush done by the fguv entry stub and return to its caller
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S
new file mode 100644
index 0000000000..d10bffff2f
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain16.S
@@ -0,0 +1,2137 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+// increment_seed steps, shift: compute steps new feedback bits from the state in r2. With shift=1 the state is shifted right and the bits inserted at the top of the low 16 bits; with shift=0 they are only ORed in above bit 15. Clobbers r11, r12, lr.
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
+// read_rand dest, bits, age: extract a bits-wide field of the state in r2, starting at bit (16 - bits - age); r2 is left unchanged.
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+// read_shift_rand dest, bits: extract a bits-wide field starting at bit (17 - bits) of r2, then shift r2 right by one to consume one pending bit.
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+// get_gaussian_neon: fetch 8 consecutive 16-bit table entries selected by the random state (two batches of 4 seed steps); no scaling shift is applied here.
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+function get_gaussian_neon
+ push {r5-r6,lr} // lr is also used as scratch by increment_seed
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1 // table entries are 2 bytes wide
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4 // second batch of 4; the value for lane 3 was already extracted into r6 above
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+// get_grain_2_neon: fetch 2 table entries (r2 = seed, r3 = table) into d0[0..1] and apply vrshl by d30; clobbers r12. The get_grain_2 macro below copies the result to dst.
+function get_grain_2_neon
+ push {r11,lr} // r11 and lr are used as scratch by increment_seed
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1 // table entries are 2 bytes wide
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30 // rounding shift by the per-lane counts in d30 (presumably a negated shift set up by the caller, i.e. a right shift - confirm)
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+// get_grain_4_neon: fetch 4 table entries (r2 = seed, r3 = table) into d0 and apply vrshl by d30; clobbers r12. The get_grain_4 macro below copies the result to dst.
+function get_grain_4_neon
+ push {r11,lr} // r11 and lr are used as scratch by increment_seed
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1 // table entries are 2 bytes wide
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[2]}, [r11]
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30 // rounding shift by the per-lane counts in d30 (presumably a negated shift set up by the caller - confirm)
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_4 dst
+ bl get_grain_4_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+// output_lag n: emits output_lag<n>_neon, the serial part of the filter. For lag1 only r6 is a previous output and r8/r10 hold the rounding constants; for lag2/lag3 the constants are rebuilt in r0/lr from the shifts in r7/r9.
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1 // temporarily decrement both shifts to build the rounding constants
+ sub r9, r9, #1
+ lsl r0, r0, r7 // r0 = 1 << (r7 - 1)
+ lsl lr, lr, r9 // lr = 1 << (r9 - 1)
+ add r7, r7, #1 // restore the shifts
+ add r9, r9, #1
+.endif
+1: // one iteration per produced entry
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0] // next 32-bit sum from q1
+ lsl r12, r12, #1 // index -> byte offset (2-byte table entries)
+ vext.8 q0, q0, q0, #2 // rotate the output vector down by one 16-bit lane
+ ldrsh r12, [r3, r12] // table entry at that offset (r3 = table base)
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr} // save the rounding constant; lr is reused as grain_min for the clamp
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ push {r1-r3} // free three registers for the unpacked coefficients
+ sbfx r1, r4, #0, #8 // r4 holds three signed 8-bit coefficients in its low 3 bytes
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // += *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8 // shift the history of previous outputs
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr} // save the rounding constant; lr is reused as grain_min for the clamp
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.endif
+ it gt
+ movgt r6, r5 // clamp to grain_max
+ cmp r6, lr
+ it lt
+ movlt r6, lr // clamp to grain_min
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1 // flags consumed by the bgt below
+ vext.8 q1, q1, q1, #4 // rotate the sums so the next one sits in lane 0
+ vmov.16 d1[3], r6 // insert the new entry into the top lane of q0
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+// sum_lag1_above_neon: for 8 entries, accumulate the three taps of the row above into q2 (entries 0-3) and q3 (entries 4-7).
+// In: r0 = output position, q8/q9 = previous and current block of the row above, d27/d28/d29 = left/mid/right coefficients. Shifts q8/q9 along for the next block.
+function sum_lag1_above_neon
+ sub r12, r0, #1*GRAIN_WIDTH*2 - 16 // address of the next 8 entries in the row above
+ vld1.16 {q10}, [r12] // load top right
+
+ vext.8 q0, q8, q9, #14 // top left, top mid
+ vext.8 q1, q9, q10, #2 // top mid, top right
+
+ vmull.s16 q2, d18, d28 // mid tap
+ vmlal.s16 q2, d0, d27 // + left tap
+ vmlal.s16 q2, d2, d29 // + right tap
+ vmull.s16 q3, d19, d28
+ vmlal.s16 q3, d1, d27
+ vmlal.s16 q3, d3, d29
+
+ vmov q8, q9 // slide the window: mid becomes left
+ vmov q9, q10 // right becomes mid
+
+ bx lr
+endfunc
+// sum_lag_n_body: shared body of the sum_<type>_<lag>_<edge>_neon functions. Sums the taps from the rows above, optionally adds a luma-derived term, runs output_<lag>_neon and stores 8 entries at r0. Variants with an identical tail branch to an existing *_start label.
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH*2 // second row, one row below r11
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpadd.i16 d0, d0, d1 // add horizontally adjacent pairs in both rows
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d12, d12, d13
+ vpadd.i16 d13, d14, d15
+ vadd.i16 q0, q0, q6 // add the two rows
+ vpop {q6-q7}
+ vrshr.s16 q0, q0, #2 // rounded average of each 2x2 group
+.endif
+.ifc \type, uv_422
+ vld1.16 {q0, q1}, [r11]!
+ vpadd.i16 d0, d0, d1 // add horizontally adjacent pairs
+ vpadd.i16 d1, d2, d3
+ vrshr.s16 q0, q0, #1 // rounded average of each pair
+.endif
+.ifc \type, uv_444
+ vld1.16 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+ vmovl.s8 q6, d13 // widen the coefficient to 16 bit (lands in d12/d13)
+.endif
+ vmlal.s16 q2, d0, d13 // add coefficient * the values prepared in q0
+ vmlal.s16 q3, d1, d13
+.endif
+.if \uv_layout && \elems == 8
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11] // the first three entries of the row are taken from the table directly
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vext.8 q2, q2, q2, #12 // rotate so the sum for the fourth entry sits in lane 0
+.ifc \lag, lag3
+ vmov.s16 r10, d1[1]
+.endif
+.ifnc \lag, lag1
+ vmov.s16 r8, d1[2]
+.endif
+ vmov.s16 r6, d1[3] // seed the previous-output registers with those entries
+
+ vmov q1, q2
+ mov r1, #1 // only the fourth entry is produced by the filter
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3 // sums for entries 4-7
+.ifc \edge, right
+ mov r1, #3 // only 3 filtered entries in the second half
+ bl output_\lag\()_neon
+ read_shift_rand r12, 11
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r12] // the last entry is taken from the table directly
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #2 // append it after the filtered entries
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3 // consume the 3 bits output_lag did not use
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11] // three entries taken from the table directly
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #14 // filtered entry first, followed by the lanes of q1
+.endif
+ vst1.16 {q0}, [r0]!
+ pop {r11}
+ pop {r1, pc} // matches the push {r1, lr} done by the sum_lagN_func wrappers
+.endif
+.endm
+// sum_lag1_func: emits sum_<type>_lag1_<edge>_neon; the left-edge variant first loads the block right above into q9, then all variants continue in sum_lag_n_body (which contains the return).
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr} // popped again by the tail of sum_lag_n_body
+.ifc \edge, left
+ sub r12, r0, #1*GRAIN_WIDTH*2 // same column, one row up
+ vld1.8 {q9}, [r12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+// sum_lag2_above_neon: for 8 entries, accumulate the 10 taps from the two rows above into q2 (entries 0-3) and q3 (entries 4-7); coefficients are the bytes d28[0..7] and d29[0..1].
+// In: r0 = output position, q8/q9 = previous and current block two rows up, q11/q12 = the same one row up. Shifts q8/q9 and q11/q12 along for the next block.
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH*2 - 16 // next 8 entries, two rows up
+ sub lr, r0, #1*GRAIN_WIDTH*2 - 16 // next 8 entries, one row up
+ vld1.16 {q10}, [r12] // load top right
+ vld1.16 {q13}, [lr]
+
+ vdup.8 d10, d28[0]
+ vext.8 q0, q8, q9, #12 // top left, top mid
+ vdup.8 d12, d28[1]
+ vext.8 q1, q8, q9, #14
+ vdup.8 d14, d28[3]
+ vext.8 q4, q9, q10, #2 // top mid, top right
+ vmovl.s8 q5, d10 // widen the coefficients to 16 bit
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmull.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d8, d14
+ vmull.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d28[4]
+ vext.8 q0, q9, q10, #4 // top mid, top right
+ vdup.8 d12, d28[5]
+ vext.8 q1, q11, q12, #12 // top left, top mid
+ vdup.8 d14, d28[6]
+ vext.8 q4, q11, q12, #14
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmlal.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d8, d14
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d29[0]
+ vext.8 q0, q12, q13, #2 // top mid, top right
+ vdup.8 d12, d29[1]
+ vext.8 q1, q12, q13, #4
+
+ vdup.8 d14, d28[2] // coefficient for the entry directly two rows up (q9)
+ vdup.8 d8, d28[7] // coefficient for the entry directly one row up (q12)
+
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q4, d8
+
+ vmlal.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d18, d14
+ vmlal.s16 q2, d24, d8
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d19, d14
+ vmlal.s16 q3, d25, d8
+
+ vmov q8, q9 // slide both row windows by one block
+ vmov q9, q10
+
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+// sum_lag2_func: emits sum_<type>_lag2_<edge>_neon; the left-edge variant first loads the blocks right above (two rows) into q9 and q12, then all variants continue in sum_lag_n_body (which contains the return).
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr} // popped again by the tail of sum_lag_n_body
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH*2 // same column, two rows up
+ sub lr, r0, #1*GRAIN_WIDTH*2 // same column, one row up
+ vld1.16 {q9}, [r12] // load the previous block right above
+ vld1.16 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+// sum_lag3_left_above_neon: left-edge entry into sum_lag3_above_neon. Loads the row three up starting at the current column and rotates it by 6 bytes to match the layout that a load from 3 entries further left would give.
+
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH*2 // same column, three rows up
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #10 // q12 = last 3 entries of q11 followed by the first 5 of q12
+ vext.8 q11, q11, q11, #10 // q11 rotated up by 3 entries; its first 3 lanes hold wrapped-around values
+ b sum_lag3_above_start
+endfunc
+// sum_lag3_above_neon: for 8 entries, accumulate the 21 taps from the three rows above (7 per row, columns -3..+3) into q2 (entries 0-3) and q3 (entries 4-7); coefficients are the bytes d26[0..7], d27[0..7], d28[0..4].
+function sum_lag3_above_neon
+ movw r12, #(3*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12 // three rows up, 3 entries to the left
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start: // also entered from sum_lag3_left_above_neon with q11/q12 preloaded
+ vdup.8 d12, d26[0]
+ vext.8 q1, q11, q12, #2
+ vdup.8 d14, d26[1]
+ vext.8 q4, q11, q12, #4
+ vdup.8 d16, d26[2]
+ vext.8 q5, q11, q12, #6
+ vdup.8 d18, d26[3]
+ vmovl.s8 q6, d12 // widen the coefficients to 16 bit
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ movw r12, #(2*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12 // two rows up, 3 entries to the left
+
+ vmull.s16 q2, d22, d12 // row -3, taps 0-3
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmull.s16 q3, d23, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d26[4]
+ vext.8 q0, q11, q12, #8
+ vdup.8 d14, d26[5]
+ vext.8 q1, q11, q12, #10
+ vdup.8 d16, d26[6]
+ vext.8 q4, q11, q12, #12
+ vld1.8 {q11, q12}, [r12] // from here on q11/q12 hold the row two up
+ vdup.8 d18, d26[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12 // row -3, taps 4-6
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d22, d18 // row -2, tap 0
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d23, d18
+
+ vdup.8 d12, d27[0]
+ vext.8 q0, q11, q12, #2
+ vdup.8 d14, d27[1]
+ vext.8 q1, q11, q12, #4
+ vdup.8 d16, d27[2]
+ vext.8 q4, q11, q12, #6
+ vdup.8 d18, d27[3]
+ vext.8 q5, q11, q12, #8
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 // one row up, 3 entries to the left
+
+ vmlal.s16 q2, d0, d12 // row -2, taps 1-4
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d27[4]
+ vext.8 q0, q11, q12, #10
+ vdup.8 d14, d27[5]
+ vext.8 q1, q11, q12, #12
+ vld1.8 {q11, q12}, [r12] // from here on q11/q12 hold the row one up
+ vdup.8 d16, d27[6]
+ vdup.8 d18, d27[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vext.8 q5, q11, q12, #2
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12 // row -2, taps 5-6
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d22, d16 // row -1, taps 0-1
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d23, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[0]
+ vext.8 q0, q11, q12, #4
+ vdup.8 d14, d28[1]
+ vext.8 q1, q11, q12, #6
+ vdup.8 d16, d28[2]
+ vext.8 q4, q11, q12, #8
+ vdup.8 d18, d28[3]
+ vext.8 q5, q11, q12, #10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12 // row -1, taps 2-5
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[4]
+ vext.8 q0, q11, q12, #12
+ vmovl.s8 q6, d12
+
+ vmlal.s16 q2, d0, d12 // row -1, tap 6
+ vmlal.s16 q3, d1, d12
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon // Defines one lag-3 sum function per type/edge; the body comes entirely from sum_lag_n_body
+ push {r1, lr} // no matching pop is visible here; presumably sum_lag_n_body restores these and returns - TODO confirm
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left // instantiate for each plane type and edge; elems defaults to 8 when omitted
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7 // right edge of the full-width layouts: elems = 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1 // right edge of the 422 and 420 layouts: elems = 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon // Writes r1 rows of 80 + 2 entries of scaled noise at r0, advancing r0 past each row
+ push {r10-r11,lr}
+1:
+ mov r10, #80 // entries left in the 8-wide part of the row
+2:
+ bl get_gaussian_neon // presumably returns 8 noise values in q0 (defined elsewhere)
+ vrshl.s16 q0, q0, q15 // q15 is set up by the callers as a negated shift, so this is a rounding right shift
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]! // store 8 entries and advance r0 by 16 bytes
+ bgt 2b
+ get_grain_2 d0
+ subs r1, r1, #1 // one row done
+ vst1.32 {d0[0]}, [r0]! // store 32 bits (2 entries) to finish the row
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon // Writes r1 rows of 40 + 4 entries of scaled noise at r0, stepping r0 by GRAIN_WIDTH*2 bytes per row
+ push {r10-r11,lr}
+1:
+ mov r10, #40 // entries left in the 8-wide part of the row
+2:
+ bl get_gaussian_neon // presumably returns 8 noise values in q0 (defined elsewhere)
+ vrshl.s16 q0, q0, q15 // q15 is set up by the callers as a negated shift, so this is a rounding right shift
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]! // store 8 entries and advance r0 by 16 bytes
+ bgt 2b
+ get_grain_4 d0
+ subs r1, r1, #1 // one row done
+ vst1.16 {d0}, [r0] // store the last 4 entries without advancing r0
+ add r0, r0, #GRAIN_WIDTH*2-80 // r0 already moved 80 bytes in this row; move to the start of the next row
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon // Produces 8 chroma grain entries at r0: new noise plus (masked input from r11 * coefficient d22, shifted by q12), clamped to [q10, q9]
+ vld1.16 {q3}, [r11]! // 8 input entries, r11 advanced by 16 bytes
+gen_grain_uv_lag0_8_start: // entry point for the subsampled 8-wide variants, which provide q3 themselves
+ push {r11,lr} // r11 is restored on exit; presumably get_gaussian_neon overwrites it - TODO confirm
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15 // scale the noise (q15 holds a negated shift set up by the caller)
+gen_grain_uv_lag0_8_add: // entry point for the 4-wide variants, which push r11/lr and provide q0 and d6 themselves
+ vand q3, q3, q1 // q1 is a per-lane mask chosen by the caller (edge handling)
+ vmull.s16 q2, d6, d22
+ vmull.s16 q3, d7, d22
+ vrshl.s32 q2, q2, q12 // q12 holds the negated coefficient shift (see the callers), i.e. a rounding right shift
+ vrshl.s32 q3, q3, q12
+ vqmovn.s32 d4, q2 // narrow back to 16 bit with saturation
+ vqmovn.s32 d5, q3
+ vqadd.s16 q2, q2, q0 // add the new noise
+ vmin.s16 q2, q2, q9 // clamp to the upper bound in q9
+ vmax.s16 q2, q2, q10 // clamp to the lower bound in q10
+ vst1.16 {q2}, [r0]! // store 8 entries and advance r0 by 16 bytes
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon // Averages 2x2 groups of input entries from two rows at r11 into 8 values in q3, then continues at gen_grain_uv_lag0_8_start
+ add r12, r11, #GRAIN_WIDTH*2 // r12 = the row below r11
+ vld1.16 {q2,q3}, [r11]! // 16 entries of the first row, r11 advanced by 32 bytes
+ vld1.16 {q4,q5}, [r12] // 16 entries of the second row
+ vpadd.i16 d4, d4, d5 // sum horizontally adjacent pairs
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d8, d8, d9
+ vpadd.i16 d9, d10, d11
+ vadd.i16 q2, q2, q4 // add the two rows
+ vrshr.s16 q3, q2, #2 // rounded sum / 4
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon // Averages horizontal pairs of 16 input entries at r11 into 8 values in q3, then continues at gen_grain_uv_lag0_8_start
+ vld1.16 {q2,q3}, [r11]! // 16 entries, r11 advanced by 32 bytes
+ vpadd.i16 d4, d4, d5 // sum horizontally adjacent pairs
+ vpadd.i16 d5, d6, d7
+ vrshr.s16 q3, q2, #1 // rounded sum / 2
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon // 4-wide variant: averages 2x2 groups of 8 entries from two rows into d6, then continues at gen_grain_uv_lag0_8_add
+ add r12, r11, #GRAIN_WIDTH*2 // r12 = the row below r11
+ vld1.16 {q2}, [r11] // 8 entries of the first row
+ vld1.16 {q0}, [r12] // 8 entries of the second row
+ add r11, r11, #32 // advance by 32 bytes, the same step as the 8-wide variant
+ vpadd.i16 d4, d4, d5 // sum horizontally adjacent pairs
+ vpadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d0 // add the two rows
+ vrshr.s16 d6, d4, #2 // rounded sum / 4; only d6 is written, d7 keeps its previous contents
+ push {r11,lr} // matches the pop {r11,pc} at the end of the shared tail
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon // 4-wide variant: averages horizontal pairs of 8 entries at r11 into d6, then continues at gen_grain_uv_lag0_8_add
+ vld1.16 {q2}, [r11] // 8 entries
+ add r11, r11, #32 // advance by 32 bytes, the same step as the 8-wide variant
+ vpadd.i16 d4, d4, d5 // sum horizontally adjacent pairs
+ vrshr.s16 d6, d4, #1 // rounded sum / 2; only d6 is written, d7 keeps its previous contents
+ push {r11,lr} // matches the pop {r11,pc} at the end of the shared tail
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1 // Fills the grain buffer at r0 with rows of 80 + 2 entries; jumps to a per-lag handler via the table below
+ push {r4-r11,lr} // 9 registers = 36 bytes, so the first stack argument is at [sp, #36]
+
+.ifc \type, uv_444
+ ldr r4, [sp, #36] // first stack argument; fed to clz below like r2 in the y variant
+ mov r12, r3 // r3 selects the coefficient set (scaled by 28 below)
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH*2 // r11 = r1 + 3 rows; read as input by the lag0 helper
+ mov r1, r2 // r1 = base pointer for the FGD_* field loads below
+ mul r12, r12, lr // r12 = r3 * 28, the byte offset added to the coefficient pointer below
+ clz lr, r4
+.else
+ clz lr, r2
+.endif
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry for this lag: handler offset relative to the table
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = handler address, used by bx r5 below
+ vneg.s16 q15, q15 // negated so that vrshl by q15 is a rounding right shift
+
+.ifc \type, uv_444
+ push {lr} // lr is borrowed as scratch for the second constant
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // r10 = 0xb524 when r12 == 0, otherwise 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // xor the selected constant into the seed
+ pop {lr}
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr // bitdepth_min_8
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5 // jump to the lag0-lag3 handler selected above
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon // lag 0, y: every row is plain scaled noise
+.else
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3
+ bl generate_grain_rows_neon // the first 3 rows are plain scaled noise
+ mov r1, #GRAIN_HEIGHT-3 // rows left for the loop below
+
+ vdup.32 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5 // grain_max
+ vdup.16 q10, r6 // grain_min
+ vext.8 q13, q0, q1, #10 // mask for the first block: lanes 0-2 zero, lanes 3-7 all ones
+ vext.8 q14, q1, q0, #2 // mask for the last block: lanes 0-6 all ones, lane 7 zero
+ vneg.s32 q12, q12 // -ar_coeff_shift, used as a rounding right shift in the helper
+ vmovl.s8 q11, d22 // widen the coefficient to 16 bit
+
+1:
+ vmov q1, q13 // left-edge mask for the first 8 entries
+ bl gen_grain_uv_444_lag0_neon // 8
+ vmov.i8 q1, #255 // no masking for the middle blocks
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ vmov q1, q14 // right-edge mask for the last 8 entries
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #4 // advance the input pointer by 2 more entries (4 bytes)
+ vst1.32 {d16[0]}, [r0]! // store 32 bits (2 entries) to finish the row
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2 // r4 now points at coefficient [4]
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon // the first 3 rows are plain scaled noise
+ vmovl.s8 q13, d27 // widen coefficient [0] to 16 bit
+ vmovl.s8 q12, d29 // widen coefficient [2] into q12 (d24/d25)
+ vmovl.s8 q14, d28 // widen coefficient [1]; q14 is d28/d29, so this overwrites d29
+ vmov d29, d24 // put the widened coefficient [2] back into d29
+.ifc \type, uv_444
+ vmovl.s8 q6, d13 // widen coefficient [4]
+.endif
+
+ mov r1, #GRAIN_HEIGHT - 3 // rows left for the loop below
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4 // advance the input pointer by 2 more entries (4 bytes)
+.endif
+ vst1.32 {d16[0]}, [r0]! // store 32 bits (2 entries) to finish the row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // coefficient byte 10, sign-extended
+ vmov.s8 r10, d29[3] // coefficient byte 11, sign-extended (replaces the rounding constant in r10)
+
+ mov r1, #3
+ bl generate_grain_rows_neon // the first 3 rows are plain scaled noise
+
+ mov r1, #GRAIN_HEIGHT - 3 // rows left for the loop below
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4 // advance the input pointer by 2 more entries (4 bytes)
+.endif
+ vst1.32 {d16[0]}, [r0]! // store 32 bits (2 entries) to finish the row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficient byte 21 (zero-extended)
+ vmov.u8 r10, d28[6] // coefficient byte 22
+ vmov.u8 r12, d28[7] // coefficient byte 23
+
+ orr r4, r4, r10, lsl #8 // pack bytes 21-23 into the low 3 bytes of r4
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ vpush {d26} // keep d26 (coefficient bytes 0-7) across the call; presumably the callee overwrites it - TODO confirm
+ bl generate_grain_rows_neon // the first 3 rows are plain scaled noise
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3 // rows left for the loop below
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4 // advance the input pointer by 2 more entries (4 bytes)
+.endif
+ vst1.32 {d16[0]}, [r0]! // store 32 bits (2 entries) to finish the row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type // Loads dst with the number of rows left after the first 3, depending on the layout
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3 // uv_420 uses SUB_GRAIN_HEIGHT
+.else
+ mov \dst, #GRAIN_HEIGHT-3 // every other type uses GRAIN_HEIGHT
+.endif
+.endm
+
+.macro increment_y_ptr reg, type // Moves reg to the next input row, given that 6*32 bytes of the current row were already stepped over
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) // net advance of two rows
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2 // net advance of one row; written as a subtraction, presumably because the net offset is negative for this GRAIN_WIDTH
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1 // Fills the subsampled chroma grain buffer at r0 with rows of 40 + 4 entries; jumps to a per-lag handler via the table below
+ push {r4-r11,lr} // 9 registers = 36 bytes, so the first stack argument is at [sp, #36]
+
+ ldr r4, [sp, #36] // first stack argument; fed to clz below to derive -bitdepth_min_8
+ mov r12, r3 // r3 selects the coefficient set (scaled by 28 below)
+ movw r11, #(3*GRAIN_WIDTH-3)*2
+ mov lr, #28
+ add r11, r1, r11 // r11 = r1 + 3 rows - 3 entries; read as input by the lag0 helpers
+ mov r1, r2 // r1 = base pointer for the FGD_* field loads below
+ mul r12, r12, lr // r12 = r3 * 28, the byte offset added to the coefficient pointer below
+ clz lr, r4
+
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2] // table entry for this lag: handler offset relative to the table
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6 // r5 = handler address, used by bx r5 below
+ vneg.s16 q15, q15 // negated so that vrshl by q15 is a rounding right shift
+
+ push {lr} // lr is borrowed as scratch for the second constant
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr // r10 = 0xb524 when r12 == 0, otherwise 0x49d8
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10 // xor the selected constant into the seed
+ pop {lr}
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr // bitdepth_min_8
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5 // jump to the lag0-lag3 handler selected above
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5}
+.endif
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon // the first 3 rows are plain scaled noise
+ set_height r1, \type
+
+ vdup.32 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5 // grain_max
+ vdup.16 q10, r6 // grain_min
+ vext.8 q13, q0, q1, #10 // mask for the first block: lanes 0-2 zero, lanes 3-7 all ones
+ vext.8 q14, q1, q0, #14 // mask for the last (4-wide) block: only lane 0 all ones
+ vneg.s32 q12, q12 // -ar_coeff_shift, used as a rounding right shift in the helper
+ vmovl.s8 q11, d22 // widen the coefficient to 16 bit
+
+1:
+ vmov q1, q13 // left-edge mask for the first 8 entries
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ vmov.i8 q1, #255 // no masking for the middle blocks
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ vmov q1, q14 // right-edge mask for the final 4-wide block
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // the 6 helper calls advanced r0 by 16 bytes each; move to the next row
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2 // r4 now points at coefficient [4]
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon // the first 3 rows are plain scaled noise
+ vmovl.s8 q13, d27 // widen coefficient [0] to 16 bit
+ vmovl.s8 q12, d29 // widen coefficient [2] into q12 (d24/d25)
+ vmovl.s8 q14, d28 // widen coefficient [1]; q14 is d28/d29, so this overwrites d29
+ vmov d29, d24 // put the widened coefficient [2] back into d29
+ vmovl.s8 q6, d13 // widen coefficient [4]
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // move r0 to the next row; assumes each of the 6 calls advanced it by 16 bytes
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2] // coefficient byte 10, sign-extended
+ vmov.s8 r10, d29[3] // coefficient byte 11, sign-extended (replaces the rounding constant in r10)
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon // the first 3 rows are plain scaled noise
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // move r0 to the next row; assumes each of the 6 calls advanced it by 16 bytes
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5] // coefficient byte 21 (zero-extended)
+ vmov.u8 r10, d28[6] // coefficient byte 22
+ vmov.u8 r12, d28[7] // coefficient byte 23
+
+ orr r4, r4, r10, lsl #8 // pack bytes 21-23 into the low 3 bytes of r4
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon // the first 3 rows are plain scaled noise
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16 // move r0 to the next row; assumes each of the 6 calls advanced it by 16 bytes
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off // Table lookup: uses 16-bit lanes of src1-src4 as byte offsets from r3 and loads the bytes into lanes of dst1/dst2; overwrites r11, r12 and lr
+ vmov.u16 r11, \src1[0+\off] // index for dst1 byte lane 0+off
+ vmov.u16 r12, \src3[0+\off] // index for dst2 byte lane 0+off
+ add r11, r11, r3 // r3 is the table base
+ vmov.u16 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11]
+ vmov.u16 r11, \src3[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u16 r12, \src2[0+\off] // src2 lanes feed dst1 byte lanes 4-7
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u16 lr, \src4[0+\off] // src4 lanes feed dst2 byte lanes 4-7
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u16 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u16 r12, \src4[2+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 // Full lookup of 32 values: src1-src8 hold the 16-bit indices, dst1-dst4 receive the bytes
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
+.endm
+
+function gather32_neon // Looks up 32 bytes from the table at r3 using the 16-bit indices in q0-q3; results go to d8-d11 (q4, q5)
+ push {r11-r12,lr} // the gather macro uses r11, r12 and lr as scratch
+ gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon // Looks up 16 bytes from the table at r3 using the 16-bit indices in q0-q1; results go to d8 and d9
+ push {r11-r12,lr} // the gather_interleaved macro uses r11, r12 and lr as scratch
+ gather_interleaved d8, d9, d0, d1, d2, d3, 0
+ gather_interleaved d8, d9, d0, d1, d2, d3, 1
+ pop {r11-r12,pc}
+endfunc
+
+const overlap_coeffs_0, align=4
+ .short 27, 17, 0, 0 // first 4 values (loaded into d24): weights applied to the old grain in the x overlap
+ .short 17, 27, 32, 32 // next 4 values (loaded into d25): weights applied to the current grain; 32 with a shift by 5 leaves lanes 2-3 unchanged
+endconst
+
+const overlap_coeffs_1, align=4
+ .short 23, 0, 0, 0 // first 4 values: only lane 0 has a non-zero weight (selected by the fguv macro when sx is 1)
+ .short 22, 32, 32, 32 // next 4 values: lane 0 gets weight 22, lanes 1-3 get 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy // Splits the random value in src into x/y offsets; each is doubled when the corresponding sx/sy flag is 0
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride // dst = src + stride * offy + 2 * offx (stride in bytes, offx in 2-byte entries)
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, lsl #1 // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1 // Sets up constants and grain pointers, then jumps into fgy_loop_neon through L(fgy_loop_tbl)
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 bytes pushed, so the stack arguments start at [sp, #100]
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH*2 // grain_lut stride
+ ldr r10, [sp, #124] // bitdepth_max
+
+ eor r4, r4, #15 // 15 - scaling_shift
+ vdup.16 q6, r10 // bitdepth_max
+ clz r10, r10
+ vdup.16 q13, r4 // 15 - scaling_shift
+ rsb r10, r10, #24 // bitdepth_min_8
+ cmp r8, #0 // flags are consumed by the beq below
+ vdup.16 q12, r10 // bitdepth_min_8
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i16 q14, #16
+ vmov.i16 q15, #235
+ vshl.s16 q14, q14, q12 // lower limit: 16 << bitdepth_min_8
+ vshl.s16 q15, q15, q12 // upper limit: 235 << bitdepth_min_8
+ b 2f
+1:
+ // no clip
+ vmov.i16 q14, #0 // lower limit: 0
+ vmov q15, q6 // upper limit: bitdepth_max
+2:
+ vshr.u16 q6, q6, #1 // grain_max
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs (replaces bitdepth_min_8 in q12)
+
+ add r5, r5, #18 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1 // bit 0 of type selects the y overlap; flags are consumed by the beq below
+ ldr r10, [r11, r10, lsl #2] // table entry for this type: loop offset relative to the table
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r8, r8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+
+ add r11, r11, r10 // r11 = address of the selected loop
+
+ beq 1f
+ // y overlap
+ vdup.16 d14, d24[0] // first-row weight for the top grain
+ vdup.16 d15, d24[1] // first-row weight for the current grain
+ mov r10, r7 // backup actual h
+ mov r7, #2 // the y overlap loop runs for 2 rows
+1:
+ sub r2, r2, #32 // src_stride -= 32
+ sub r9, r9, #32 // grain_stride -= 32
+ bx r11 // the loop pops the registers pushed above and returns to the caller
+endfunc
+
+function fgy_loop_neon // Row loops for luma grain application; entered only through L(fgy_loop_tbl) from fgy_32x32_16bpc_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB // indexed by the type argument; the label digits are ox then oy
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r6]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r8], r9 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q4, q5}, [r6], r9 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r1, :128]! // src
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+.if !\oy
+ vmvn.i16 q5, #0xf000 // 0x0fff
+.endif
+ vld1.16 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ add r4, r4, #32 // r9 is the stride minus 32 (set by the caller); add the 32 back for this pointer
+ vmull.s16 q0, d0, d24 // old grain * d24 weights
+ vmlal.s16 q0, d16, d25 // + current grain * d25 weights
+.endif
+
+.if \oy
+.if \ox
+ add r8, r8, #32 // same stride correction for the top old pointer
+ vmull.s16 q1, d2, d24 // x overlap for the top row as well
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5 // rounded shift by 5 with saturating narrow
+ vmvn d0, d12 // grain_min
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d16, d16, d12 // clamp to grain_max
+ vmin.s16 d4, d4, d12
+ vmax.s16 d16, d16, d0 // clamp to grain_min
+ vmax.s16 d4, d4, d0
+.endif
+
+ vmull.s16 q0, d4, d14 // top grain * d14
+ vmull.s16 q1, d5, d14
+ vmull.s16 q2, d6, d14
+ vmull.s16 q3, d7, d14
+ vmlal.s16 q0, d16, d15 // + current grain * d15
+ vmlal.s16 q1, d17, d15
+ vmlal.s16 q2, d18, d15
+ vmlal.s16 q3, d19, d15
+ vmull.s16 q8, d20, d15
+ vmull.s16 q9, d21, d15
+ vmull.s16 q10, d22, d15
+ vmull.s16 q11, d23, d15
+ vmlal.s16 q8, d8, d14
+ vmlal.s16 q9, d9, d14
+ vmlal.s16 q10, d10, d14
+ vmlal.s16 q11, d11, d14
+ vmvn q4, q6 // grain_min
+ vqrshrn.s32 d0, q0, #5
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q6 // clamp the blended grain to grain_max
+ vmin.s16 q9, q1, q6
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 q10, q2, q6
+ vmin.s16 q11, q3, q6
+ vmax.s16 q8, q8, q4 // clamp the blended grain to grain_min
+ vmax.s16 q9, q9, q4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vmvn.i16 q5, #0xf000 // 0x0fff
+ vmax.s16 q10, q10, q4
+ vmax.s16 q11, q11, q4
+.elseif \ox
+ vmvn d4, d12 // grain_min
+ vqrshrn.s32 d16, q0, #5 // rounded shift by 5 with saturating narrow
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 d16, d16, d12 // clamp to grain_max
+ vmax.s16 d16, d16, d4 // clamp to grain_min
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q5
+ vand q1, q1, q5
+ vand q2, q2, q5
+ vand q3, q3, q5
+
+ bl gather32_neon // scaling lookup: indices in q0-q3, byte results in q4-q5
+
+.if \ox || \oy
+ vpush {q6-q7} // q6/q7 hold grain_max and the y overlap weights but are needed as scratch below
+.endif
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+.if \ox || \oy
+ vpop {q6-q7}
+.endif
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+ vmax.s16 q0, q0, q14 // clamp to the lower limit set up by the caller
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15 // clamp to the upper limit set up by the caller
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+ subs r7, r7, #1
+.if \oy
+ vdup.16 d14, d25[0] // y overlap weights for the second row
+ vdup.16 d15, d25[1]
+.endif
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2 // r10 holds the full h saved by the caller
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0) // continue with the variant without y overlap for the remaining rows
+.endif
+ vpop {q4-q7} // undo the pushes done in fgy_32x32_16bpc_neon
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1 // Sets up constants in q12, q13, q15 and the grain pointers, then jumps into the fguv_loop_sx table for this layout
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 bytes pushed, so the stack arguments start at [sp, #100]
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r10, r11, [sp, #124] // uv, is_id
+ ldr r6, [sp, #136] // bitdepth_max
+
+ clz r7, r6
+ rsb r7, r7, #24 // bitdepth_min_8
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset
+ vld1.16 {d30[]}, [r12] // uv_luma_mult
+ lsl r10, r10, r7 // uv_offset << bitdepth_min_8
+ vld1.16 {d30[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ eor lr, lr, #15 // 15 - scaling_shift
+
+ vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8
+
+ cmp r12, #0 // flags are consumed by the beq below
+ vdup.16 q13, lr // 15 - scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0
+ mov r8, #16
+ mov r9, #240
+ lsl r8, r8, r7 // clip_min = 16 << bitdepth_min_8
+ lsl r9, r9, r7 // clip_max = 240 << bitdepth_min_8 when is_id is 0
+ beq 2f
+ // is_id
+ mov r9, #235
+ lsl r9, r9, r7 // clip_max = 235 << bitdepth_min_8 when is_id is set
+ b 2f
+1:
+ // no clip
+ mov r8, #0
+ mov r9, r6 // bitdepth_max
+2:
+ vmov.16 d30[3], r6 // bitdepth_max
+ vdup.16 d31, r8 // clip_min
+
+ mov r10, #GRAIN_WIDTH*2 // grain_lut stride
+
+.if \sy
+ mov r6, #23 // first-row y overlap weights when sy is set
+ mov r7, #22
+.else
+ mov r6, #27 // first-row y overlap weights when sy is 0
+ mov r7, #17
+.endif
+ vmov.16 d31[1], r9 // clip_max
+
+ ldrd r8, r9, [sp, #116] // offsets, h
+
+ add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+ vmov.16 d31[2], r6 // overlap y [0]
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ vmov.16 d31[3], r7 // overlap y [1]
+
+ add r4, r4, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add r11, r11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1 // bit 0 of type selects the y overlap; flags are consumed by the beq below
+ ldr lr, [r12, lr, lsl #2] // table entry for this type: loop offset relative to the table
+
+ add r12, r12, lr // r12 = address of the selected loop
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy) // the y overlap loop runs for 2 rows, or 1 row when sy is set
+
+1:
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+ sub r7, r7, #32 // luma_stride -= 32
+
+ bx r12 // the loop pops the registers pushed above and returns to the caller
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon // Row loops for chroma grain application without horizontal subsampling; entered only through L(fguv_loop_sx0_tbl)
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB // indexed by the type argument; label suffix is csfl, then ox and oy
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ sub r2, r2, #32 // src_stride -= 32
+ sub r10, r10, #32 // grain_stride -= 32
+.if \oy
+ mov r12, lr // keep the remaining h (saved in lr by the fguv entry code); lr is overwritten by the bl below
+.endif
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): // re-entry point that skips the stride adjustments above
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if \oy
+ vld1.16 {q4, q5}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+ vld1.16 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ add r4, r4, #32 // r10 is the stride minus 32; add the 32 back for this pointer
+ vmull.s16 q0, d0, d24 // old grain * d24 weights
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25 // + current grain * d25 weights
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ add r11, r11, #32 // same stride correction for the top old pointer
+ vmull.s16 q1, d2, d24 // x overlap for the top row as well
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5 // rounded shift by 5 with saturating narrow
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14 // clamp to grain_max
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12 // clamp to grain_min
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28 // top grain * d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29 // + current grain * d29
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vmull.s16 q8, d20, d29
+ vmull.s16 q9, d21, d29
+ vmull.s16 q10, d22, d29
+ vmull.s16 q11, d23, d29
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vmlal.s16 q8, d8, d28
+ vmlal.s16 q9, d9, d28
+ vmlal.s16 q10, d10, d28
+ vmlal.s16 q11, d11, d28
+ vqrshrn.s32 d0, q0, #5
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q7 // clamp the blended grain to grain_max
+ vmin.s16 q9, q1, q7
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q10, q2, q7
+ vmin.s16 q11, q3, q7
+ vmax.s16 q8, q8, q6 // clamp the blended grain to grain_min
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 q10, q10, q6
+ vmax.s16 q11, q11, q6
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5 // rounded shift by 5 with saturating narrow
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14 // clamp to grain_max
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12 // clamp to grain_min
+.endif
+
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q4, q5}, [r1, :128]! // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28 // luma * uv_luma_mult
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d8, d29 // + src * uv_mult
+ vmlal.s16 q7, d9, d29
+ vmlal.s16 q0, d10, d29
+ vmlal.s16 q1, d11, d29
+ vld1.16 {q4, q5}, [r1, :128] // src
+ sub r1, r1, #32 // undo the post-increment; src is loaded again after the gather
+ vshrn.s32 d12, q6, #6 // >> 6 and narrow to 16 bit
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+ vmlal.s16 q0, d8, d29
+ vmlal.s16 q1, d9, d29
+ vmlal.s16 q2, d10, d29
+ vmlal.s16 q3, d11, d29
+ vdup.16 q14, d30[2] // uv_offset
+ vshrn.s32 d0, q0, #6
+ vshrn.s32 d1, q1, #6
+ vshrn.s32 d2, q2, #6
+ vshrn.s32 d3, q3, #6
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14 // add uv_offset
+ vadd.i16 q7, q7, q14
+ vadd.i16 q2, q0, q14
+ vadd.i16 q3, q1, q14
+ vmin.s16 q0, q6, q4 // clamp the lookup index to at most bitdepth_max
+ vmin.s16 q1, q7, q4
+ vmin.s16 q2, q2, q4
+ vmin.s16 q3, q3, q4
+ vmax.s16 q0, q0, q5 // and to at least 0
+ vmax.s16 q1, q1, q5
+ vmax.s16 q2, q2, q5
+ vmax.s16 q3, q3, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+ vand q2, q2, q14
+ vand q3, q3, q14
+.endif
+
+ bl gather32_neon // scaling lookup: indices in q0-q3, byte results in q4-q5
+
+ vld1.16 {q0, q1}, [r1, :128]! // src
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+.if \oy
+ vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x
+.endif
+
+ vmax.s16 q0, q0, q4
+ vmax.s16 q1, q1, q4
+ vmax.s16 q2, q2, q4
+ vmax.s16 q3, q3, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+ vmin.s16 q2, q2, q5
+ vmin.s16 q3, q3, q5
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+
+ subs r9, r9, #1
+.if \oy
+ vmov.32 d31[1], lr // new coeffs for overlap y
+.endif
+
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0 // r12 holds the rows left after the y overlap rows
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) // continue with the variant without y overlap, skipping its stride adjustment
+.endif
+ b 9f // shared epilogue after the last macro expansion
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ vpop {q4-q7} // undo the pushes done in the fguv entry function
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon // chroma grain row loops; luma is averaged in horizontal pairs (vpadd/vrshr below). Register setup happens outside this view
+L(fguv_loop_sx1_tbl): // offsets of the 8 loop variants relative to this label, ordered csfl, ox, oy
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy // one loop variant; ox/oy enable blending with old (left) and top grain, csfl=0 enables the uv_luma_mult/uv_mult blend
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr // keep lr in r12 since bl below overwrites lr; r12 is used as the remaining h after this loop
+.endif
+1: // one output row of 16 pixels per iteration
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5], r10 // grain_lut
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ vmull.s16 q0, d0, d24 // old grain * d24 (first 4 pixels)
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25 // + current grain * d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ vmull.s16 q1, d2, d24 // same x blend for the top row: top old * d24
+ vmlal.s16 q1, d4, d25 // + top * d25
+ vqrshrn.s32 d16, q0, #5 // rounding shift right by 5, saturating narrow to 16 bit
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14 // clamp both x blended results to [grain_min, grain_max]
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28 // y blend: top grain * first y coeff
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29 // + current grain * second y coeff
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vqrshrn.s32 d16, q0, #5 // rounding shift right by 5, saturating narrow to 16 bit
+ vqrshrn.s32 d17, q1, #5
+ vqrshrn.s32 d18, q2, #5
+ vqrshrn.s32 d19, q3, #5
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q8, q8, q7 // clamp blended grain to [grain_min, grain_max]
+ vmin.s16 q9, q9, q7
+ vmax.s16 q8, q8, q6
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5 // x blend only: rounding shift right by 5, saturating narrow
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14 // clamp to grain_max
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12 // clamp to grain_min
+.endif
+
+ vpadd.i16 d0, d0, d1 // sum horizontally adjacent luma pairs: 32 luma pixels -> 16 sums
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vrshr.u16 q0, q0, #1 // rounded average of each pair
+ vrshr.u16 q1, q1, #1
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28 // averaged luma * uv_luma_mult
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d4, d29 // + src * uv_mult
+ vmlal.s16 q7, d5, d29
+ vmlal.s16 q0, d6, d29
+ vmlal.s16 q1, d7, d29
+ vshrn.s32 d12, q6, #6 // >> 6 and narrow back to 16 bit
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vdup.16 q14, d30[2] // uv_offset
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14 // + uv_offset
+ vadd.i16 q7, q7, q14
+ vmin.s16 q0, q6, q4 // clamp to [0, bitdepth_max]; q0-q1 are the values passed to gather16_neon
+ vmin.s16 q1, q7, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+.endif
+
+ bl gather16_neon // defined outside this view; presumably looks up scaling bytes for q0-q1 into d8-d9 (TODO confirm)
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q2, q8 // *src + noise
+ vqadd.s16 q1, q3, q9
+
+.if \oy
+ // Swap the two last coefficients of d31, place them first in d28
+ vrev64.16 d28, d31
+.endif
+
+ vmax.s16 q0, q0, q4 // clamp result to [clip_min, clip_max]
+ vmax.s16 q1, q1, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+
+ subs r9, r9, #1 // r9 counts the rows left in this loop
+.if \oy
+ // Take the first two 16 bit coefficients of d28 and place them at the
+ // end of d31
+ vtrn.32 d31, d28
+.endif
+
+ vst1.16 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0 // any rows left after the y overlap rows?
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // continue in the variant without y overlap
+.endif
+
+ b 9f // done, go to the shared epilogue
+.endm
+ fguv_loop_sx1 0, 0, 0 // instantiate all 8 variants referenced by the table above
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9: // shared epilogue; the matching pushes are outside this view
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/ipred.S b/third_party/dav1d/src/arm/32/ipred.S
new file mode 100644
index 0000000000..8c6d539a47
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/ipred.S
@@ -0,0 +1,2958 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * Copyright © 2019, B Krishnan Iyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1 // fills a width x height block with the constant 128; topleft (r2) is not read
+ push {r4, lr}
+ ldr r4, [sp, #8] // height, first stack argument (8 bytes were just pushed)
+ clz r3, r3 // clz(width): 25 for 64 ... 29 for 4
+ adr r2, L(ipred_dc_128_tbl)
+ sub r3, r3, #25 // table index 0..4 for width 64..4
+ ldr r3, [r2, r3, lsl #2]
+ vmov.i8 q0, #128 // fill value
+ add r2, r2, r3 // address of the width specific loop
+ add r12, r0, r1 // r12 = second row; r0 and r12 write alternating rows
+ lsl r1, r1, #1 // stride * 2
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl): // offsets of the per width code, widest first
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4: // width 4: four rows of 4 bytes per iteration
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+8: // width 8: four rows of 8 bytes per iteration
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+16: // width 16: four rows of 16 bytes per iteration
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320: // width 32 setup: second register of fill value
+ vmov.i8 q1, #128
+32: // four rows of 32 bytes per iteration
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640: // width 64 setup
+ vmov.i8 q1, #128
+ sub r1, r1, #32 // the first store of each row post-increments by 32 bytes
+64: // four rows of 64 bytes per iteration, two 32 byte stores per row
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1 // copies the row of width pixels at topleft+1 into every row of the block
+ push {r4, lr}
+ ldr lr, [sp, #8] // height, first stack argument (8 bytes were just pushed)
+ clz r3, r3 // clz(width): 25 for 64 ... 29 for 4
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25 // table index 0..4 for width 64..4
+ ldr r3, [r4, r3, lsl #2]
+ add r2, r2, #1 // r2 = topleft + 1, first pixel of the top row
+ add r4, r4, r3 // address of the width specific code
+ add r12, r0, r1 // r12 = second row; r0 and r12 write alternating rows
+ lsl r1, r1, #1 // stride * 2
+ bx r4
+
+ .align 2
+L(ipred_v_tbl): // offsets of the per width code, widest first
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+40: // width 4: load 4 top pixels once
+ vld1.32 {d0[]}, [r2]
+4: // four rows per iteration
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs lr, lr, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+80: // width 8: load 8 top pixels once
+ vld1.8 {d0}, [r2]
+8: // four rows per iteration
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+160: // width 16: load 16 top pixels once
+ vld1.8 {q0}, [r2]
+16: // four rows per iteration
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320: // width 32: load 32 top pixels once
+ vld1.8 {q0, q1}, [r2]
+32: // four rows per iteration
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640: // width 64: load 64 top pixels once into q0-q3
+ vld1.8 {q0, q1}, [r2]!
+ sub r1, r1, #32 // the first store of each row post-increments by 32 bytes
+ vld1.8 {q2, q3}, [r2]
+64: // four rows per iteration, two 32 byte stores per row
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1 // fills each row with one pixel read from below topleft: row 0 uses topleft[-1], row 1 uses topleft[-2], and so on
+ push {r4-r5, lr}
+ ldr r4, [sp, #12] // height, first stack argument (12 bytes were just pushed)
+ clz r3, r3 // clz(width): 25 for 64 ... 29 for 4
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25 // table index 0..4 for width 64..4
+ ldr r3, [r5, r3, lsl #2]
+ sub r2, r2, #4 // r2 = topleft - 4, the pixels for the first four rows
+ mov lr, #-4 // step back four pixels per iteration
+ add r5, r5, r3 // address of the width specific code
+ add r12, r0, r1 // r12 = second row; r0 and r12 write alternating rows
+ lsl r1, r1, #1 // stride * 2
+ bx r5
+
+ .align 2
+L(ipred_h_tbl): // offsets of the per width code, widest first
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_h_tbl) + CONFIG_THUMB
+4: // width 4: four rows per iteration
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr // four pixels, each replicated across its own register; d3 holds the highest address
+ vst1.32 {d3[0]}, [r0, :32], r1 // highest address first, so rows go top to bottom
+ vst1.32 {d2[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8: // width 8: four rows per iteration
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr // four pixels, each replicated across its own register
+ vst1.8 {d3}, [r0, :64], r1
+ vst1.8 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d1}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160: // width 16 setup: read one pixel at a time instead
+ add r2, r2, #3 // r2 = topleft - 1
+ mov lr, #-1 // step back one pixel per load
+16: // four rows per iteration
+ vld1.8 {d0[], d1[]}, [r2], lr // one pixel replicated across 16 bytes
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128], r1
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320: // width 32 setup
+ add r2, r2, #3 // r2 = topleft - 1
+ mov lr, #-1 // step back one pixel per load
+ sub r1, r1, #16 // the first store of each row post-increments by 16 bytes
+32: // four rows per iteration, two 16 byte stores per row
+ vld1.8 {d0[], d1[]}, [r2], lr // one pixel replicated across 16 bytes
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640: // width 64 setup
+ add r2, r2, #3 // r2 = topleft - 1
+ mov lr, #-1 // step back one pixel per load
+ sub r1, r1, #48 // the first three stores of each row post-increment by 16 bytes each
+64: // four rows per iteration, four 16 byte stores per row
+ vld1.8 {d0[], d1[]}, [r2], lr // one pixel replicated across 16 bytes
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1 // fills the block with the rounded average of the width pixels starting at topleft+1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12] // height, first stack argument (12 bytes were just pushed)
+ clz r3, r3 // clz(width): 25 for 64 ... 29 for 4
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25 // table index 0..4 for width 64..4
+ ldr r3, [r5, r3, lsl #2]
+ add r2, r2, #1 // r2 = topleft + 1, first pixel of the top row
+ add r5, r5, r3 // address of the width specific code
+ add r12, r0, r1 // r12 = second row; r0 and r12 write alternating rows
+ lsl r1, r1, #1 // stride * 2
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl): // offsets of the per width code, widest first
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+40: // width 4
+ vld1.32 {d0[]}, [r2] // 4 top pixels, replicated into both halves of d0
+ vpaddl.u8 d0, d0 // widen to 16 bit pair sums
+ vpadd.u16 d0, d0 // every lane now holds the sum of the 4 pixels
+ vrshrn.u16 d0, q0, #2 // rounded sum / 4, narrowed to 8 bit
+ vdup.8 d0, d0[0]
+4: // four rows per iteration
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80: // width 8
+ vld1.8 {d0}, [r2] // 8 top pixels
+ vpaddl.u8 d0, d0 // widen to 16 bit pair sums
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum of the 8 pixels
+ vrshrn.u16 d0, q0, #3 // rounded sum / 8
+ vdup.8 d0, d0[0]
+8: // four rows per iteration
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160: // width 16
+ vld1.8 {d0, d1}, [r2] // 16 top pixels
+ vaddl.u8 q0, d0, d1 // widening add of the two halves
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum of the 16 pixels
+ vrshrn.u16 d0, q0, #4 // rounded sum / 16
+ vdup.8 q0, d0[0]
+16: // four rows per iteration
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320: // width 32
+ vld1.8 {d0, d1, d2, d3}, [r2] // 32 top pixels
+ vaddl.u8 q0, d0, d1 // widening adds, then reduce to a single sum
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum of the 32 pixels
+ vrshrn.u16 d4, q0, #5 // rounded sum / 32
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+32: // four rows per iteration
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640: // width 64
+ vld1.8 {d0, d1, d2, d3}, [r2]! // first 32 top pixels
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2] // last 32 top pixels
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum of the 64 pixels
+ vrshrn.u16 d18, q0, #6 // rounded sum / 64
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+ sub r1, r1, #32 // the first store of each row post-increments by 32 bytes
+64: // four rows per iteration, two 32 byte stores per row
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1 // fills the block with the rounded average of the height pixels at topleft[-height .. -1]
+ push {r4-r5, lr}
+ ldr r4, [sp, #12] // height, first stack argument (12 bytes were just pushed)
+ sub r2, r2, r4 // r2 = topleft - height, start of the left pixels
+ clz r3, r3 // clz(width)
+ clz lr, r4 // clz(height)
+ sub lr, lr, #25 // index 0..4 of the h64..h4 entries
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20 // index 5..9 of the w64..w4 entries
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3 // r3 = store loop for this width
+ add r5, r5, lr // r5 = averaging code for this height
+ add r12, r0, r1 // r12 = second row; r0 and r12 write alternating rows
+ lsl r1, r1, #1 // stride * 2
+ bx r5 // average first; each h routine ends with bx r3
+
+ .align 2
+L(ipred_dc_left_tbl): // 5 height entries followed by 5 width entries, largest first
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4): // average of 4 left pixels, replicated across q0
+ vld1.32 {d0[]}, [r2, :32]
+ vpaddl.u8 d0, d0 // widen to 16 bit pair sums
+ vpadd.u16 d0, d0 // every lane now holds the sum
+ vrshrn.u16 d0, q0, #2 // rounded sum / 4
+ vdup.8 q0, d0[0]
+ bx r3 // continue in the width specific store loop
+L(ipred_dc_left_w4): // width 4: four rows per iteration
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8): // average of 8 left pixels, replicated across q0
+ vld1.8 {d0}, [r2, :64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum
+ vrshrn.u16 d0, q0, #3 // rounded sum / 8
+ vdup.8 q0, d0[0]
+ bx r3 // continue in the width specific store loop
+L(ipred_dc_left_w8): // width 8: four rows per iteration
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16): // average of 16 left pixels, replicated across q0
+ vld1.8 {d0, d1}, [r2, :128]
+ vaddl.u8 q0, d0, d1 // widening add of the two halves
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum
+ vrshrn.u16 d0, q0, #4 // rounded sum / 16
+ vdup.8 q0, d0[0]
+ bx r3 // continue in the width specific store loop
+L(ipred_dc_left_w16): // width 16: four rows per iteration
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w16)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32): // average of 32 left pixels, replicated across q0
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum
+ vrshrn.u16 d0, q0, #5 // rounded sum / 32
+ vdup.8 q0, d0[0]
+ bx r3 // continue in the width specific store loop
+L(ipred_dc_left_w32): // width 32
+ vmov.8 q1, q0 // second 16 bytes of the fill value
+1: // four rows per iteration
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64): // average of 64 left pixels, replicated across q0
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // every lane now holds the sum
+ vrshrn.u16 d0, q0, #6 // rounded sum / 64
+ vdup.8 q0, d0[0]
+ bx r3 // continue in the width specific store loop
+L(ipred_dc_left_w64): // width 64
+ vmov.8 q1, q0 // second 16 bytes of the fill value
+ sub r1, r1, #32 // the first store of each row post-increments by 32 bytes
+1: // four rows per iteration, two 32 byte stores per row
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1 // fills the block with the rounded average of the height left pixels and the width top pixels
+ push {r4-r6, lr}
+ ldr r4, [sp, #16] // height, first stack argument (16 bytes were just pushed)
+ sub r2, r2, r4 // r2 = topleft - height, start of the left pixels
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.16 q15, lr // width + height
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25 // index 0..4 of the h64..h4 entries
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2]
+ ldr r12, [r5, r12, lsl #2]
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3 // r3 = width specific code, entered from the h routines
+ add r5, r5, r12 // r5 = height specific summing code
+ vshr.u16 q15, q15, #1 // (width + height) >> 1
+ vdup.16 q14, lr // -ctz(width + height)
+ add r12, r0, r1 // r12 = second row; r0 and r12 write alternating rows
+ lsl r1, r1, #1 // stride * 2
+ bx r5
+
+ .align 2
+L(ipred_dc_tbl): // 5 height entries followed by 5 width entries, largest first
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4): // sum of the 4 left pixels into d0
+ vld1.32 {d0[]}, [r2, :32]! // post-increment leaves r2 at topleft
+ vpaddl.u8 d0, d0
+ add r2, r2, #1 // r2 = topleft + 1, first top pixel
+ vpadd.u16 d0, d0
+ bx r3 // continue in the width specific code
+L(ipred_dc_w4): // add the 4 top pixels, divide, store
+ vld1.32 {d1[]}, [r2]
+ vadd.s16 d0, d0, d30 // left sum + rounding term (width + height) >> 1
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1 // sum of the top pixels
+ cmp r4, #4 // square block? then the shift alone is the full division
+ vadd.s16 d0, d0, d1
+ vshl.u16 d0, d0, d28 // negative shift count: >> ctz(width + height)
+ beq 1f
+ // h = 8/16
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r5 // h = 8 uses 0x5556/2, h = 16 keeps 0x3334/2
+ vdup.16 d30, lr
+ vqdmulh.s16 d0, d0, d30 // doubling multiply high: x * 0x5556 >> 16 (about x/3) or x * 0x3334 >> 16 (about x/5)
+1:
+ vdup.8 d0, d0[0]
+2: // four rows per iteration
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8): // sum of the 8 left pixels into d0
+ vld1.8 {d0}, [r2, :64]! // post-increment leaves r2 at topleft
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ add r2, r2, #1 // r2 = topleft + 1, first top pixel
+ vpadd.u16 d0, d0
+ bx r3 // continue in the width specific code
+L(ipred_dc_w8): // add the 8 top pixels, divide, store
+ vld1.8 {d2}, [r2]
+ vadd.s16 d0, d0, d30 // left sum + rounding term
+ vpaddl.u8 d2, d2
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2 // sum of the top pixels
+ cmp r4, #8 // square block? then the shift alone is the full division
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28 // >> ctz(width + height)
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5 // h = 32 keeps 0x3334/2, h = 4/16 use 0x5556/2
+ vdup.16 d24, lr
+ vqdmulh.s16 d0, d0, d24 // multiply by the reciprocal of the remaining factor 3 or 5
+1:
+ vdup.8 d0, d0[0]
+2: // four rows per iteration
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16): // sum of the 16 left pixels into d0
+ vld1.8 {d0, d1}, [r2, :128]! // post-increment leaves r2 at topleft
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1 // r2 = topleft + 1, first top pixel
+ vpadd.u16 d0, d0
+ bx r3 // continue in the width specific code
+L(ipred_dc_w16): // add the 16 top pixels, divide, store
+ vld1.8 {d2, d3}, [r2]
+ vadd.s16 d0, d0, d30 // left sum + rounding term
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2 // sum of the top pixels
+ cmp r4, #16 // square block? then the shift alone is the full division
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28 // >> ctz(width + height)
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5 // h = 8/32 use 0x5556/2, h = 4/64 keep 0x3334/2
+ vdup.16 d24, lr
+ vqdmulh.s16 d0, d0, d24 // multiply by the reciprocal of the remaining factor 3 or 5
+1:
+ vdup.8 q0, d0[0]
+2: // four rows per iteration
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32): // sum of the 32 left pixels into d0
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]! // post-increment leaves r2 at topleft
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1 // r2 = topleft + 1, first top pixel
+ vpadd.u16 d0, d0
+ bx r3 // continue in the width specific code
+L(ipred_dc_w32): // add the 32 top pixels, divide, store
+ vld1.8 {d2, d3, d4, d5}, [r2]
+ vadd.s16 d0, d0, d30 // left sum + rounding term
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vadd.u16 q1, q1, q2
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2 // sum of the top pixels
+ cmp r4, #32 // square block? then the shift alone is the full division
+ vadd.s16 d0, d0, d2
+ vshl.u16 d4, d0, d28 // >> ctz(width + height)
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5 // h = 8 keeps 0x3334/2, h = 16/64 use 0x5556/2
+ vdup.16 d24, lr
+ vqdmulh.s16 d4, d4, d24 // multiply by the reciprocal of the remaining factor 3 or 5
+1:
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+2: // four rows per iteration
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h64): // sum of the 64 left pixels into d0
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]! // post-increment leaves r2 at topleft
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1 // r2 = topleft + 1, first top pixel
+ vpadd.u16 d0, d0
+ bx r3 // continue in the width specific code
+L(ipred_dc_w64): // add the 64 top pixels, divide, store
+ vld1.8 {d2, d3, d4, d5}, [r2]! // first 32 top pixels
+ vadd.s16 d0, d0, d30 // left sum + rounding term
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d4, d4, d5
+ vadd.u16 d2, d2, d3
+ vld1.8 {d16, d17, d18, d19}, [r2] // last 32 top pixels
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vaddl.u8 q8, d16, d17
+ vaddl.u8 q9, d18, d19
+ vadd.u16 d16, d16, d17
+ vadd.u16 d18, d18, d19
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vadd.u16 d2, d2, d4 // sum of the first 32 top pixels
+ vadd.u16 d3, d16, d18 // sum of the last 32 top pixels
+ cmp r4, #64 // square block? then the shift alone is the full division
+ vadd.s16 d0, d0, d2
+ vadd.s16 d0, d0, d3
+ vshl.u16 d18, d0, d28 // >> ctz(width + height)
+ beq 1f
+ // h = 16/32
+ movw lr, #(0x5556/2)
+ movt lr, #(0x3334/2)
+ and r5, r4, #31 // 16 for h = 16, 0 for h = 32
+ lsr lr, lr, r5 // h = 16 moves 0x3334/2 into the low half, h = 32 keeps 0x5556/2 there
+ vdup.16 d30, lr
+ vqdmulh.s16 d18, d18, d30 // multiply by the reciprocal of the remaining factor 3 or 5
+1:
+ sub r1, r1, #32 // the first store of each row post-increments by 32 bytes
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+2: // four rows per iteration, two 32 byte stores per row
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1 // per pixel, picks whichever of left, top or topleft is closest to left + top - topleft
+ push {r4-r8, lr}
+ ldr r4, [sp, #24] // height, first stack argument (24 bytes were just pushed)
+ clz lr, r3 // clz(width): 25 for 64 ... 29 for 4
+ adr r5, L(ipred_paeth_tbl)
+ sub lr, lr, #25 // table index 0..4 for width 64..4
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[], d5[]}, [r2] // q2 = topleft pixel replicated
+ add r8, r2, #1 // r8 = topleft + 1, first top pixel
+ sub r2, r2, #4 // r2 = topleft - 4, left pixels for the first four rows
+ add r5, r5, lr // address of the width specific code
+ mov r7, #-4 // step back four left pixels per iteration
+ add r6, r0, r1 // r6 = second row
+ lsl r1, r1, #1 // stride * 2
+ bx r5
+
+ .align 2
+L(ipred_paeth_tbl): // offsets of the per width code, widest first; 16, 32 and 64 share one path
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40: // width 4
+ vld1.32 {d6[], d7[]}, [r8] // q3 = the 4 top pixels, repeated four times
+ vsubl.u8 q8, d6, d4 // top - topleft
+4: // four rows per iteration
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // four left pixels, each replicated across its own register; d3 is the first row
+ vzip.32 d0, d1 // d0 = 4 bytes from d0 followed by 4 bytes from d1
+ vzip.32 d2, d3 // d2 = 4 bytes from d2 followed by 4 bytes from d3
+ vaddw.u8 q9, q8, d0 // left + top - topleft
+ vaddw.u8 q10, q8, d2
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vmov d1, d2 // q0 = left pixels for all four rows
+ vabd.u8 q10, q3, q9 // tdiff
+ vabd.u8 q11, q2, q9 // tldiff
+ vabd.u8 q9, q0, q9 // ldiff
+ vmin.u8 q12, q10, q11 // min(tdiff, tldiff)
+ vcge.u8 q10, q11, q10 // tldiff >= tdiff
+ vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff
+ vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbit q10, q0, q9 // ldiff <= min ? left : ...
+ vst1.32 {d21[1]}, [r0, :32], r1 // highest lane first, so rows go top to bottom
+ vst1.32 {d21[0]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d20[1]}, [r0, :32], r1
+ vst1.32 {d20[0]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80: // width 8
+ vld1.8 {d6}, [r8] // 8 top pixels
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6 // q3 = top pixels twice, for two rows at once
+8: // four rows per iteration
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // four left pixels, each replicated across its own register; d3 is the first row
+ vaddw.u8 q9, q8, d0 // left + top - topleft for each of the four rows
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.8 {d25}, [r0, :64], r1 // d25, d24, d23, d22 are rows 0, 1, 2, 3 of this group
+ vst1.8 {d24}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d23}, [r0, :64], r1
+ vst1.8 {d22}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160: // widths 16, 32 and 64: 8 columns x 4 rows per inner iteration
+320:
+640:
+ vld1.8 {d6}, [r8]! // first 8 top pixels
+ mov r12, r3 // keep width for rewinding at the end of each row group
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1 // stride * 4
+ sub r1, r1, r3 // minus width, since the stores post-increment across the row
+1: // outer loop: next group of four rows
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // four left pixels, each replicated across its own register; d3 is the first row
+2: // inner loop: next 8 columns
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6 // q3 = top pixels twice, for two rows at once
+ vaddw.u8 q9, q8, d0 // left + top - topleft for each of the four rows
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8 // 8 columns done
+ vst1.8 {d25}, [r0, :64]! // d25, d24, d23, d22 are rows 0, 1, 2, 3 of this group
+ vst1.8 {d24}, [r6, :64]!
+ vst1.8 {d23}, [r5, :64]!
+ vst1.8 {d22}, [lr, :64]!
+ ble 8f
+ vld1.8 {d6}, [r8]! // next 8 top pixels
+ b 2b
+8: // end of a group of four rows
+ subs r4, r4, #4
+ ble 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub r8, r8, r12 // rewind the top pointer by width
+ add r0, r0, r1
+ add r6, r6, r1
+ vld1.8 {d6}, [r8]! // reload the first 8 top pixels
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12 // reset the column counter to width
+ b 1b
+9:
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1 // blends left/right by weights_hor and top/bottom by weights_ver, then averages the two
+ push {r4-r10, lr}
+ ldr r4, [sp, #32] // r4 = height (first stack argument, above the 32 bytes just pushed)
+ movrel r10, X(sm_weights)
+ add r12, r10, r4 // r12 = sm_weights + height (weights_ver)
+ add r10, r10, r3 // r10 = sm_weights + width (weights_hor)
+ clz r9, r3 // clz(width) selects the width specific code path
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4 // lr = topleft - height, address of the bottom pixel
+ sub r9, r9, #25 // table index: width 64 -> 0 ... width 4 -> 4
+ ldr r9, [r5, r9, lsl #2] // handler offset relative to the table
+ vld1.8 {d4[]}, [lr] // bottom
+ add r8, r2, #1 // r8 = topleft + 1, start of the top row
+ add r5, r5, r9 // absolute address of the handler
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r5
+
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB // width 64
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB // width 32
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB // width 16
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB // width 8
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB // width 4
+
+40:
+ vld1.32 {d16[]}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #4 // r2 = topleft - 4: four left pixels are read per iteration
+ mov r7, #-4 // the left pointer moves down by 4 after each load
+ vdup.8 q3, d16[3] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+4: // loop over the rows, four rows of 4 pixels per iteration
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vzip.32 d1, d0 // left, flipped
+ vzip.32 d3, d2
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q0, d1, d6 // left-right
+ vsubl.u8 q1, d3, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q0, q9 // (left flipped)
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14 // (horizontal + vertical) >> 1
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d25, q13, #8
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r6, :32], r1
+ subs r4, r4, #4 // four rows done
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80:
+ vld1.8 {d16}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #2 // r2 = topleft - 2: two left pixels are read per iteration
+ mov r7, #-2 // the left pointer moves down by 2 after each load
+ vdup.8 q3, d16[7] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+8: // loop over the rows, two rows of 8 pixels per iteration
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14 // (horizontal + vertical) >> 1
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d25, q13, #8
+ subs r4, r4, #2 // two rows done
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160:
+320:
+640:
+ add lr, r2, r3 // lr = topleft + width, address of the right pixel
+ sub r2, r2, #2 // r2 = topleft - 2: two left pixels are read per iteration
+ mov r7, #-2 // the left pointer moves down by 2 after each load
+ vld1.8 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3 // r1 = 2*stride - width, the inner loop advances r0/r6 by width
+ mov r9, r3 // keep width for restarting the inner loop
+
+1: // outer loop, two rows per iteration
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2: // inner loop, 8 pixels of both rows per iteration
+ vld1.8 {d16}, [r8]! // top
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14 // (horizontal + vertical) >> 1
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d25, q13, #8
+ subs r3, r3, #8 // 8 columns done
+ vst1.8 {d24}, [r0, :64]!
+ vst1.8 {d25}, [r6, :64]!
+ bgt 2b
+ subs r4, r4, #2 // two rows done
+ ble 9f
+ sub r8, r8, r9 // rewind the top pointer by width
+ sub r10, r10, r9 // rewind the weights_hor pointer by width
+ add r0, r0, r1 // move both dst pointers down two rows
+ add r6, r6, r1
+ mov r3, r9 // restore the column counter
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1 // blends the top row with the bottom pixel using weights_ver
+ push {r4-r7, lr}
+ ldr r4, [sp, #20] // r4 = height (first stack argument, above the 20 bytes just pushed)
+ movrel r7, X(sm_weights)
+ add r7, r7, r4 // r7 = sm_weights + height (weights_ver)
+ clz lr, r3 // clz(width) selects the width specific code path
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4 // r12 = topleft - height, address of the bottom pixel
+ sub lr, lr, #25 // table index: width 64 -> 0 ... width 4 -> 4
+ ldr lr, [r5, lr, lsl #2] // handler offset relative to the table
+ vld1.8 {d4[]}, [r12] // bottom
+ add r2, r2, #1 // r2 = topleft + 1, start of the top row
+ add r5, r5, lr // absolute address of the handler
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r5
+
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB // width 64
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB // width 32
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB // width 16
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB // width 8
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB // width 4
+
+40:
+ vld1.32 {d6[]}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+4: // loop over the rows, four rows of 4 pixels per iteration
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vshll.i8 q10, d4, #8 // bottom*256
+ vshll.i8 q11, d4, #8
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ subs r4, r4, #4 // four rows done
+ vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q11, q3, q9
+ vrshrn.i16 d20, q10, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d21, q11, #8
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r6, :32], r1
+ vst1.32 {d21[0]}, [r0, :32], r1
+ vst1.32 {d21[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80:
+ vld1.8 {d6}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+8: // loop over the rows, four rows of 8 pixels per iteration
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.i8 q12, d4, #8 // bottom*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q3, q9
+ vmla.i16 q14, q3, q10
+ vmla.i16 q15, q3, q11
+ vrshrn.i16 d24, q12, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4 // four rows done
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160:
+320:
+640:
+ vpush {q4-q7} // q4-q7 hold the vertical weights below, restored at 9:
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1 // r1 is already 2*stride here, so r5 = row 2
+ add lr, r6, r1 // lr = row 3
+ lsl r1, r1, #1 // r1 = 4*stride
+ sub r1, r1, r3 // minus width, the inner loop advances the pointers by width
+ mov r12, r3 // keep width for restarting the inner loop
+
+1: // outer loop, four rows per iteration
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vmovl.u8 q4, d8 // weights_ver
+ vmovl.u8 q5, d10
+ vmovl.u8 q6, d12
+ vmovl.u8 q7, d14
+2: // inner loop, 16 pixels of all four rows per iteration
+ vld1.8 {q3}, [r2]! // top
+ vshll.i8 q8, d4, #8 // bottom*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vsubl.u8 q0, d6, d4 // top-bottom
+ vsubl.u8 q1, d7, d4
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q9, q1, q4
+ vmla.i16 q10, q0, q5
+ vmla.i16 q11, q1, q5
+ vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q1, q6
+ vmla.i16 q14, q0, q7
+ vmla.i16 q15, q1, q7
+ vrshrn.i16 d16, q8, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16 // 16 columns done
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4 // four rows done
+ ble 9f
+ sub r2, r2, r12 // rewind the top pointer by width
+ add r0, r0, r1 // move all four dst pointers down four rows
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12 // restore the column counter
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1 // blends the left column with the right pixel using weights_hor
+ push {r4-r8, lr}
+ ldr r4, [sp, #24] // r4 = height (first stack argument, above the 24 bytes just pushed)
+ movrel r8, X(sm_weights)
+ add r8, r8, r3 // r8 = sm_weights + width (weights_hor)
+ clz lr, r3 // clz(width) selects the width specific code path
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3 // r12 = topleft + width, address of the right pixel
+ sub lr, lr, #25 // table index: width 64 -> 0 ... width 4 -> 4
+ ldr lr, [r5, lr, lsl #2] // handler offset relative to the table
+ vld1.8 {d4[]}, [r12] // right
+ add r5, r5, lr // absolute address of the handler
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r5
+
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB // width 64
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB // width 32
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB // width 16
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB // width 8
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB // width 4
+
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #4 // r2 = topleft - 4: four left pixels are read per iteration
+ mov r7, #-4 // the left pointer moves down by 4 after each load
+ vmovl.u8 q3, d6 // weights_hor
+4: // loop over the rows, four rows of 4 pixels per iteration
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vzip.32 d3, d2 // left, flipped
+ vzip.32 d1, d0
+ vsubl.u8 q1, d3, d4 // left-right
+ vsubl.u8 q0, d1, d4
+ subs r4, r4, #4 // four rows done
+ vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q0, q3
+ vrshrn.i16 d16, q8, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d17, q9, #8
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #4 // r2 = topleft - 4: four left pixels are read per iteration
+ mov r7, #-4 // the left pointer moves down by 4 after each load
+ vmovl.u8 q3, d6 // weights_hor
+8: // loop over the rows, four rows of 8 pixels per iteration
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left
+ vshll.i8 q12, d4, #8 // right*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q11, d22, d4 // left-right
+ vsubl.u8 q10, d20, d4
+ vsubl.u8 q9, d18, d4
+ vsubl.u8 q8, d16, d4
+ vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q10, q3 // (left flipped)
+ vmla.i16 q14, q9, q3
+ vmla.i16 q15, q8, q3
+ vrshrn.i16 d24, q12, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4 // four rows done
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vpush {q4-q7} // q4-q7 hold left-right below, restored at 9:
+ sub r2, r2, #4 // r2 = topleft - 4: four left pixels are read per iteration
+ mov r7, #-4 // the left pointer moves down by 4 after each load
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1 // r1 is already 2*stride here, so r5 = row 2
+ add lr, r6, r1 // lr = row 3
+ lsl r1, r1, #1 // r1 = 4*stride
+ sub r1, r1, r3 // minus width, the inner loop advances the pointers by width
+ mov r12, r3 // keep width for restarting the inner loop
+
+1: // outer loop, four rows per iteration
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left
+ vsubl.u8 q4, d8, d4 // left-right
+ vsubl.u8 q5, d10, d4
+ vsubl.u8 q6, d12, d4
+ vsubl.u8 q7, d14, d4
+2: // inner loop, 16 pixels of all four rows per iteration
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vmovl.u8 q0, d2 // weights_hor
+ vmovl.u8 q1, d3
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q7, q1 // (left flipped)
+ vmla.i16 q10, q6, q0
+ vmla.i16 q11, q6, q1
+ vmla.i16 q12, q5, q0
+ vmla.i16 q13, q5, q1
+ vmla.i16 q14, q4, q0
+ vmla.i16 q15, q4, q1
+ vrshrn.i16 d16, q8, #8 // rounding shift by 8, narrowed to 8 bit
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16 // 16 columns done
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4 // four rows done
+ ble 9f
+ sub r8, r8, r12 // rewind the weights_hor pointer by width
+ add r0, r0, r1 // move all four dst pointers down four rows
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12 // restore the column counter
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1 // predicts 4x2 blocks as a 7 tap filter of topleft, top[0-3] and left[0-1]
+ push {r4-r8, lr}
+ movw r12, #511
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = filt_idx (stack arguments above the 24 bytes just pushed)
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6 // 64 bytes of taps per filter index
+ add r6, r6, r5 // r6 = taps of the selected filter
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]! // filter(0) - filter(3), 8 bytes each
+ clz lr, r3 // clz(width) selects the width specific code path
+ adr r5, L(ipred_filter_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64] // filter(4) - filter(6), 8 bytes each
+ sub lr, lr, #26 // table index: width 32 -> 0 ... width 4 -> 3
+ ldr lr, [r5, lr, lsl #2] // handler offset relative to the table
+ vmovl.s8 q8, d20 // q8 = filter(0), sign extended to 16 bit
+ vmovl.s8 q9, d21 // q9 = filter(1)
+ add r5, r5, lr // absolute address of the handler
+ vmovl.s8 q10, d22 // q10 = filter(2)
+ vmovl.s8 q11, d23 // q11 = filter(3)
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ vmovl.s8 q12, d27 // q12 = filter(4)
+ vmovl.s8 q13, d28 // q13 = filter(5)
+ vmovl.s8 q14, d29 // q14 = filter(6)
+ add r8, r2, #1 // r8 = topleft + 1, start of the top row
+ sub r2, r2, #2 // r2 = topleft - 2: left[1], left[0], topleft are read per iteration
+ mov r7, #-2 // the left pointer moves down by 2 after each load
+ bx r5
+
+ .align 2
+L(ipred_filter_tbl):
+ .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB // width 32
+ .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB // width 16
+ .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB // width 8
+ .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB // width 4
+
+40:
+ vld1.32 {d0[]}, [r8] // top (0-3)
+ vmovl.u8 q0, d0 // top (0-3)
+4: // loop over the rows, one 4x2 block per iteration
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d4, q2, #4 // rounding shift by 4, saturated to unsigned 8 bit
+ subs r4, r4, #2 // two rows done
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vmovl.u8 q0, d4 // widen the output, it provides the next top row
+ vst1.32 {d4[1]}, [r6, :32], r1
+ vmov d0, d1 // move top from [4-7] to [0-3]
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d0}, [r8] // top (0-7)
+ vmovl.u8 q0, d0 // top (0-7)
+8: // loop over the rows, two 4x2 blocks per iteration
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d4, q2, #4 // first block: rounding shift by 4, saturated to 8 bit
+ vmovl.u8 q1, d4 // first block, in 16 bit
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d5, q3, #4 // second block: rounding shift by 4, saturated to 8 bit
+ vzip.32 d4, d5 // regroup the two blocks so d4 = first row, d5 = second row
+ subs r4, r4, #2 // two rows done
+ vst1.8 {d4}, [r0, :64], r1
+ vmovl.u8 q0, d5 // the second output row is the next top row
+ vst1.8 {d5}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+ vpush {q4-q5} // q4-q5 are used as accumulators below, restored at 9:
+ sub r1, r1, r3 // r1 = 2*stride - width, the inner loop advances r0/r6 by width
+ mov lr, r3 // keep width for restarting the inner loop
+
+1: // outer loop, two rows per iteration
+ vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmovl.u8 q0, d0 // left (0-1) + topleft (2)
+2: // inner loop, four 4x2 blocks (16 pixels wide) per iteration
+ vld1.8 {q2}, [r8]! // top(0-15)
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmovl.u8 q1, d4 // top(0-7)
+ vmovl.u8 q2, d5 // top(8-15)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d6, q3, #4 // first block: rounding shift by 4, saturated to 8 bit
+ vmovl.u8 q0, d6 // first block, in 16 bit
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d7, q4, #4 // second block: rounding shift by 4, saturated to 8 bit
+ vmovl.u8 q0, d7 // second block, in 16 bit
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d8, q5, #4 // third block: rounding shift by 4, saturated to 8 bit
+ vmovl.u8 q0, d8 // third block, in 16 bit
+ vmov.u8 r12, d5[6] // last top pixel of this 16 pixel chunk (low byte of the last 16 bit lane)
+ vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.8 d0[4], r12 // ... becomes topleft (16 bit lane 2 of d0) for the next chunk
+
+ subs r3, r3, #16 // 16 columns done
+ vqrshrun.s16 d9, q15, #4 // fourth block: rounding shift by 4, saturated to 8 bit
+
+ vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]! // first row of all four blocks
+ vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]! // second row of all four blocks
+ ble 8f
+ vmov.u8 r12, d9[7] // last pixel of the second output row
+ vmov.8 d0[0], r12 // ... becomes left[1] for the next chunk
+ vmov.u8 r12, d9[3] // last pixel of the first output row
+ vmov.8 d0[2], r12 // ... becomes left[0] for the next chunk
+ b 2b
+8:
+ subs r4, r4, #2 // two rows done
+
+ ble 9f
+ sub r8, r6, lr // next top row = start of the second row just written
+ add r0, r0, r1 // move both dst pointers down two rows
+ add r6, r6, r1
+ mov r3, lr // restore the column counter
+ b 1b
+9:
+ vpop {q4-q5}
+ pop {r4-r8, pc}
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_8bpc_neon, export=1 // expands packed 4 bit palette indices from idx into pixels looked up in pal
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #12] // r4 = w, r5 = h (stack arguments above the 12 bytes just pushed)
+ vld1.8 {d0}, [r2, :64] // d0 = 8 palette entries
+ clz lr, r4 // clz(w) selects the width specific code path
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25 // table index: w 64 -> 0 ... w 4 -> 4
+ vmov.i8 q15, #7 // mask for the index held in the low nibble
+ ldr lr, [r12, lr, lsl #2] // handler offset relative to the table
+ add r12, r12, lr // absolute address of the handler
+ add r2, r0, r1 // r2 = dst + stride (second output row)
+ bx r12
+
+ .align 2
+L(pal_pred_tbl):
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB // w 64
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB // w 32
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB // w 16
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB // w 8
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB // w 4
+
+40:
+ lsl r1, r1, #1 // stride *= 2, r0 and r2 each step over two rows
+4: // 8 idx bytes = 16 pixels = four rows of 4 per iteration
+ vld1.8 {d2}, [r3, :64]!
+ subs r5, r5, #4 // four rows done
+ vshr.u8 d3, d2, #4 // index from the high nibble
+ vand.u8 d2, d2, d30 // index from the low nibble
+ vzip.8 d2, d3 // interleave: low nibble index first, then high nibble index
+ vtbl.8 d2, {d0}, d2 // palette lookup
+ vtbl.8 d3, {d0}, d3
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d2[1]}, [r2, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d3[1]}, [r2, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ lsl r1, r1, #1 // stride *= 2, r0 and r2 each step over two rows
+8: // 16 idx bytes = 32 pixels = four rows of 8 per iteration
+ vld1.8 {q1}, [r3, :64]!
+ subs r5, r5, #4 // four rows done
+ vshr.u8 q2, q1, #4 // index from the high nibble
+ vand.u8 q1, q1, q15 // index from the low nibble
+ vzip.8 q1, q2 // interleave: low nibble index first, then high nibble index
+ vtbl.8 d2, {d0}, d2 // palette lookup
+ vtbl.8 d3, {d0}, d3
+ vst1.8 {d2}, [r0, :64], r1
+ vtbl.8 d4, {d0}, d4
+ vst1.8 {d3}, [r2, :64], r1
+ vtbl.8 d5, {d0}, d5
+ vst1.8 {d4}, [r0, :64], r1
+ vst1.8 {d5}, [r2, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ lsl r1, r1, #1 // stride *= 2, r0 and r2 each step over two rows
+16: // 32 idx bytes = 64 pixels = four rows of 16 per iteration
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #4 // four rows done
+ vand.u8 q8, q10, q15 // index from the low nibble
+ vshr.u8 q9, q10, #4 // index from the high nibble
+ vand.u8 q10, q11, q15
+ vshr.u8 q11, q11, #4
+ vzip.8 q8, q9 // interleave: low nibble index first, then high nibble index
+ vzip.8 q10, q11
+ vtbl.8 d16, {d0}, d16 // palette lookup
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vst1.8 {q9}, [r2, :128], r1
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10}, [r0, :128], r1
+ vst1.8 {q11}, [r2, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ lsl r1, r1, #1 // stride *= 2, r0 and r2 each step over two rows
+32: // 32 idx bytes = 64 pixels = two rows of 32 per iteration
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #2 // two rows done
+ vand.u8 q8, q10, q15 // index from the low nibble
+ vshr.u8 q9, q10, #4 // index from the high nibble
+ vand.u8 q10, q11, q15
+ vshr.u8 q11, q11, #4
+ vzip.8 q8, q9 // interleave: low nibble index first, then high nibble index
+ vzip.8 q10, q11
+ vtbl.8 d16, {d0}, d16 // palette lookup
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r2, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #32 // the first store of each row already advances r0 by 32
+64: // 32 idx bytes = 64 pixels = one row of 64 per iteration
+ vld1.8 {q10, q11}, [r3, :64]!
+ subs r5, r5, #1 // one row done
+ vand.u8 q8, q10, q15 // index from the low nibble
+ vshr.u8 q9, q10, #4 // index from the high nibble
+ vand.u8 q10, q11, q15
+ vshr.u8 q11, q11, #4
+ vzip.8 q8, q9 // interleave: low nibble index first, then high nibble index
+ vzip.8 q10, q11
+ vtbl.8 d16, {d0}, d16 // palette lookup
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128]!
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1 // dst = clip(128 + scaled ac*alpha); the splat code below is shared by the other cfl entry points
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac (stack arguments above the 24 bytes just pushed)
+ ldr r6, [sp, #32] // r6 = alpha
+ clz lr, r3 // clz(width) selects the width specific code path
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26 // table index: width 32 -> 0 ... width 4 -> 3
+ ldr lr, [r12, lr, lsl #2] // handler offset relative to the table
+ vmov.i16 q0, #128 // dc
+ vdup.i16 q1, r6 // alpha
+ add r12, r12, lr // absolute address of the handler
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width 32, handled by the w16 loop
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width 16
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width 8
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width 4
+
+L(ipred_cfl_splat_w4): // expects q0 = dc, q1 = alpha, r4 = height, r5 = ac; four rows of 4 per iteration
+ vld1.16 {q2, q3}, [r5, :128]!
+ vmul.i16 q2, q2, q1 // diff = ac * alpha
+ vmul.i16 q3, q3, q1
+ vshr.s16 q8, q2, #15 // sign = diff >> 15
+ vshr.s16 q9, q3, #15
+ vadd.i16 q2, q2, q8 // diff + sign
+ vadd.i16 q3, q3, q9
+ vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q3, q3, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d5, q3
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vst1.32 {d4[1]}, [r6, :32], r1
+ subs r4, r4, #4 // four rows done
+ vst1.32 {d5[0]}, [r0, :32], r1
+ vst1.32 {d5[1]}, [r6, :32], r1
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8): // same register contract as w4; four rows of 8 per iteration
+ vld1.16 {q8, q9}, [r5, :128]!
+ vld1.16 {q10, q11}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vst1.8 {d16}, [r0, :64], r1
+ vst1.8 {d17}, [r6, :64], r1
+ subs r4, r4, #4 // four rows done
+ vst1.8 {d18}, [r0, :64], r1
+ vst1.8 {d19}, [r6, :64], r1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16): // same register contract as w4, plus r3 = width; two rows, 16 columns per iteration
+ add r12, r5, r3, lsl #1 // r12 = second row of ac (width 16 bit entries further)
+ sub r1, r1, r3 // r1 = 2*stride - width, the loop advances r0/r6 by width
+ mov lr, r3 // keep width for restarting the column loop
+1:
+ vld1.16 {q8, q9}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vld1.16 {q10, q11}, [r12, :128]!
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ subs r3, r3, #16 // 16 columns done
+ vst1.16 {q8}, [r0, :128]!
+ vst1.16 {q9}, [r6, :128]!
+ bgt 1b
+ subs r4, r4, #2 // two rows done; the adds below leave the flags for the final bgt
+ add r5, r5, lr, lsl #1 // skip the ac row that r12 just consumed
+ add r12, r12, lr, lsl #1 // skip the ac row that r5 will consume next
+ add r0, r0, r1 // move both dst pointers down two rows
+ add r6, r6, r1
+ mov r3, lr // restore the column counter
+ bgt 1b
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1 // dc = rounded average of the top row, then continues in the shared cfl splat code
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac (stack arguments above the 24 bytes just pushed)
+ ldr r6, [sp, #32] // r6 = alpha
+ clz lr, r3 // clz(width) selects the width specific code path
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26 // table index: width 32 -> 0 ... width 4 -> 3
+ ldr lr, [r12, lr, lsl #2] // handler offset relative to the table
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #1 // r2 = topleft + 1, start of the top row
+ add r12, r12, lr // absolute address of the handler
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl):
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB // width 32
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB // width 16
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB // width 8
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB // width 4
+
+4:
+ vld1.32 {d0[]}, [r2] // 4 top pixels, replicated into both halves of d0
+ vpaddl.u8 d0, d0 // pairwise sums, widened to 16 bit
+ vpadd.u16 d0, d0 // sum of the 4 pixels
+ vrshr.u16 d0, d0, #2 // (sum + 2) >> 2
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w4)
+8:
+ vld1.8 {d0}, [r2] // 8 top pixels
+ vpaddl.u8 d0, d0 // pairwise sums, widened to 16 bit
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // sum of the 8 pixels
+ vrshr.u16 d0, d0, #3 // (sum + 4) >> 3
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w8)
+16:
+ vld1.8 {q0}, [r2] // 16 top pixels
+ vaddl.u8 q0, d0, d1 // add both halves, widened to 16 bit
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // sum of the 16 pixels
+ vrshr.u16 d0, d0, #4 // (sum + 8) >> 4
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w16)
+32:
+ vld1.8 {q2, q3}, [r2] // 32 top pixels
+ vaddl.u8 q2, d4, d5 // add halves, widened to 16 bit
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // sum of the 32 pixels
+ vrshr.u16 d0, d0, #5 // (sum + 16) >> 5
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1 // dc = rounded average of the left column, then continues in the shared cfl splat code
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac (stack arguments above the 24 bytes just pushed)
+ ldr r6, [sp, #32] // r6 = alpha
+ sub r2, r2, r4 // r2 = topleft - height, lowest address of the left column
+ clz lr, r3 // clz(width) selects the splat handler
+ clz r8, r4 // clz(height) selects the dc summing handler
+ adr r12, L(ipred_cfl_splat_tbl)
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26 // table index: width 32 -> 0 ... width 4 -> 3
+ sub r8, r8, #26 // table index: height 32 -> 0 ... height 4 -> 3
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr // r12 = splat handler for this width, entered via bx r12 below
+ add r7, r7, r8 // r7 = dc handler for this height
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl):
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB // height 32
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB // height 16
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB // height 8
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB // height 4
+
+L(ipred_cfl_left_h4):
+ vld1.32 {d0[]}, [r2, :32] // 4 left pixels, replicated into both halves of d0
+ vpaddl.u8 d0, d0 // pairwise sums, widened to 16 bit
+ vpadd.u16 d0, d0 // sum of the 4 pixels
+ vrshr.u16 d0, d0, #2 // (sum + 2) >> 2
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ bx r12
+
+L(ipred_cfl_left_h8):
+ vld1.8 {d0}, [r2, :64] // 8 left pixels
+ vpaddl.u8 d0, d0 // pairwise sums, widened to 16 bit
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // sum of the 8 pixels
+ vrshr.u16 d0, d0, #3 // (sum + 4) >> 3
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.8 {q0}, [r2, :128] // 16 left pixels
+ vaddl.u8 q0, d0, d1 // add both halves, widened to 16 bit
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // sum of the 16 pixels
+ vrshr.u16 d0, d0, #4 // (sum + 8) >> 4
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.8 {q2, q3}, [r2, :128] // 32 left pixels
+ vaddl.u8 q2, d4, d5 // add halves, widened to 16 bit
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0 // sum of the 32 pixels
+ vrshr.u16 d0, d0, #5 // (sum + 16) >> 5
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ bx r12
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1 // dc = rounded average of left column and top row, then continues in the shared cfl splat code
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac (stack arguments above the 24 bytes just pushed)
+ ldr r6, [sp, #32] // r6 = alpha
+ sub r2, r2, r4 // r2 = topleft - height, lowest address of the left column
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3 // clz(width) selects the top row handler
+ clz r6, r4 // clz(height) selects the left column handler
+ vdup.16 d16, r8 // width + height
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26 // table index: height 32 -> 0 ... height 4 -> 3
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr // r12 = width handler, entered via bx r12 from the height handler
+ add r7, r7, r6 // r7 = height handler
+ vshr.u16 d16, d16, #1 // (width + height) >> 1
+ vdup.16 d17, r8 // -ctz(width + height)
+ add r6, r0, r1 // r6 = dst + stride (second output row)
+ lsl r1, r1, #1 // stride *= 2, r0 and r6 each step over two rows
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl):
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB // height 32
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB // height 16
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB // height 8
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB // height 4
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB // width 32
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB // width 16
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB // width 8
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB // width 4
+
+L(ipred_cfl_h4):
+ vld1.32 {d0[]}, [r2, :32]! // 4 left pixels; r2 now points at topleft
+ vpaddl.u8 d0, d0 // pairwise sums, widened to 16 bit
+ add r2, r2, #1 // skip topleft, r2 = start of the top row
+ vpadd.i16 d0, d0 // sum of the left column
+ bx r12
+L(ipred_cfl_w4):
+ vld1.32 {d1[]}, [r2] // 4 top pixels
+ vadd.i16 d0, d0, d16 // left sum + rounding term (width + height) >> 1
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1 // sum of the top row
+ cmp r4, #4
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17 // shift left by -ctz = right shift by ctz(width + height)
+ beq 1f // square block: the shift alone is the full division
+ // h = 8/16
+ movw lr, #(0x3334/2) // 0x3334/0x10000 ~ 1/5; halved because vqdmulh doubles the product
+ movw r8, #(0x5556/2) // 0x5556/0x10000 ~ 1/3
+ cmp r4, #16
+ it ne
+ movne lr, r8 // h = 8 -> 1/3 (4+8 = 4*3), h = 16 -> 1/5 (4+16 = 4*5)
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18 // multiply by the reciprocal of the remaining odd factor
+1:
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ vld1.8 {d0}, [r2, :64]! // 8 left pixels; r2 now points at topleft
+ vpaddl.u8 d0, d0 // pairwise sums, widened to 16 bit
+ vpadd.i16 d0, d0
+ add r2, r2, #1 // skip topleft, r2 = start of the top row
+ vpadd.i16 d0, d0 // sum of the left column
+ bx r12
+L(ipred_cfl_w8):
+ vld1.8 {d1}, [r2] // 8 top pixels
+ vadd.i16 d0, d0, d16 // left sum + rounding term (width + height) >> 1
+ vpaddl.u8 d1, d1
+ vpadd.i16 d1, d1
+ vpadd.i16 d1, d1 // sum of the top row
+ cmp r4, #8
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17 // shift left by -ctz = right shift by ctz(width + height)
+ beq 1f // square block: the shift alone is the full division
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2) // 0x3334/0x10000 ~ 1/5; halved because vqdmulh doubles the product
+ movw r8, #(0x5556/2) // 0x5556/0x10000 ~ 1/3
+ it ne
+ movne lr, r8 // h = 4/16 -> 1/3, h = 32 -> 1/5
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18 // multiply by the reciprocal of the remaining odd factor
+1:
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ vld1.8 {q0}, [r2, :128]! // 16 left pixels; r2 now points at topleft
+ vaddl.u8 q0, d0, d1 // add both halves, widened to 16 bit
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ add r2, r2, #1 // skip topleft, r2 = start of the top row
+ vpadd.i16 d0, d0 // sum of the left column
+ bx r12
+L(ipred_cfl_w16):
+ vld1.8 {q2}, [r2] // 16 top pixels
+ vadd.i16 d0, d0, d16 // left sum + rounding term (width + height) >> 1
+ vaddl.u8 q2, d4, d5
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4 // sum of the top row
+ cmp r4, #16
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17 // shift left by -ctz = right shift by ctz(width + height)
+ beq 1f // square block: the shift alone is the full division
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2) // 0x3334/0x10000 ~ 1/5; halved because vqdmulh doubles the product
+ movw r8, #(0x5556/2) // 0x5556/0x10000 ~ 1/3
+ it ne
+ movne lr, r8 // h = 8/32 -> 1/3, h = 4/64 -> 1/5
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18 // multiply by the reciprocal of the remaining odd factor
+1:
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ vld1.8 {q2, q3}, [r2, :128]! // 32 left pixels; r2 now points at topleft
+ vaddl.u8 q2, d4, d5 // add halves, widened to 16 bit
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ add r2, r2, #1 // skip topleft, r2 = start of the top row
+ vpadd.i16 d0, d0 // sum of the left column
+ bx r12
+L(ipred_cfl_w32):
+ vld1.8 {q2, q3}, [r2] // 32 top pixels
+ vadd.i16 d0, d0, d16 // left sum + rounding term (width + height) >> 1
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q2, q2, q3
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4 // sum of the top row
+ cmp r4, #32
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17 // shift left by -ctz = right shift by ctz(width + height)
+ beq 1f // square block: the shift alone is the full division
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2) // 0x3334/0x10000 ~ 1/5; halved because vqdmulh doubles the product
+ movw r8, #(0x5556/2) // 0x5556/0x10000 ~ 1/3
+ it ne
+ movne lr, r8 // h = 16/64 -> 1/3, h = 8 -> 1/5
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18 // multiply by the reciprocal of the remaining odd factor
+1:
+ vdup.16 q0, d0[0] // dc in every lane of q0
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl):
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d2}, [r12, :64], r2
+ vld1.8 {d1}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #2
+ vst1.16 {q0}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d1
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q8, q8, q1
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ vadd.i16 q0, q8, q9
+ vadd.i16 q1, q10, q11
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
+ vadd.i32 q0, q1
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+L(ipred_cfl_ac_420_w4_subtract_dc):
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q2, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d1}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q12, q13}, [r1, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q12, q12
+ vpaddl.u8 q13, q13
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ vshl.i16 q2, q12, #1
+ vshl.i16 q3, q13, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vldr d26, [r1, #16]
+ vpaddl.u8 q0, q0
+ vld1.8 {q12}, [r1, :128], r2
+ vpaddl.u8 d6, d6
+ vldr d30, [r12, #16]
+ vpaddl.u8 q2, q2
+ vld1.8 {q14}, [r12, :128], r2
+ vpaddl.u8 d26, d26
+ vpaddl.u8 q12, q12
+ vpaddl.u8 d30, d30
+ vpaddl.u8 q14, q14
+ vadd.i16 d2, d2, d6
+ vadd.i16 q0, q0, q2
+ vadd.i16 d26, d26, d30
+ vadd.i16 q12, q12, q14
+ vshl.i16 d2, d2, #1
+ vshl.i16 q0, q0, #1
+ vshl.i16 d6, d26, #1
+ vshl.i16 q2, q12, #1
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q2, q2, #1
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d4}, [r1, :64], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {d5}, [r12, :64], r2
+ vpaddl.u8 q2, q2
+ vadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d5
+ vshl.i16 d0, d0, #1
+ vshl.i16 d4, d4, #1
+ vdup.16 q1, d0[3]
+ vdup.16 q3, d4[3]
+ vdup.16 d1, d0[3]
+ vdup.16 d5, d4[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
+
+// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1 // Stores horizontally pair-summed, x4-scaled 8-bit input rows to ac as int16, then jumps into the shared 420 tail that subtracts their rounded mean
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24] // r4, r5 = first two stack arguments (h_pad, cw per the prototype above), past the 24 bytes just pushed
+ ldr r6, [sp, #32] // r6 = third stack argument (ch per the prototype above)
+ clz r8, r5 // clz(width)
+ lsl r4, r4, #2 // h_pad * 4; subtracted from the row count below, so h_pad appears to be in units of 4 rows
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27 // table index: clz(16) = 27 -> 0, clz(8) -> 1, clz(4) -> 2
+ ldr r8, [r7, r8, lsl #2] // table-relative offset of the width handler
+ vmov.i16 q8, #0 // q8-q11 accumulate 16-bit lane sums of every value stored to ac
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8 // r7 = address of the width handler
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2 // r12 = input pointer one stride past r1 (the odd rows)
+ vdup.32 d31, lr
+ lsl r2, r2, #1 // stride * 2: r1 and r12 each advance two rows per load
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl):
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2 // 8 input bytes per row; d0..d3 = four consecutive rows
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0 // add horizontally adjacent byte pairs, widening to 16 bit (4 outputs per row)
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2 // pair sum * 4
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4 // four rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov d0, d3 // replicate the last output row (d3) into d0-d3 for the vertical padding stores
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cmp r3, #0 // r3 = w_pad per the prototype above
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2 // 16 input bytes per row; q0..q3 = four consecutive rows
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0 // pairwise add, widening to 16 bit (8 outputs per row)
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2 // pair sum * 4
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #4 // four rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q3 // q0 = q1 = last output row, as stored by the w8 vertical padding loop
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.8 {d0}, [r1, :64], r2 // only 8 input bytes (4 outputs) are read per row; d0..d3 = four rows
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vdup.16 d7, d3[3] // expand each 4-output row to 8 by repeating its last value; done from the last row down so no source is overwritten before it is read
+ vmov d6, d3 // q3 = fourth row
+ vdup.16 d5, d2[3]
+ vmov d4, d2 // q2 = third row
+ vdup.16 d3, d1[3]
+ vmov d2, d1 // q1 = second row
+ vdup.16 d1, d0[3] // q0 = first row
+ subs r8, r8, #4 // four rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q3 // q0 = q1 = last output row
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2] // index the table by r3 (w_pad per the prototype above); entries exist for 0..3
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2 // 32 input bytes per row; q0-q1 = first row, q2-q3 = second row
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0 // pairwise add, widening to 16 bit (16 outputs per row)
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2 // pair sum * 4
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q2 // q0-q1 = q2-q3 = last output row, as stored by the w16 vertical padding loop
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16] // input bytes 16..23 of the first row; must be read before r1 is advanced by the next load
+ vld1.8 {q0}, [r1, :128], r2 // input bytes 0..15 of the first row
+ vldr d6, [r12, #16] // same split for the second row
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 d6, d6
+ vpaddl.u8 q2, q2
+ vshl.i16 d2, d2, #2
+ vshl.i16 q0, q0, #2
+ vshl.i16 d6, d6, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 d3, d2[3] // fill outputs 12..15 with the last valid output of the row
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q2 // q0-q1 = last output row
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2 // only 16 input bytes (8 outputs) are read per row
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vshl.i16 q0, q0, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3] // outputs 8..15 = last valid output of the row
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q2 // q0-q1 = last output row
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2 // only 8 input bytes (4 outputs) are read per row; d0 = first row, d1 = second row
+ vld1.8 {d1}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vshl.i16 q0, q0, #2
+ vdup.16 q3, d1[3] // second row, outputs 8..15
+ vdup.16 q1, d0[3] // first row, outputs 8..15
+ vdup.16 d5, d1[3] // second row, outputs 4..7
+ vmov d4, d1 // second row, outputs 0..3 (copied before d1 is overwritten below)
+ vdup.16 d1, d0[3] // first row, outputs 4..7
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q2 // q0-q1 = last output row
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
+
+// void ipred_cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1 // Stores every 8-bit input pixel to ac as int16 scaled by 8 (no subsampling), then subtracts the rounded mean
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24] // r4, r5 = first two stack arguments (h_pad, cw per the prototype above), past the 24 bytes just pushed
+ ldr r6, [sp, #32] // r6 = third stack argument (ch per the prototype above)
+ clz r8, r5 // clz(width)
+ lsl r4, r4, #2 // h_pad * 4; subtracted from the row count below, so h_pad appears to be in units of 4 rows
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26 // table index: clz(32) = 26 -> 0, clz(16) -> 1, clz(8) -> 2, clz(4) -> 3
+ ldr r8, [r7, r8, lsl #2] // table-relative offset of the width handler
+ vmov.i16 q8, #0 // q8-q11 accumulate 16-bit lane sums of every value stored to ac
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8 // r7 = address of the width handler
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2 // r12 = input pointer one stride past r1 (the odd rows)
+ vdup.32 d31, lr
+ lsl r2, r2, #1 // stride * 2: r1 and r12 each advance two rows per load
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.32 {d0[]}, [r1, :32], r2 // 4 bytes of the first row, replicated to both 32-bit lanes of d0
+ vld1.32 {d0[1]}, [r12, :32], r2 // second row replaces the upper lane: d0 = two rows
+ vld1.32 {d2[]}, [r1, :32], r2 // likewise d2 = third and fourth rows
+ vld1.32 {d2[1]}, [r12, :32], r2
+ vshll.u8 q0, d0, #3 // widen to 16 bit and multiply by 8
+ vshll.u8 q1, d2, #3
+ subs r8, r8, #4 // four rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov d0, d3 // replicate the last output row (d3) into d0-d3 for the vertical padding stores
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2 // 8 input bytes per row; d0, d2, d4, d6 = four consecutive rows
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d4}, [r1, :64], r2
+ vshll.u8 q0, d0, #3 // widen to 16 bit and multiply by 8
+ vld1.16 {d6}, [r12, :64], r2
+ vshll.u8 q1, d2, #3
+ vshll.u8 q2, d4, #3
+ vshll.u8 q3, d6, #3
+ subs r8, r8, #4 // four rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q3 // q0 = q1 = last output row, as stored by the w8 vertical padding loop
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0 // r3 = w_pad per the prototype above
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.8 {q1}, [r1, :128], r2 // 16 input bytes per row; q1 = first row, q3 = second row
+ vld1.8 {q3}, [r12, :128], r2
+ vshll.u8 q0, d2, #3 // low half first: q0 does not overlap the source q1
+ vshll.u8 q1, d3, #3 // high half expanded in place
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q2 // q0-q1 = q2-q3 = last output row, as stored by the w16 vertical padding loop
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.8 {d0}, [r1, :64], r2 // only 8 input bytes are read per row
+ vld1.8 {d4}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q2, d4, #3
+ vdup.16 q1, d1[3] // outputs 8..15 = last valid output of the row
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point
+ vmov q0, q2 // q0-q1 = last output row
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (w_pad >> 1) << 2: byte offset of the table entry; the table only has entries for w_pad = 0, 2, 4, 6
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1: // Copy and expand input, without padding
+ vld1.8 {q2, q3}, [r1, :128], r2 // 32 input bytes of the first row
+ vld1.8 {q13, q14}, [r12, :128], r2 // 32 input bytes of the second row
+ vshll.u8 q0, d4, #3 // first row -> q0..q3; each source half is consumed before its register is overwritten
+ vshll.u8 q1, d5, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ vshll.u8 q12, d26, #3 // second row, outputs 0..15 -> q12, q13
+ vshll.u8 q13, d27, #3
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3 // second row, outputs 16..31 -> q0, q1 (free again after the store above)
+ vshll.u8 q1, d29, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point; q12, q13, q0, q1 still hold the last row
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1: // Copy and expand input, padding 8
+ vldr d4, [r1, #16] // input bytes 16..23 of the first row; must be read before r1 is advanced by the next load
+ vld1.8 {q1}, [r1, :128], r2 // input bytes 0..15 of the first row
+ vldr d28, [r12, #16] // same split for the second row
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q2, d4, #3 // first row, outputs 16..23
+ vshll.u8 q0, d2, #3 // first row, outputs 0..15 -> q0, q1
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3 // second row, outputs 0..15 -> q12, q13
+ vshll.u8 q13, d27, #3
+ vdup.16 q3, d5[3] // first row, outputs 24..31 = last valid output
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3 // second row, outputs 16..23
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vdup.16 q1, d1[3] // second row, outputs 24..31 = last valid output
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point; q12, q13, q0, q1 still hold the last row
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.8 {q1}, [r1, :128], r2 // only 16 input bytes are read per row
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q0, d2, #3 // first row, outputs 0..15 -> q0, q1
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3 // second row, outputs 0..15 -> q12, q13
+ vshll.u8 q13, d27, #3
+ vdup.16 q2, d3[3] // first row, outputs 16..31 = last valid output
+ vdup.16 q3, d3[3]
+ subs r8, r8, #2 // two rows consumed
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q0, d27[3] // second row, outputs 16..31 = last valid output
+ vdup.16 q1, d27[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0 // flags are consumed by the beq at the hpad entry point; q12, q13, q0, q1 still hold the last row
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.8 {d0}, [r1, :64], r2 // only 8 input bytes are read per row
+ vld1.8 {d24}, [r12, :64], r2
+ vshll.u8 q0, d0, #3 // first row, outputs 0..7
+ vshll.u8 q12, d24, #3 // second row, outputs 0..7
+ subs r8, r8, #2 // two rows consumed
+ vdup.16 q1, d1[3] // first row, outputs 8..31 = last valid output
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q13, d25[3] // second row, outputs 8..31 = last valid output
+ vdup.16 q0, d25[3]
+ vdup.16 q1, d25[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0 // sets the flags for the beq directly below (this path falls through instead of branching)
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1 // one padded row of 32 outputs per iteration
+ vst1.16 {q12, q13}, [r0, :128]! // repeat the last row, left in q12, q13, q0, q1 by every width-32 loop above
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl r6, r6, #3 // the shared subtract loop consumes 16 coefficients per 4 counts of r6, so height * 8 covers 32 * height coefficients
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w4_calc_subtract_dc.
+ vpaddl.u16 q0, q8 // widen each accumulator to 32-bit lanes before combining them
+ vpaddl.u16 q1, q9
+ vpaddl.u16 q2, q10
+ vpaddl.u16 q3, q11
+ vadd.i32 q0, q0, q1
+ vadd.i32 q2, q2, q3
+ vadd.i32 q0, q0, q2
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3 // rewind r0 by (height * 8) << 3 = 64 * height bytes, i.e. the 32 * height int16 values written
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0] // broadcast the rounded mean for the subtract loop
+ b L(ipred_cfl_ac_420_w4_subtract_dc)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/ipred16.S b/third_party/dav1d/src/arm/32/ipred16.S
new file mode 100644
index 0000000000..fa78049768
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/ipred16.S
@@ -0,0 +1,3276 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, B Krishnan Iyer
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+function ipred_dc_128_16bpc_neon, export=1 // Fills the block with the constant (bitdepth_max + 1) >> 1; the edge pixels are never read
+ push {r4, lr}
+ ldr r4, [sp, #8] // r4 = first stack argument (height per the prototype above), past the 8 bytes just pushed
+ ldr r12, [sp, #24] // r12 = fifth stack argument (bitdepth_max per the prototype above)
+ clz r3, r3 // clz(width)
+ adr r2, L(ipred_dc_128_tbl) // r2 (topleft) is reused as the table base
+ sub r3, r3, #25 // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
+ vdup.16 q0, r12
+ ldr r3, [r2, r3, lsl #2] // table-relative offset of the width handler
+ add r12, r0, r1 // r12 = dst + stride (the odd rows)
+ vrshr.u16 q0, q0, #1 // (bitdepth_max + 1) >> 1 in every lane
+ add r2, r2, r3
+ lsl r1, r1, #1 // stride * 2: r0 and r12 each advance two rows per store
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl):
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4: // width 4: 8 bytes per row, four rows per iteration
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4, pc}
+8: // width 8: 16 bytes per row, four rows per iteration
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4, pc}
+160: // width 16: 32 bytes per row
+ vmov q1, q0 // widen the fill value to 32 bytes (q0-q1)
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320: // width 32: 64 bytes per row, written as two 32-byte stores
+ vmov q1, q0
+ sub r1, r1, #32 // compensate for the 32 bytes already advanced by the first store of each row
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640: // width 64: 128 bytes per row, written as four 32-byte stores
+ vmov q1, q0
+ sub r1, r1, #96 // compensate for the 96 bytes already advanced by the first three stores of each row
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2 // only two rows per iteration at this width
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1 // Copies the row of pixels starting at topleft + 1 into every row of the block
+ push {r4, lr}
+ ldr lr, [sp, #8] // lr = first stack argument (height per the prototype above), past the 8 bytes just pushed
+ clz r3, r3 // clz(width)
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25 // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
+ ldr r3, [r4, r3, lsl #2] // table-relative offset of the width handler
+ add r2, r2, #2 // skip one 16-bit pixel: r2 = topleft + 1
+ add r4, r4, r3
+ add r12, r0, r1 // r12 = dst + stride (the odd rows)
+ lsl r1, r1, #1 // stride * 2: r0 and r12 each advance two rows per store
+ bx r4
+
+ .align 2
+L(ipred_v_tbl):
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+
+40: // width 4: load 4 pixels once, then store them to four rows per iteration
+ vld1.16 {d0}, [r2]
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4, pc}
+80: // width 8
+ vld1.16 {q0}, [r2]
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4, pc}
+160: // width 16
+ vld1.16 {q0, q1}, [r2]
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320: // width 32: the 64-byte row is held in q0-q3 and written as two 32-byte stores
+ vld1.16 {q0, q1}, [r2]!
+ sub r1, r1, #32 // compensate for the 32 bytes already advanced by the first store of each row
+ vld1.16 {q2, q3}, [r2]
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.16 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.16 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640: // width 64: the 128-byte row is held in q0-q3 and q8-q11 and written as four 32-byte stores
+ vld1.16 {q0, q1}, [r2]!
+ sub r1, r1, #96 // compensate for the 96 bytes already advanced by the first three stores of each row
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r2]!
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r12, :128]!
+ subs lr, lr, #2 // only two rows per iteration at this width
+ vst1.16 {d16, d17, d18, d19}, [r0, :128]!
+ vst1.16 {d16, d17, d18, d19}, [r12, :128]!
+ vst1.16 {d20, d21, d22, d23}, [r0, :128], r1
+ vst1.16 {d20, d21, d22, d23}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1 // Fills each row with one pixel read from below topleft in memory: topleft[-1] for the first row, topleft[-2] for the next, and so on
+ push {r4-r5, lr}
+ ldr r4, [sp, #12] // r4 = first stack argument (height per the prototype above), past the 12 bytes just pushed
+ clz r3, r3 // clz(width)
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25 // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
+ ldr r3, [r5, r3, lsl #2] // table-relative offset of the width handler
+ sub r2, r2, #2 // r2 = topleft - 1 pixel: the source pixel for the first row
+ mov lr, #-2 // post-increment step: one 16-bit pixel backwards per row
+ add r5, r5, r3
+ add r12, r0, r1 // r12 = dst + stride (the odd rows)
+ lsl r1, r1, #1 // stride * 2: r0 and r12 each advance two rows per store
+ bx r5
+
+ .align 2
+L(ipred_h_tbl):
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_h_tbl) + CONFIG_THUMB
+40: // width 4: fetch four source pixels with a single vld4
+ sub r2, r2, #6 // r2 = topleft - 4 pixels, the lowest address of the group of four
+ mov lr, #-8 // step back four pixels per iteration
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr // each of the four pixels replicated across one d register; d3 holds the highest address (first row)
+ vst1.16 {d3}, [r0, :64], r1 // stored in reverse register order because the source runs backwards in memory
+ vst1.16 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8: // width 8: one source pixel replicated across a q register per row, four rows per iteration
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128], r1
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128], r1
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160: // width 16: 32 bytes per row, written as two 16-byte stores
+ sub r1, r1, #16 // compensate for the 16 bytes already advanced by the first store of each row
+16:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128]!
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320: // width 32: 64 bytes per row, written as four 16-byte stores
+ sub r1, r1, #48 // compensate for the 48 bytes already advanced by the first three stores of each row
+32:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128]!
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vst1.16 {q1}, [r12, :128]!
+ vst1.16 {q0}, [r0, :128]!
+ vst1.16 {q1}, [r12, :128]!
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640: // width 64: 128 bytes per row, written as four 32-byte stores; two rows per iteration
+ sub r1, r1, #96 // compensate for the 96 bytes already advanced by the first three stores of each row
+64:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #2
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vmov q1, q0 // q0-q1 = 32 bytes of the first row's pixel
+ vmov q3, q2 // q2-q3 = 32 bytes of the second row's pixel
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1 // Fills the block with the rounded average of the width pixels starting at topleft + 1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12] // r4 = first stack argument (height per the prototype above), past the 12 bytes just pushed
+ clz r3, r3 // clz(width)
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25 // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
+ ldr r3, [r5, r3, lsl #2] // table-relative offset of the width handler
+ add r2, r2, #2 // skip one 16-bit pixel: r2 = topleft + 1
+ add r5, r5, r3
+ add r12, r0, r1 // r12 = dst + stride (the odd rows)
+ lsl r1, r1, #1 // stride * 2: r0 and r12 each advance two rows per store
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl):
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+
+40: // width 4
+ vld1.16 {d0}, [r2]
+ vpadd.i16 d0, d0, d0 // two pairwise adds reduce the 4 pixels to their sum
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2 // rounded sum / 4
+ vdup.16 d0, d0[0]
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80: // width 8
+ vld1.16 {d0, d1}, [r2]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3 // rounded sum / 8
+ vdup.16 q0, d0[0]
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160: // width 16
+ vld1.16 {d0, d1, d2, d3}, [r2]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d4, d0, #4 // rounded sum / 16
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320: // width 32
+ vld1.16 {d0, d1, d2, d3}, [r2]!
+ vld1.16 {d4, d5, d6, d7}, [r2]
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0 // final pairwise add widens to 32 bit
+ vrshrn.i32 d18, q0, #5 // rounded sum / 32, narrowed back to 16 bit; only lane 0 is used below
+ vdup.16 q0, d18[0]
+ vdup.16 q1, d18[0]
+ sub r1, r1, #32 // compensate for the 32 bytes already advanced by the first store of each row
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640: // width 64
+ vld1.16 {d0, d1, d2, d3}, [r2]!
+ vld1.16 {d4, d5, d6, d7}, [r2]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0 // widen to 32 bit one step earlier than the width 32 path
+ vpadd.i32 d0, d0, d0
+ vrshrn.i32 d18, q0, #6 // rounded sum / 64, narrowed back to 16 bit; only lane 0 is used below
+ vdup.16 q0, d18[0]
+ vdup.16 q1, d18[0]
+ sub r1, r1, #96 // compensate for the 96 bytes already advanced by the first three stores of each row
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2 // only two rows per iteration at this width
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1 // Fills the block with the rounded average of the height pixels stored immediately before topleft
+ push {r4-r5, lr}
+ ldr r4, [sp, #12] // r4 = first stack argument (height per the prototype above), past the 12 bytes just pushed
+ sub r2, r2, r4, lsl #1 // r2 = topleft - height pixels (2 bytes each): lowest address of the pixels to average
+ clz r3, r3 // clz(width)
+ clz lr, r4 // clz(height)
+ sub lr, lr, #25 // height index 0..4 selects one of the h64..h4 entries
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20 // width index 5..9 (clz - 25 + 5) selects one of the w64..w4 entries in the same table
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3 // r3 = store loop chosen by width; each summing routine ends with bx r3
+ add r5, r5, lr // r5 = summing routine chosen by height
+ add r12, r0, r1 // r12 = dst + stride (the odd rows)
+ lsl r1, r1, #1 // stride * 2: r0 and r12 each advance two rows per store
+ bx r5
+
+ .align 2
+L(ipred_dc_left_tbl):
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+ vld1.16 {d0}, [r2, :64]
+ vpadd.i16 d0, d0, d0 // two pairwise adds reduce the 4 pixels to their sum
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2 // rounded sum / 4
+ vdup.16 q0, d0[0] // broadcast the dc value across q0 for whichever store loop follows
+ bx r3
+L(ipred_dc_left_w4):
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4 // four rows per iteration
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8):
+ vld1.16 {d0, d1}, [r2, :128]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3 // rounded sum / 8
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w8):
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4 // four rows per iteration
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4 // rounded sum / 16
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w16):
+ vmov q1, q0 // widen the fill value to 32 bytes (q0-q1)
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4 // four rows per iteration
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0 // final pairwise add widens to 32 bit
+ vrshrn.i32 d0, q0, #5 // rounded sum / 32, narrowed back to 16 bit; only lane 0 is used below
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w32):
+ sub r1, r1, #32 // compensate for the 32 bytes already advanced by the first store of each row
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4 // four rows per iteration
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0 // widen to 32 bit one step earlier than the h32 path
+ vpadd.i32 d0, d0, d0
+ vrshrn.i32 d0, q0, #6 // rounded sum / 64, narrowed back to 16 bit; only lane 0 is used below
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w64):
+ sub r1, r1, #96 // compensate for the 96 bytes already advanced by the first three stores of each row
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2 // only two rows per iteration at this width
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_16bpc_neon, export=1 // fills the block with one value: the rounded mean of the height pixels stored before topleft and the width pixels stored after it
+ push {r4-r6, lr}
+ ldr r4, [sp, #16] // r4 = height: first stack argument, just above the 16 bytes pushed
+ sub r2, r2, r4, lsl #1 // r2 = topleft - height pixels (2 bytes per pixel)
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.32 q15, lr // width + height
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25 // clz(height) - 25 = index of the matching h* table entry
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2] // table offset of the width stage (w*)
+ ldr r12, [r5, r12, lsl #2] // table offset of the height stage (h*)
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3 // r3 = address of the w* stage, reached later via bx r3
+ add r5, r5, r12 // r5 = address of the h* stage
+ vshr.u32 q15, q15, #1 // (width + height) >> 1
+ vdup.32 q14, lr // -ctz(width + height)
+ add r12, r0, r1 // r12 = dst + stride (second row)
+ lsl r1, r1, #1 // doubled stride: r0 and r12 each step two rows per store
+ bx r5 // run the height stage first
+ // Jump table: entries are offsets relative to the table label; five h* entries, then five w* entries
+ .align 2
+L(ipred_dc_tbl):
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ // Each h* stage leaves the 32 bit sum of the pixels before topleft in d0[0] and r2 pointing just past topleft
+L(ipred_dc_h4):
+ vld1.16 {d0}, [r2, :64]!
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // skip topleft
+ vpaddl.u16 d0, d0 // widen to 32 bit while finishing the horizontal sum
+ bx r3 // continue in the width stage
+L(ipred_dc_w4): // width stage: add the 4 pixels after topleft, round, divide by width + height, store
+ vld1.16 {d2}, [r2]
+ vadd.i32 d0, d0, d30 // += (width + height) >> 1 (rounding term)
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #4
+ vadd.i32 d0, d0, d2
+ vshl.u32 d0, d0, d28 // shift left by -ctz(width + height), i.e. right by ctz
+ beq 1f // height == width: width + height is a power of two, the shift was the whole division
+ // h = 8/16
+ cmp r4, #16
+ movw lr, #0x6667 // ~2^17 / 5
+ movw r5, #0xAAAB // ~2^17 / 3
+ it ne
+ movne lr, r5 // height != 16 (h = 8): remaining factor is 3, otherwise 5
+ vdup.32 d24, lr
+ vmul.i32 d0, d0, d24
+ vshr.u32 d0, d0, #17 // drop the 17 fractional bits of the reciprocal
+1:
+ vdup.16 d0, d0[0] // broadcast the DC value over one 4 pixel row
+2: // four rows per iteration
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+ vld1.16 {d0, d1}, [r2, :128]!
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // skip topleft
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w8):
+ vld1.16 {d2, d3}, [r2]
+ vadd.i32 d0, d0, d30 // += (width + height) >> 1 (rounding term)
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #8
+ vadd.i32 d0, d0, d2
+ vshl.u32 d0, d0, d28 // >> ctz(width + height)
+ beq 1f // square block: no extra division needed
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #0x6667 // ~2^17 / 5
+ movw r5, #0xAAAB // ~2^17 / 3
+ it ne
+ movne lr, r5 // h = 4/16: remaining factor is 3; h = 32: 5
+ vdup.32 d24, lr
+ vmul.i32 d0, d0, d24
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+2: // four rows per iteration
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // skip topleft
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w16):
+ vld1.16 {d2, d3, d4, d5}, [r2]
+ vadd.i32 d0, d0, d30 // += (width + height) >> 1 (rounding term)
+ vadd.i16 q1, q1, q2
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d1 // d1 only feeds the upper lanes, which are never read below
+ vpaddl.u16 d2, d2
+ cmp r4, #16
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28 // >> ctz(width + height); result kept in d4 so q0/q1 can be filled below
+ beq 1f // square block: no extra division needed
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #0x6667 // ~2^17 / 5
+ movw r5, #0xAAAB // ~2^17 / 3
+ it ne
+ movne lr, r5 // h = 8/32: remaining factor is 3; h = 4/64: 5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2: // four rows per iteration
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // skip topleft
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w32):
+ vld1.16 {d2, d3, d4, d5}, [r2]!
+ vadd.i32 d0, d0, d30 // += (width + height) >> 1 (rounding term)
+ vld1.16 {d16, d17, d18, d19}, [r2]
+ vadd.i16 q1, q1, q2
+ vadd.i16 q8, q8, q9
+ vadd.i16 q1, q1, q8
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #32
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28 // >> ctz(width + height)
+ beq 1f // square block: no extra division needed
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #0x6667 // ~2^17 / 5
+ movw r5, #0xAAAB // ~2^17 / 3
+ it ne
+ movne lr, r5 // h = 16/64: remaining factor is 3; h = 8: 5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ sub r1, r1, #32 // the first store of each row already advances the pointer by 32 bytes
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2: // four rows per iteration, two 32 byte stores per row
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+L(ipred_dc_h64):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]!
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0 // widen to 32 bit one step earlier than in the smaller sizes
+ add r2, r2, #2 // skip topleft
+ vpadd.i32 d0, d0, d0
+ bx r3
+L(ipred_dc_w64):
+ vld1.16 {d2, d3, d4, d5}, [r2]!
+ vadd.i32 d0, d0, d30 // += (width + height) >> 1 (rounding term)
+ vld1.16 {d16, d17, d18, d19}, [r2]!
+ vadd.i16 q1, q1, q2
+ vld1.16 {d20, d21, d22, d23}, [r2]!
+ vadd.i16 q8, q8, q9
+ vld1.16 {d24, d25, d26, d27}, [r2]!
+ vadd.i16 q10, q10, q11
+ vadd.i16 q12, q12, q13
+ vadd.i16 q1, q1, q8
+ vadd.i16 q10, q10, q12
+ vadd.i16 q1, q1, q10
+ vadd.i16 d2, d2, d3
+ vpaddl.u16 d2, d2
+ vpadd.i32 d2, d2, d2
+ cmp r4, #64
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28 // >> ctz(width + height)
+ beq 1f // square block: no extra division needed
+ // h = 16/32
+ cmp r4, #16
+ movw lr, #0x6667 // ~2^17 / 5
+ movw r5, #0xAAAB // ~2^17 / 3
+ it ne
+ movne lr, r5 // h = 32: remaining factor is 3; h = 16: 5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ sub r1, r1, #96 // the first three stores of each row already advance the pointer by 96 bytes
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2: // two rows per iteration, four 32 byte stores per row
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_16bpc_neon, export=1 // per pixel: base = left + top - topleft; output whichever of left, top, topleft is nearest to base (ties prefer left, then top)
+ push {r4-r6, lr}
+ vpush {q4} // q4 is used as a scratch register for the second min() below
+ ldr r4, [sp, #32] // r4 = height: first stack argument, above 16 + 16 bytes pushed
+ clz lr, r3
+ adr r12, L(ipred_paeth_tbl)
+ sub lr, lr, #25 // clz(width) - 25 = jump table index (0 for width 64, 4 for width 4)
+ ldr lr, [r12, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r2] // q2 = topleft replicated in every lane
+ add r6, r2, #2 // r6 = first pixel after topleft (top row)
+ sub r2, r2, #4 // r2 = two pixels before topleft (first pair of left pixels)
+ add r12, r12, lr
+ mov r5, #-4 // walk the left pixels backwards, two per step
+ add lr, r0, r1 // lr = dst + stride (second row)
+ lsl r1, r1, #1 // doubled stride: r0 and lr each step two rows
+ bx r12
+ // Jump table: offsets relative to the table label, ordered from width 64 down to width 4
+ .align 2
+L(ipred_paeth_tbl):
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ // Width 4: four rows per iteration, two rows packed in each q register
+40:
+ sub r2, r2, #4 // r2 = four pixels before topleft
+ mov r5, #-8 // four left pixels per step
+ vld1.16 {d6}, [r6] // top
+ vsub.i16 d16, d6, d4 // top - topleft
+ vmov d7, d6 // duplicate top into both halves of q3
+ vmov d17, d16 // duplicate top - topleft into both halves of q8
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5 // four left pixels, each replicated across one d register
+ vadd.i16 q9, q8, q0 // base
+ vadd.i16 q10, q8, q1
+ vabd.s16 q11, q3, q9 // tdiff
+ vabd.s16 q12, q3, q10
+ vabd.s16 q13, q2, q9 // tldiff
+ vabd.s16 q14, q2, q10
+ vabd.s16 q9, q0, q9 // ldiff
+ vabd.s16 q10, q1, q10
+ vmin.u16 q15, q11, q13 // min(tdiff, tldiff)
+ vmin.u16 q4, q12, q14
+ vcge.u16 q11, q13, q11 // tldiff >= tdiff
+ vcge.u16 q12, q14, q12
+ vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff
+ vcge.u16 q10, q4, q10
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.16 {d25}, [r0, :64], r1 // stores go from the highest register down: the left pixels were loaded in reverse row order
+ vst1.16 {d24}, [lr, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d23}, [r0, :64], r1
+ vst1.16 {d22}, [lr, :64], r1
+ bgt 4b
+ vpop {q4}
+ pop {r4-r6, pc}
+80: // widths 8..64 share one loop: two rows at a time, eight pixels per inner step
+160:
+320:
+640:
+ vld1.16 {q3}, [r6]! // first eight top pixels
+ mov r12, r3 // r12 = width, kept to rewind after each row pair
+ sub r1, r1, r3, lsl #1 // row step minus the bytes the post-incrementing stores already advance
+1:
+ vld2.16 {d0[], d2[]}, [r2, :32], r5 // two left pixels, replicated into d0 and d2
+ vmov d1, d0 // q0 = left pixel for the row written through lr
+ vmov d3, d2 // q1 = left pixel for the row written through r0
+2:
+ vsub.i16 q8, q3, q2 // top - topleft
+ vadd.i16 q9, q8, q0 // base
+ vadd.i16 q10, q8, q1
+ vabd.s16 q11, q3, q9 // tdiff
+ vabd.s16 q12, q3, q10
+ vabd.s16 q13, q2, q9 // tldiff
+ vabd.s16 q14, q2, q10
+ vabd.s16 q9, q0, q9 // ldiff
+ vabd.s16 q10, q1, q10
+ vmin.u16 q15, q11, q13 // min(tdiff, tldiff)
+ vmin.u16 q4, q12, q14
+ vcge.u16 q11, q13, q11 // tldiff >= tdiff
+ vcge.u16 q12, q14, q12
+ vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff
+ vcge.u16 q10, q4, q10
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8
+ vst1.16 {q12}, [r0, :128]!
+ vst1.16 {q11}, [lr, :128]!
+ ble 8f // row pair finished
+ vld1.16 {q3}, [r6]! // next eight top pixels
+ b 2b
+8:
+ subs r4, r4, #2
+ ble 9f
+ // End of horizontal loop, move pointers to next two rows
+ sub r6, r6, r12, lsl #1 // rewind the top pointer by width pixels
+ add r0, r0, r1
+ add lr, lr, r1
+ vld1.16 {q3}, [r6]! // reload the first eight top pixels
+ mov r3, r12 // reset the column counter
+ b 1b
+9:
+ vpop {q4}
+ pop {r4-r6, pc}
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_16bpc_neon, export=1 // per pixel: ((bottom+right)*256 + (left-right)*weights_hor[x] + (top-bottom)*weights_ver[y] + 256) >> 9
+ push {r4-r10, lr}
+ ldr r4, [sp, #32] // r4 = height: first stack argument, above the 32 bytes pushed
+ movrel r10, X(sm_weights)
+ add r12, r10, r4 // r12 = sm_weights + height (weights_ver)
+ add r10, r10, r3 // r10 = sm_weights + width (weights_hor)
+ clz r9, r3
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4, lsl #1 // lr = topleft - height pixels
+ sub r9, r9, #25 // clz(width) - 25 = jump table index
+ ldr r9, [r5, r9, lsl #2]
+ vld1.16 {d4[], d5[]}, [lr] // bottom
+ add r8, r2, #2 // r8 = first pixel after topleft (top row)
+ add r5, r5, r9
+ add r6, r0, r1 // r6 = dst + stride (second row)
+ lsl r1, r1, #1 // doubled stride: r0 and r6 each step two rows
+ bx r5
+ // Jump table: offsets relative to the table label, ordered from width 64 down to width 4
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ // Width 4: four rows per iteration
+40:
+ vld1.16 {d16}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #8 // r2 = four pixels before topleft
+ mov r7, #-8 // four left pixels per step, walking backwards
+ vdup.16 q3, d16[3] // right
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+ vadd.i16 d19, d4, d6 // bottom+right
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.u16 q12, d19, #8 // (bottom+right)*256
+ vshll.u16 q13, d19, #8
+ vshll.u16 q14, d19, #8
+ vshll.u16 q15, d19, #8
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vsub.i16 q1, q1, q3 // left-right
+ vsub.i16 q0, q0, q3
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d2, d18 // (left flipped)
+ vmlal.s16 q14, d1, d18
+ vmlal.s16 q15, d0, d18
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d16, d21
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d16, d23
+ vrshrn.i32 d24, q12, #9 // rounding shift back down to 16 bit pixels
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d27}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80: // width 8: two rows per iteration
+ vld1.16 {q8}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #4 // r2 = two pixels before topleft
+ mov r7, #-4 // two left pixels per step, walking backwards
+ vdup.16 q3, d17[3] // right
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+ vadd.i16 d3, d4, d6 // bottom+right
+8:
+ vld2.16 {d0[], d1[]}, [r2, :32], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.u16 q12, d3, #8 // (bottom+right)*256
+ vshll.u16 q13, d3, #8
+ vshll.u16 q14, d3, #8
+ vshll.u16 q15, d3, #8
+ vsub.i16 q0, q0, q3 // left-right
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d1, d19 // (left flipped)
+ vmlal.s16 q14, d0, d18
+ vmlal.s16 q15, d0, d19
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d17, d20
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d17, d22
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ subs r4, r4, #2
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q13}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160: // widths 16..64 share one loop: two rows at a time, eight pixels per inner step
+320:
+640:
+ add lr, r2, r3, lsl #1 // lr = topleft + width pixels, i.e. the last top pixel
+ sub r2, r2, #4 // r2 = two pixels before topleft
+ mov r7, #-4 // two left pixels per step, walking backwards
+ vld1.16 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3, lsl #1 // row step minus the bytes the post-incrementing stores already advance
+ mov r9, r3 // r9 = width, kept to rewind after each row pair
+ vadd.i16 d3, d4, d6 // bottom+right
+
+1:
+ vld2.16 {d0[], d1[]}, [r2, :32], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsub.i16 q0, q0, q3 // left-right
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2:
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vld1.16 {q8}, [r8]! // top
+ vshll.u16 q12, d3, #8 // (bottom+right)*256
+ vshll.u16 q13, d3, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.u16 q14, d3, #8
+ vshll.u16 q15, d3, #8
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d1, d19 // (left flipped)
+ vmlal.s16 q14, d0, d18
+ vmlal.s16 q15, d0, d19
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d17, d20
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d17, d22
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ subs r3, r3, #8
+ vst1.16 {q12}, [r0, :128]!
+ vst1.16 {q13}, [r6, :128]!
+ bgt 2b
+ subs r4, r4, #2
+ ble 9f
+ sub r8, r8, r9, lsl #1 // rewind the top pointer by width pixels
+ sub r10, r10, r9 // rewind weights_hor by width bytes
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, r9 // reset the column counter
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_16bpc_neon, export=1 // per pixel: bottom + (((top-bottom)*weights_ver[y] + 128) >> 8)
+ push {r4-r7, lr}
+ ldr r4, [sp, #20] // r4 = height: first stack argument, above the 20 bytes pushed
+ movrel r7, X(sm_weights)
+ add r7, r7, r4 // r7 = sm_weights + height (weights_ver)
+ clz lr, r3
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4, lsl #1 // r12 = topleft - height pixels
+ sub lr, lr, #25 // clz(width) - 25 = jump table index
+ ldr lr, [r5, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r12] // bottom
+ add r2, r2, #2 // r2 = first pixel after topleft (top row)
+ add r5, r5, lr
+ add r6, r0, r1 // r6 = dst + stride (second row)
+ lsl r1, r1, #1 // doubled stride: r0 and r6 each step two rows
+ bx r5
+ // Jump table: offsets relative to the table label, ordered from width 64 down to width 4
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ // Width 4: four rows per iteration, two rows packed in each q register
+40:
+ vld1.16 {d6}, [r2] // top
+ vsub.i16 d6, d6, d4 // top-bottom
+ vmov d7, d6 // same difference in both halves of q3
+4:
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vshll.u8 q8, d16, #7 // weights_ver << 7
+ vshll.u8 q9, d18, #7
+ vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q11, q3, q9
+ vadd.i16 q10, q10, q2 // += bottom
+ vadd.i16 q11, q11, q2
+ vst1.16 {d20}, [r0, :64], r1
+ vst1.16 {d21}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d22}, [r0, :64], r1
+ vst1.16 {d23}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80: // width 8: four rows per iteration
+ vld1.16 {q3}, [r2] // top
+ vsub.i16 q3, q3, q2 // top-bottom
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.u8 q8, d16, #7 // weights_ver << 7
+ vshll.u8 q9, d18, #7
+ vshll.u8 q10, d20, #7
+ vshll.u8 q11, d22, #7
+ vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q9, q3, q9
+ vqrdmulh.s16 q10, q3, q10
+ vqrdmulh.s16 q11, q3, q11
+ vadd.i16 q8, q8, q2 // += bottom
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {q8}, [r0, :128], r1
+ vst1.16 {q9}, [r6, :128], r1
+ subs r4, r4, #4
+ vst1.16 {q10}, [r0, :128], r1
+ vst1.16 {q11}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160: // widths 16..64 share one loop: four rows at a time, sixteen pixels per inner step
+320:
+640:
+ vpush {q4-q7} // q4-q7 hold the four per-row weights below
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1 // stride is now four rows
+ sub r1, r1, r3, lsl #1 // minus the bytes the post-incrementing stores already advance
+ mov r12, r3 // r12 = width, kept to rewind after each group of rows
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vshll.u8 q4, d8, #7 // weights_ver << 7
+ vshll.u8 q5, d10, #7
+ vshll.u8 q6, d12, #7
+ vshll.u8 q7, d14, #7
+2:
+ vld1.16 {q0, q1}, [r2]! // top
+ vsub.i16 q0, q0, q2 // top-bottom
+ vsub.i16 q1, q1, q2
+ vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q9, q1, q4
+ vqrdmulh.s16 q10, q0, q5
+ vqrdmulh.s16 q11, q1, q5
+ vqrdmulh.s16 q12, q0, q6
+ vqrdmulh.s16 q13, q1, q6
+ vqrdmulh.s16 q14, q0, q7
+ vqrdmulh.s16 q15, q1, q7
+ vadd.i16 q8, q8, q2 // += bottom
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q2
+ vadd.i16 q14, q14, q2
+ vadd.i16 q15, q15, q2
+ subs r3, r3, #16
+ vst1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r5, :128]!
+ vst1.16 {q14, q15}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r2, r2, r12, lsl #1 // rewind the top pointer by width pixels
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12 // reset the column counter
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_16bpc_neon, export=1 // per pixel: right + (((left-right)*weights_hor[x] + 128) >> 8)
+ push {r4-r8, lr}
+ ldr r4, [sp, #24] // r4 = height: first stack argument, above the 24 bytes pushed
+ movrel r8, X(sm_weights)
+ add r8, r8, r3 // r8 = sm_weights + width (weights_hor)
+ clz lr, r3
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3, lsl #1 // r12 = topleft + width pixels, i.e. the last top pixel
+ sub lr, lr, #25 // clz(width) - 25 = jump table index
+ ldr lr, [r5, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r12] // right
+ add r5, r5, lr
+ add r6, r0, r1 // r6 = dst + stride (second row)
+ lsl r1, r1, #1 // doubled stride: r0 and r6 each step two rows
+ bx r5
+ // Jump table: offsets relative to the table label, ordered from width 64 down to width 4
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ // Width 4: four rows per iteration, two rows packed in each q register
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #8 // r2 = four pixels before topleft
+ mov r7, #-8 // four left pixels per step, walking backwards
+ vshll.u8 q3, d6, #7 // weights_hor << 7
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left
+ vsub.i16 q0, q0, q2 // left-right
+ vsub.i16 q1, q1, q2
+ subs r4, r4, #4
+ vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q0, q3 // (left flipped)
+ vadd.i16 q8, q8, q2 // += right
+ vadd.i16 q9, q9, q2
+ vst1.16 {d17}, [r0, :64], r1
+ vst1.16 {d16}, [r6, :64], r1
+ vst1.16 {d19}, [r0, :64], r1
+ vst1.16 {d18}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80: // width 8: four rows per iteration
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #8 // r2 = four pixels before topleft
+ mov r7, #-8 // four left pixels per step, walking backwards
+ vshll.u8 q3, d6, #7 // weights_hor << 7
+8:
+ vld1.16 {d23}, [r2, :64], r7 // left
+ subs r4, r4, #4
+ vsub.i16 d23, d23, d4 // left-right
+ vdup.16 q8, d23[3] // flip left
+ vdup.16 q9, d23[2]
+ vdup.16 q10, d23[1]
+ vdup.16 q11, d23[0] // overwrites d23 itself, so this lane must be broadcast last
+ vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q9, q3
+ vqrdmulh.s16 q10, q10, q3
+ vqrdmulh.s16 q11, q11, q3
+ vadd.i16 q8, q8, q2 // += right
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {q8}, [r0, :128], r1
+ vst1.16 {q9}, [r6, :128], r1
+ vst1.16 {q10}, [r0, :128], r1
+ vst1.16 {q11}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160: // widths 16..64 share one loop: four rows at a time, sixteen pixels per inner step
+320:
+640:
+ vpush {q4-q7} // q4-q7 hold the four per-row left-right values below
+ sub r2, r2, #8 // r2 = four pixels before topleft
+ mov r7, #-8 // four left pixels per step, walking backwards
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1 // stride is now four rows
+ sub r1, r1, r3, lsl #1 // minus the bytes the post-incrementing stores already advance
+ mov r12, r3 // r12 = width, kept to rewind after each group of rows
+
+1:
+ vld1.16 {d15}, [r2, :64], r7 // left
+ vsub.i16 d15, d15, d4 // left-right
+ vdup.16 q4, d15[3] // flip left
+ vdup.16 q5, d15[2]
+ vdup.16 q6, d15[1]
+ vdup.16 q7, d15[0] // overwrites d15 itself, so this lane must be broadcast last
+2:
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ subs r3, r3, #16
+ vshll.u8 q0, d2, #7 // weights_hor << 7
+ vshll.u8 q1, d3, #7
+ vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q1, q4
+ vqrdmulh.s16 q10, q0, q5
+ vqrdmulh.s16 q11, q1, q5
+ vqrdmulh.s16 q12, q0, q6
+ vqrdmulh.s16 q13, q1, q6
+ vqrdmulh.s16 q14, q0, q7
+ vqrdmulh.s16 q15, q1, q7
+ vadd.i16 q8, q8, q2 // += right
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q2
+ vadd.i16 q14, q14, q2
+ vadd.i16 q15, q15, q2
+ vst1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r5, :128]!
+ vst1.16 {q14, q15}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r8, r8, r12 // rewind weights_hor by width bytes
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12 // reset the column counter
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon, export=1
+ movw r12, #511
+ ldrd r4, r5, [sp, #88]
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6
+ add r6, r6, r5
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]!
+ clz lr, r3
+ adr r5, L(ipred_filter\bpc\()_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64]
+ sub lr, lr, #26
+ ldr lr, [r5, lr, lsl #2]
+ vmovl.s8 q8, d20
+ vmovl.s8 q9, d21
+ add r5, r5, lr
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmovl.s8 q12, d27
+ vmovl.s8 q13, d28
+ vmovl.s8 q14, d29
+ mov r7, #-4
+ vdup.16 q15, r8
+ add r8, r2, #2
+ sub r2, r2, #4
+.if \bpc == 10
+ vmov.i16 q7, #0
+.endif
+ bx r5
+
+ .align 2
+L(ipred_filter\bpc\()_tbl):
+ .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r8] // top (0-3)
+4:
+ vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vrshr.s16 q2, q2, #4
+ vmax.s16 q2, q2, q7
+.else
+ vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6)
+ vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d4, q2, #4
+ vqrshrun.s32 d5, q3, #4
+.endif
+ vmin.s16 q2, q2, q15
+ subs r4, r4, #2
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r6, :64], r1
+ vmov d0, d5 // move top from [4-7] to [0-3]
+ bgt 4b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+80:
+ vld1.16 {q0}, [r8] // top (0-7)
+8:
+ vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q2, q2, #4
+ vmax.s16 q2, q2, q7
+ vmin.s16 q2, q2, q15
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6)
+ vrshr.s16 q3, q3, #4
+ vmax.s16 q3, q3, q7
+.else
+ vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6)
+ vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d4, q2, #4
+ vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d5, q3, #4
+ vmin.s16 q2, q2, q15
+ vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0)
+ vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6)
+ vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0)
+ vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d6, q4, #4
+ vqrshrun.s32 d7, q5, #4
+.endif
+ vmin.s16 q3, q3, q15
+ vswp d5, d6
+ subs r4, r4, #2
+ vst1.16 {q2}, [r0, :128], r1
+ vmov q0, q3
+ vst1.16 {q3}, [r6, :128], r1
+ bgt 8b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+160:
+320:
+ sub r1, r1, r3, lsl #1
+ mov lr, r3
+
+1:
+ vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2)
+2:
+ vld1.16 {q1, q2}, [r8]! // top(0-15)
+.if \bpc == 10
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q3, q3, #4
+ vmax.s16 q3, q3, q7
+ vmin.s16 q3, q3, q15
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q4, q4, #4
+ vmax.s16 q4, q4, q7
+ vmin.s16 q4, q4, q15
+ vmov q0, q4
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q5, q5, #4
+ vmax.s16 q5, q5, q7
+ vmin.s16 q5, q5, q15
+ vmov q0, q5
+ vmov.u16 r12, d5[3]
+ vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.16 d0[2], r12
+ subs r3, r3, #16
+ vrshr.s16 q6, q6, #4
+.else
+ vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6)
+ vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4)
+ vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0)
+ vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6)
+ vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4)
+ vqrshrun.s32 d6, q3, #4
+ vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2)
+ vqrshrun.s32 d7, q4, #4
+ vmin.s16 q3, q3, q15
+ vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0)
+ vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6)
+ vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0)
+ vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d8, q5, #4
+ vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q7, d22, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d9, q6, #4
+ vmin.s16 q0, q4, q15
+ vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0)
+ vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6)
+ vmin.s16 q4, q4, q15
+ vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0)
+ vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d10, q7, #4
+ vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d11, q6, #4
+ vmin.s16 q0, q5, q15
+ vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0)
+ vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6)
+ vmin.s16 q5, q5, q15
+ vmov.u16 r12, d5[3]
+ vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0)
+ vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6)
+ vmov.16 d0[2], r12
+ vqrshrun.s32 d12, q1, #4
+ subs r3, r3, #16
+ vqrshrun.s32 d13, q7, #4
+.endif
+ vswp q4, q5
+.if \bpc == 10
+ vmax.s16 q6, q6, q7
+.endif
+ vswp d7, d10
+ vmin.s16 q6, q6, q15
+
+ vswp d9, d12
+
+ vst1.16 {q3, q4}, [r0, :128]!
+ vst1.16 {q5, q6}, [r6, :128]!
+ ble 8f
+ vmov.u16 r12, d13[3]
+ vmov.16 d0[0], r12
+ vmov.u16 r12, d9[3]
+ vmov.16 d0[1], r12
+ b 2b
+8:
+ subs r4, r4, #2
+
+ ble 9f
+ sub r8, r6, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+.endm
+
+filter_fn 10 // Expand the filter_fn macro with bpc=10; presumably emits ipred_filter_10bpc_neon (macro head not visible here - confirm)
+filter_fn 12 // Expand the filter_fn macro with bpc=12; presumably emits ipred_filter_12bpc_neon (macro head not visible here - confirm)
+
+function ipred_filter_16bpc_neon, export=1 // Dispatcher: saves registers, then tail-branches to the 10 or 12 bpc filter body
+ push {r4-r8, lr} // 6 core registers = 24 bytes
+ vpush {q4-q7} // 4 q registers = 64 bytes, 88 bytes pushed in total; the branch target is expected to pop both sets
+ movw r12, 0x3ff // 1023, the largest 10 bit value
+ ldr r8, [sp, #104] // stack argument at (entry sp + 16); presumably bitdepth_max - TODO confirm against the C prototype
+ cmp r8, r12
+ ble ipred_filter_10bpc_neon // r8 <= 0x3ff: use the 10 bpc variant
+ b ipred_filter_12bpc_neon // otherwise use the 12 bpc variant
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_16bpc_neon, export=1 // Palette prediction: expands packed palette indices (two per idx byte) into 16 bit pixels
+ push {r4-r5, lr} // 12 bytes pushed, so the stack arguments start at [sp, #12]
+ ldr r4, [sp, #12] // r4 = w
+ ldr r5, [sp, #16] // r5 = h
+ vld1.16 {q14}, [r2, :128] // q14 = 16 bytes of palette data (eight 16 bit entries)
+ clz lr, r4
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25 // table index = clz(w) - 25: w=64 -> 0, w=32 -> 1, ..., w=4 -> 4
+ vmov.i8 q13, #7 // per-byte mask keeping the low 3 bits of an index
+ ldr lr, [r12, lr, lsl #2]
+ vmov.i16 q15, #0x100 // added later to bump the upper byte of every 16 bit lane by one
+ add r12, r12, lr
+ add r2, r0, r1 // r2 = dst + stride, second row pointer (pal pointer no longer needed; unused by the w=64 path)
+ bx r12
+
+ .align 2
+L(pal_pred_tbl): // offsets relative to this table, indexed by clz(w) - 25
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB // w = 64
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB // w = 32
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB // w = 16
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB // w = 8
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB // w = 4
+
+40: // w = 4: four rows per iteration
+ lsl r1, r1, #1 // stride * 2: r0 and r2 each skip the row written through the other pointer
+4:
+ vld1.8 {d2}, [r3, :64]! // 8 idx bytes = 16 packed indices
+ subs r5, r5, #4 // four rows are produced per iteration; flags consumed by the bgt below
+ vshr.u8 d3, d2, #4 // d3 = upper nibble of each idx byte
+ vand.u8 d2, d2, d26 // d2 = low 3 bits of each idx byte (d26 is the low half of the q13 mask)
+ vzip.8 d2, d3 // interleave: index from the low bits first, then the one from the upper nibble
+ // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ vadd.i8 q0, q1, q1 // q0 = 2*idx
+ vadd.i8 q1, q1, q1 // q1 = 2*idx as well, so the zip below duplicates every byte
+ vzip.8 q0, q1 // bytes are now 2*a, 2*a, 2*b, 2*b, ...
+ vadd.i16 q0, q0, q15 // +0x100 per lane turns each byte pair into 2*a, 2*a+1
+ vadd.i16 q1, q1, q15
+ vtbl.8 d0, {q14}, d0 // byte-wise lookup of both bytes of each palette entry
+ vtbl.8 d1, {q14}, d1
+ vst1.16 {d0}, [r0, :64], r1 // row 0
+ vtbl.8 d2, {q14}, d2
+ vst1.16 {d1}, [r2, :64], r1 // row 1
+ vtbl.8 d3, {q14}, d3
+ vst1.16 {d2}, [r0, :64], r1 // row 2
+ vst1.16 {d3}, [r2, :64], r1 // row 3
+ bgt 4b // loop while rows remain (flags from the subs above)
+ pop {r4-r5, pc}
+80: // w = 8: four rows per iteration
+ lsl r1, r1, #1 // stride * 2, same two-pointer scheme as the w = 4 path
+8:
+ vld1.8 {q1}, [r3, :64]! // 16 idx bytes = 32 packed indices
+ subs r5, r5, #4 // four rows are produced per iteration
+ vshr.u8 q2, q1, #4 // upper nibbles
+ vand.u8 q1, q1, q13 // low 3 bits
+ vzip.8 q1, q2 // q1, q2 = indices in pixel order
+ // Prefer doing the adds twice, instead of chaining a vmov after
+ // the add.
+ vadd.i8 q0, q1, q1 // 2*idx, computed twice so that vzip can duplicate each byte
+ vadd.i8 q1, q1, q1
+ vadd.i8 q3, q2, q2
+ vadd.i8 q2, q2, q2
+ vzip.8 q0, q1
+ vzip.8 q2, q3
+ vadd.i16 q0, q0, q15 // byte pairs become 2*idx, 2*idx+1
+ vadd.i16 q1, q1, q15
+ vtbl.8 d0, {q14}, d0 // palette lookups, interleaved with the remaining adds
+ vadd.i16 q2, q2, q15
+ vtbl.8 d1, {q14}, d1
+ vadd.i16 q3, q3, q15
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vst1.16 {q0}, [r0, :128], r1 // row 0
+ vtbl.8 d6, {q14}, d6
+ vst1.16 {q1}, [r2, :128], r1 // row 1
+ vtbl.8 d7, {q14}, d7
+ vst1.16 {q2}, [r0, :128], r1 // row 2
+ vst1.16 {q3}, [r2, :128], r1 // row 3
+ bgt 8b
+ pop {r4-r5, pc}
+160: // w = 16: four rows per iteration
+ lsl r1, r1, #1 // stride * 2, same two-pointer scheme as above
+16:
+ vld1.8 {q10, q11}, [r3, :64]! // 32 idx bytes = 64 packed indices
+ subs r5, r5, #4 // four rows are produced per iteration
+ vand.u8 q2, q10, q13 // low 3 bits of the first 16 bytes
+ vshr.u8 q3, q10, #4 // upper nibbles of the first 16 bytes
+ vand.u8 q10, q11, q13 // same split for the second 16 bytes
+ vshr.u8 q11, q11, #4
+ vzip.8 q2, q3 // indices in pixel order
+ vzip.8 q10, q11
+ vadd.i8 q0, q2, q2 // 2*idx, each computed twice so that vzip can duplicate the bytes
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15 // byte pairs become 2*idx, 2*idx+1
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0 // palette lookups
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128], r1 // row 0
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r2, :128], r1 // row 1
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r0, :128], r1 // row 2
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r2, :128], r1 // row 3
+ bgt 16b
+ pop {r4-r5, pc}
+320: // w = 32: two rows per iteration, one through r0 and one through r2
+ lsl r1, r1, #1 // stride * 2
+ sub r1, r1, #32 // the first store of each row already post-increments by 32 bytes
+32:
+ vld1.8 {q10, q11}, [r3, :64]! // 32 idx bytes = 64 packed indices
+ subs r5, r5, #2 // two rows are produced per iteration
+ vand.u8 q2, q10, q13 // low 3 bits / upper nibble split, as in the w = 16 path
+ vshr.u8 q3, q10, #4
+ vand.u8 q10, q11, q13
+ vshr.u8 q11, q11, #4
+ vzip.8 q2, q3
+ vzip.8 q10, q11
+ vadd.i8 q0, q2, q2 // 2*idx, each computed twice so that vzip can duplicate the bytes
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15 // byte pairs become 2*idx, 2*idx+1
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0 // palette lookups
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128]! // row 0, pixels 0-15
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r0, :128], r1 // row 0, pixels 16-31, then advance to the row after next
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r2, :128]! // row 1, pixels 0-15
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r2, :128], r1 // row 1, pixels 16-31
+ bgt 32b
+ pop {r4-r5, pc}
+640: // w = 64: one row per iteration, written through r0 only
+ sub r1, r1, #96 // three of the four stores per row post-increment by 32 bytes each
+64:
+ vld1.8 {q10, q11}, [r3, :64]! // 32 idx bytes = 64 packed indices = one row
+ subs r5, r5, #1 // one row is produced per iteration
+ vand.u8 q2, q10, q13 // low 3 bits / upper nibble split, as in the w = 16 path
+ vshr.u8 q3, q10, #4
+ vand.u8 q10, q11, q13
+ vshr.u8 q11, q11, #4
+ vzip.8 q2, q3
+ vzip.8 q10, q11
+ vadd.i8 q0, q2, q2 // 2*idx, each computed twice so that vzip can duplicate the bytes
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15 // byte pairs become 2*idx, 2*idx+1
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0 // palette lookups
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128]! // pixels 0-15
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r0, :128]! // pixels 16-31
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r0, :128]! // pixels 32-47
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r0, :128], r1 // pixels 48-63, then advance to the next row
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_128_16bpc_neon, export=1 // CfL prediction with a fixed mid-range dc; also hosts the splat loops shared by the other ipred_cfl_* entry points
+ push {r4-r8, lr} // 24 bytes pushed, so the stack arguments start at [sp, #24]
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac
+ ldrd r6, r7, [sp, #32] // r6 = alpha, r7 = bitdepth_max
+ clz lr, r3
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26 // table index = clz(width) - 26: width=32 -> 0, ..., width=4 -> 3
+ ldr lr, [r12, lr, lsl #2]
+ vrshr.u16 q0, q15, #1 // dc = (bitdepth_max + 1) >> 1
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r6, r0, r1 // r6 = dst + stride, pointer for every second row
+ lsl r1, r1, #1 // stride * 2, since r0 and r6 each write alternate rows
+ vmov.i16 q14, #0 // lower clamp bound
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl): // alias of the same table, referenced by name from ipred_cfl_left_16bpc_neon
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width = 32 (the w16 loop handles any multiple of 16)
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width = 16
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width = 8
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB // width = 4
+
+L(ipred_cfl_splat_w4): // Expects q0 = dc, q1 = alpha, q14 = 0, q15 = bitdepth_max, r0/r6 = row pointers, r1 = stride * 2, r4 = height, r5 = ac
+ vld1.16 {q8, q9}, [r5, :128]! // 16 ac values = four rows of 4
+ vmull.s16 q2, d16, d2 // diff = ac * alpha
+ vmull.s16 q3, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31 (0 or -1)
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q8, #31
+ vshr.s32 q13, q9, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vadd.i32 q3, q3, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q8, #6
+ vrshrn.i32 d7, q9, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vmax.s16 q2, q2, q14 // clamp to [0, bitdepth_max]
+ vmax.s16 q3, q3, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vst1.16 {d4}, [r0, :64], r1 // row 0
+ vst1.16 {d5}, [r6, :64], r1 // row 1
+ subs r4, r4, #4 // four rows done; flags consumed by the bgt below
+ vst1.16 {d6}, [r0, :64], r1 // row 2
+ vst1.16 {d7}, [r6, :64], r1 // row 3
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8): // Same register contract as the w4 loop; two rows of 8 per iteration
+ vld1.16 {q8, q9}, [r5, :128]! // 16 ac values = two rows of 8
+ subs r4, r4, #2 // two rows per iteration; flags consumed by the bgt below
+ vmull.s16 q2, d16, d2 // diff = ac * alpha
+ vmull.s16 q3, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31 (0 or -1)
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q8, #31
+ vshr.s32 q13, q9, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vadd.i32 q3, q3, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q8, #6
+ vrshrn.i32 d7, q9, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vmax.s16 q2, q2, q14 // clamp to [0, bitdepth_max]
+ vmax.s16 q3, q3, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2}, [r0, :128], r1 // row 0
+ vst1.16 {q3}, [r6, :128], r1 // row 1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16): // Same register contract, plus r3 = width; handles width 16 and 32, two rows at a time
+ vpush {q4-q7} // q4-q7 are used as scratch below and restored before returning
+ add r12, r5, r3, lsl #1 // r12 = ac + width * 2 bytes = second ac row
+ sub r1, r1, r3, lsl #1 // the post-incrementing stores already advance by one row of pixels
+ mov lr, r3 // keep width for reloading the column counter
+1:
+ vld1.16 {q6, q7}, [r5, :128]! // 16 ac values of the first row
+ vmull.s16 q2, d12, d2 // diff = ac * alpha
+ vld1.16 {q8, q9}, [r12, :128]! // 16 ac values of the second row
+ vmull.s16 q3, d13, d3
+ vmull.s16 q4, d14, d2
+ vmull.s16 q5, d15, d3
+ vmull.s16 q6, d16, d2
+ vmull.s16 q7, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31 (0 or -1)
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q4, #31
+ vshr.s32 q13, q5, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vshr.s32 q10, q6, #31
+ vadd.i32 q3, q3, q11
+ vshr.s32 q11, q7, #31
+ vadd.i32 q4, q4, q12
+ vshr.s32 q12, q8, #31
+ vadd.i32 q5, q5, q13
+ vshr.s32 q13, q9, #31
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q4, #6
+ vrshrn.i32 d7, q5, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vrshrn.i32 d8, q6, #6
+ vrshrn.i32 d9, q7, #6
+ vadd.i16 q3, q3, q0
+ vrshrn.i32 d10, q8, #6
+ vrshrn.i32 d11, q9, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vmax.s16 q2, q2, q14 // clamp to [0, bitdepth_max]
+ vmax.s16 q3, q3, q14
+ vmax.s16 q4, q4, q14
+ vmax.s16 q5, q5, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vmin.s16 q4, q4, q15
+ vmin.s16 q5, q5, q15
+ subs r3, r3, #16 // 16 columns done; flags consumed by the bgt below
+ vst1.16 {q2, q3}, [r0, :128]! // first row
+ vst1.16 {q4, q5}, [r6, :128]! // second row
+ bgt 1b // more columns left in this row pair
+ subs r4, r4, #2 // two rows done; flags consumed by the second bgt below
+ add r5, r5, lr, lsl #1 // skip the ac row that was read through r12
+ add r12, r12, lr, lsl #1 // skip the ac row that was read through r5
+ add r0, r0, r1 // advance both dst pointers to the next row pair
+ add r6, r6, r1
+ mov r3, lr // reload the column counter
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_top_16bpc_neon, export=1 // CfL prediction with dc = rounded average of the width pixels following topleft
+ push {r4-r8, lr} // 24 bytes pushed, so the stack arguments start at [sp, #24]
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac
+ ldrd r6, r7, [sp, #32] // r6 = alpha, r7 = bitdepth_max
+ clz lr, r3
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26 // table index = clz(width) - 26: width=32 -> 0, ..., width=4 -> 3
+ ldr lr, [r12, lr, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #2 // step over the topleft pixel (2 bytes) to the first top pixel
+ add r12, r12, lr
+ add r6, r0, r1 // r6 = dst + stride, pointer for every second row
+ lsl r1, r1, #1 // stride * 2, since r0 and r6 each write alternate rows
+ vmov.i16 q14, #0 // lower clamp bound used by the splat loops
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl): // offsets relative to this table, indexed by clz(width) - 26
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4: // width = 4
+ vld1.16 {d0}, [r2] // 4 top pixels
+ vpadd.i16 d0, d0, d0 // two pairwise adds leave the total in every lane
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2 // dc = (sum + 2) >> 2
+ vdup.16 q0, d0[0] // broadcast dc for the splat loop
+ b L(ipred_cfl_splat_w4) // shared loop inside ipred_cfl_128_16bpc_neon, which also pops and returns
+8: // width = 8
+ vld1.16 {q0}, [r2] // 8 top pixels
+ vadd.i16 d0, d0, d1 // fold the two halves together
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3 // dc = (sum + 4) >> 3
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+16: // width = 16
+ vld1.16 {q2, q3}, [r2] // 16 top pixels
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4 // dc = (sum + 8) >> 4
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+32: // width = 32
+ vld1.16 {q8, q9}, [r2]! // 32 top pixels in two loads
+ vld1.16 {q10, q11}, [r2]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q8, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0 // last pairwise add widens to 32 bit lanes
+ vrshrn.i32 d0, q0, #5 // dc = (sum + 16) >> 5, narrowed back to 16 bit (only lane 0 is used)
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16) // the w16 loop also handles width 32
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1 // CfL prediction with dc = rounded average of the height pixels stored just before topleft
+ push {r4-r8, lr} // 24 bytes pushed, so the stack arguments start at [sp, #24]
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac
+ ldrd r6, r7, [sp, #32] // r6 = alpha, r7 = bitdepth_max
+ sub r2, r2, r4, lsl #1 // r2 = topleft - height pixels (presumably the left edge column - confirm layout with the caller)
+ clz lr, r3
+ clz r8, r4
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_splat_tbl) // splat table defined inside ipred_cfl_128_16bpc_neon
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26 // splat table index = clz(width) - 26
+ sub r8, r8, #26 // sum table index = clz(height) - 26
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr // r12 = splat loop for this width, entered via bx r12 once dc is known
+ add r7, r7, r8 // r7 = summing code for this height
+ add r6, r0, r1 // r6 = dst + stride, pointer for every second row
+ lsl r1, r1, #1 // stride * 2, since r0 and r6 each write alternate rows
+ vmov.i16 q14, #0 // lower clamp bound used by the splat loops
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl): // offsets relative to this table, indexed by clz(height) - 26
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+ vld1.16 {d0}, [r2, :64] // 4 pixels
+ vpadd.i16 d0, d0, d0 // two pairwise adds leave the total in every lane
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2 // dc = (sum + 2) >> 2
+ vdup.16 q0, d0[0] // broadcast dc for the splat loop
+ bx r12 // continue in the splat loop chosen by width
+
+L(ipred_cfl_left_h8):
+ vld1.16 {q0}, [r2, :128] // 8 pixels
+ vadd.i16 d0, d0, d1 // fold the two halves together
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3 // dc = (sum + 4) >> 3
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.16 {q2, q3}, [r2, :128] // 16 pixels
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4 // dc = (sum + 8) >> 4
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.16 {q8, q9}, [r2, :128]! // 32 pixels in two loads
+ vld1.16 {q10, q11}, [r2, :128]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q8, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0 // last pairwise add widens to 32 bit lanes
+ vrshrn.i32 d0, q0, #5 // dc = (sum + 16) >> 5, narrowed back to 16 bit (only lane 0 is used)
+ vdup.16 q0, d0[0]
+ bx r12
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_16bpc_neon, export=1 // CfL prediction with dc = rounded average of the pixels on both sides of topleft (height before it, width after it)
+ push {r4-r8, lr} // 24 bytes pushed, so the stack arguments start at [sp, #24]
+ ldrd r4, r5, [sp, #24] // r4 = height, r5 = ac
+ ldrd r6, r7, [sp, #32] // r6 = alpha, r7 = bitdepth_max
+ sub r2, r2, r4, lsl #1 // r2 = topleft - height pixels, start of the first summed run
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3
+ clz r6, r4
+ vdup.32 d16, r8 // width + height
+ vdup.16 q15, r7 // bitdepth_max
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26 // table index of the h entry = clz(height) - 26
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr // r12 = second stage (w entry), reached via bx r12 from the h entry
+ add r7, r7, r6 // r7 = first stage (h entry)
+ vshr.u32 d16, d16, #1 // (width + height) >> 1
+ vdup.32 d17, r8 // -ctz(width + height)
+ add r6, r0, r1 // r6 = dst + stride, pointer for every second row
+ lsl r1, r1, #1 // stride * 2, since r0 and r6 each write alternate rows
+ vmov.i16 q14, #0 // lower clamp bound used by the splat loops
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl): // entries 0-3 are selected by height, entries 4-7 by width
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4): // first stage: sum the 4 pixels before topleft into 32 bit lanes of d0
+ vld1.16 {d0}, [r2, :64]!
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // step over the topleft pixel itself
+ vpaddl.u16 d0, d0 // widen the sum to 32 bit
+ bx r12 // continue with the w entry
+L(ipred_cfl_w4): // second stage: add the 4 pixels after topleft, then divide by width + height
+ vld1.16 {d1}, [r2]
+ vadd.i32 d0, d0, d16 // add the rounding bias (width + height) >> 1
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #4 // square block: the shift alone completes the division
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17 // negative shift count = right shift by ctz(width + height)
+ beq 1f
+ // h = 8/16
+ cmp r4, #16
+ movw lr, #0x6667 // about 2^17 / 5, kept when h = 16 (remaining divisor 5)
+ movw r8, #0xAAAB // about 2^17 / 3, used when h = 8 (remaining divisor 3)
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18 // multiply by the reciprocal ...
+ vshr.u32 d0, d0, #17 // ... and drop its 17 fractional bits
+1:
+ vdup.16 q0, d0[0] // broadcast dc for the splat loop
+ b L(ipred_cfl_splat_w4) // shared loop inside ipred_cfl_128_16bpc_neon, which also pops and returns
+
+L(ipred_cfl_h8): // first stage: sum the 8 pixels before topleft
+ vld1.16 {q0}, [r2, :128]!
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // step over the topleft pixel itself
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w8): // second stage: add the 8 pixels after topleft, then divide by width + height
+ vld1.16 {q2}, [r2]
+ vadd.i32 d0, d0, d16 // add the rounding bias (width + height) >> 1
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #8 // square block: the shift alone completes the division
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17 // right shift by ctz(width + height)
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #0x6667 // about 2^17 / 5, kept when h = 32
+ movw r8, #0xAAAB // about 2^17 / 3, used when h = 4 or 16
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16): // first stage: sum the 16 pixels before topleft
+ vld1.16 {q2, q3}, [r2, :128]!
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // step over the topleft pixel itself
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w16): // second stage: add the 16 pixels after topleft, then divide by width + height
+ vld1.16 {q2, q3}, [r2]
+ vadd.i32 d0, d0, d16 // add the rounding bias (width + height) >> 1
+ vadd.i16 q2, q2, q3
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #16 // square block: the shift alone completes the division
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17 // right shift by ctz(width + height)
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #0x6667 // about 2^17 / 5, kept when the tst result is zero (h = 4 or 64)
+ movw r8, #0xAAAB // about 2^17 / 3, used when h = 8 or 32
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32): // first stage: sum the 32 pixels before topleft
+ vld1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q2, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2 // step over the topleft pixel itself
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w32): // second stage: add the 32 pixels after topleft, then divide by width + height
+ vld1.16 {q2, q3}, [r2]!
+ vadd.i32 d0, d0, d16 // add the rounding bias (width + height) >> 1
+ vld1.16 {q10, q11}, [r2]!
+ vadd.i16 q2, q2, q3
+ vadd.i16 q10, q10, q11
+ vadd.i16 q2, q2, q10
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #32 // square block: the shift alone completes the division
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17 // right shift by ctz(width + height)
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #0x6667 // about 2^17 / 5, kept when h = 8
+ movw r8, #0xAAAB // about 2^17 / 3, used when h = 16 or 64
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16) // the w16 loop also handles width 32
+endfunc
+
+// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_16bpc_neon, export=1 // Builds the CfL ac buffer from 2x2 subsampled luma: 8x the 2x2 average, with the rounded mean subtracted
+ push {r4-r8,lr} // 24 bytes pushed, so the stack arguments start at [sp, #24]
+ ldrd r4, r5, [sp, #24] // r4 = h_pad, r5 = cw
+ ldr r6, [sp, #32] // r6 = ch
+ clz r8, r5
+ lsl r4, r4, #2 // h_pad * 4; used below as a row count, so h_pad is presumably given in units of 4 rows
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27 // table index = clz(cw) - 27: cw=16 -> 0, cw=8 -> 1, cw=4 -> 2
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i32 q8, #0 // q8-q11 accumulate the 32 bit sums of all stored values
+ vmov.i32 q9, #0
+ vmov.i32 q10, #0
+ vmov.i32 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2 // r12 = ypx + stride, the second luma row
+ vdup.32 d31, lr
+ lsl r2, r2, #1 // stride * 2, since r1 and r12 each read alternate luma rows
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl): // offsets relative to this table, indexed by clz(cw) - 27
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4): // cw = 4: four luma rows in, two output rows per iteration
+1: // Copy and subsample input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vadd.i16 q0, q0, q1 // vertical pair sums
+ vadd.i16 q2, q2, q3
+ vpadd.i16 d0, d0, d1 // horizontal pair sums: d0 = first output row
+ vpadd.i16 d1, d4, d5 // d1 = second output row
+ vshl.i16 q0, q0, #1 // 2 * (sum of 4) = 8 * average
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0}, [r0, :128]!
+ vaddw.u16 q8, q8, d0 // accumulate for the dc calculation
+ vaddw.u16 q9, q9, d1
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label below
+ vmov d0, d1 // replicate the last output row into d0-d3 for vertical padding
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad): // also entered from ipred_cfl_ac_422_16bpc_neon
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4 // four padded rows of 4 per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc): // shared tail: wider paths scale r6 so that r6 * 4 equals the number of stored values
+ // Aggregate the sums
+ vadd.i32 q8, q8, q9
+ vadd.i32 q10, q10, q11
+ vadd.i32 q0, q8, q10
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3 // rewind r0 by r6 * 8 bytes to the start of the ac buffer
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0] // broadcast dc
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4 // 16 values (four rows of 4) per iteration
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8): // cw = 8
+ cmp r3, #0 // w_pad != 0 selects the padded variant
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vadd.i16 q0, q0, q2 // vertical pair sums for the first output row
+ vadd.i16 q1, q1, q3
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1 // horizontal pair sums: q0 = first output row
+ vpadd.i16 d1, d2, d3
+ vadd.i16 q12, q12, q2 // vertical pair sums for the second output row
+ vadd.i16 q13, q13, q3
+ vpadd.i16 d2, d24, d25 // q1 = second output row
+ vpadd.i16 d3, d26, d27
+ vshl.i16 q0, q0, #1 // 2 * (sum of 4) = 8 * average
+ vshl.i16 q1, q1, #1
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0 // accumulate for the dc calculation
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q1 // q0 = q1 = last output row, ready for vertical padding
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vadd.i16 q0, q0, q1 // vertical pair sums
+ vadd.i16 q2, q2, q3
+ vpadd.i16 d0, d0, d1 // d0 = left 4 values of the first output row
+ vpadd.i16 d1, d4, d5 // d1 = left 4 values of the second output row
+ vshl.i16 q0, q0, #1 // 2 * (sum of 4) = 8 * average
+ vdup.16 d3, d1[3] // right half of the second row = its last valid value
+ vmov d2, d1 // left half of the second row
+ vdup.16 d1, d0[3] // right half of the first row = its last valid value
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label directly below
+ vmov q0, q1 // q0 = q1 = last output row, ready for vertical padding
+
+L(ipred_cfl_ac_420_w8_hpad): // also entered from ipred_cfl_ac_422_16bpc_neon
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4 // four padded rows of 8 per iteration (two stores of two rows)
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16): // cw = 16: dispatch on w_pad (r3), used directly as the table index
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl): // indexed by w_pad = 0..3
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ sub r2, r2, #32 // the first load of each luma row already post-increments by 32 bytes
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q12, q13}, [r12, :128]!
+ vld1.16 {q2, q3}, [r1, :128], r2
+ vadd.i16 q0, q0, q12 // vertical pair sums, left half
+ vadd.i16 q1, q1, q13
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1 // horizontal pair sums: q0 = output values 0-7
+ vpadd.i16 d1, d2, d3
+ vadd.i16 q2, q2, q12 // vertical pair sums, right half
+ vadd.i16 q3, q3, q13
+ vpadd.i16 d2, d4, d5 // q1 = output values 8-15
+ vpadd.i16 d3, d6, d7
+ vshl.i16 q0, q0, #1 // 2 * (sum of 4) = 8 * average
+ vshl.i16 q1, q1, #1
+ subs r8, r8, #1 // one output row per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label; q0/q1 still hold the last row
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ sub r2, r2, #32 // the first load of each luma row already post-increments by 32 bytes
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q12, q13}, [r12, :128]!
+ vld1.16 {q2}, [r1, :128], r2
+ vadd.i16 q0, q0, q12 // vertical pair sums
+ vadd.i16 q1, q1, q13
+ vld1.16 {q12}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1 // output values 0-3
+ vadd.i16 q2, q2, q12
+ vpadd.i16 d1, d2, d3 // output values 4-7
+ vpadd.i16 d2, d4, d5 // output values 8-11
+ vshl.i16 q0, q0, #1 // 2 * (sum of 4) = 8 * average
+ vshl.i16 d2, d2, #1
+ subs r8, r8, #1 // one output row per iteration
+ vdup.16 d3, d2[3] // output values 12-15 = last valid value
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vadd.i16 q0, q0, q12 // vertical pair sums
+ vadd.i16 q1, q1, q13
+ vpadd.i16 d0, d0, d1 // output values 0-3
+ vpadd.i16 d1, d2, d3 // output values 4-7
+ vshl.i16 q0, q0, #1 // 2 * (sum of 4) = 8 * average
+ subs r8, r8, #1 // one output row per iteration
+ vdup.16 q1, d1[3] // output values 8-15 = last valid value
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q12}, [r12, :128], r2
+ vadd.i16 q0, q0, q12 // vertical pair sums
+ vpadd.i16 d0, d0, d1 // output values 0-3
+ vshl.i16 d0, d0, #1 // 2 * (sum of 4) = 8 * average
+ subs r8, r8, #1 // one output row per iteration
+ vdup.16 q1, d0[3] // output values 8-15 = last valid value
+ vdup.16 d1, d0[3] // output values 4-7 = last valid value
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad): // also entered from ipred_cfl_ac_422_16bpc_neon; q0/q1 hold the row to replicate
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2 // two padded rows of 16 per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
+
+// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_16bpc_neon, export=1 // Builds the CfL ac buffer from horizontally subsampled luma: 8x the pair average, with the rounded mean subtracted
+ push {r4-r8,lr} // 24 bytes pushed, so the stack arguments start at [sp, #24]
+ ldrd r4, r5, [sp, #24] // r4 = h_pad, r5 = cw
+ ldr r6, [sp, #32] // r6 = ch
+ clz r8, r5
+ lsl r4, r4, #2 // h_pad * 4; used below as a row count, so h_pad is presumably given in units of 4 rows
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27 // table index = clz(cw) - 27: cw=16 -> 0, cw=8 -> 1, cw=4 -> 2
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0 // q8-q11 accumulate the 32 bit sums of all stored values
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2 // r12 = ypx + stride, the second luma row
+ vdup.32 d31, lr
+ lsl r2, r2, #1 // stride * 2, since r1 and r12 each read alternate luma rows
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl): // offsets relative to this table, indexed by clz(cw) - 27
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4): // cw = 4: four luma rows in, four output rows per iteration
+1: // Copy and subsample input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1 // horizontal pair sums, one output row per d register
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4 // four output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0 // accumulate for the dc calculation
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov d0, d3 // replicate the last output row (d3) into d0-d2 for vertical padding
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad) // shared padding and dc code inside ipred_cfl_ac_420_16bpc_neon
+
+L(ipred_cfl_ac_422_w8): // cw = 8
+ cmp r3, #0 // w_pad != 0 selects the padded variant
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1 // q0 = output row 0
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5 // q1 = output row 1
+ vpadd.i16 d3, d6, d7
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d24, d24, d25 // q12 = output row 2
+ vpadd.i16 d25, d26, d27
+ vpadd.i16 d26, d4, d5 // q13 = output row 3
+ vpadd.i16 d27, d6, d7
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q12, #2
+ vshl.i16 q3, q13, #2
+ subs r8, r8, #4 // four output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q3 // q0 = q1 = last output row, ready for vertical padding
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vld1.16 {q12}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1 // left 4 values of output row 0
+ vpadd.i16 d1, d4, d5 // left 4 values of output row 1
+ vld1.16 {q2, q3}, [r12, :128], r2 // NOTE(review): only q2 is consumed; d6/d7 are overwritten below, so a 16 byte load looks sufficient
+ vpadd.i16 d24, d24, d25 // left 4 values of output row 2
+ vpadd.i16 d25, d4, d5 // left 4 values of output row 3
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vshl.i16 q12, q12, #2
+ vdup.16 d7, d25[3] // row 3: right half = last valid value
+ vmov d6, d25 // row 3: left half
+ vdup.16 d5, d24[3] // row 2: right half
+ vmov d4, d24 // row 2: left half
+ vdup.16 d3, d1[3] // row 1: right half
+ vmov d2, d1 // row 1: left half
+ vdup.16 d1, d0[3] // row 0: right half
+ subs r8, r8, #4 // four output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q3 // q0 = q1 = last output row, ready for vertical padding
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16): // cw = 16: dispatch on w_pad (r3), used directly as the table index
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl): // indexed by w_pad = 0..3
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ sub r2, r2, #32 // the first load of each luma row already post-increments by 32 bytes
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r12, :128]!
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1 // q0 = first row, output values 0-7
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d24, d25 // q1 = first row, output values 8-15
+ vpadd.i16 d3, d26, d27
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vpadd.i16 d4, d4, d5 // q2 = second row, output values 0-7
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d6, d24, d25 // q3 = second row, output values 8-15
+ vpadd.i16 d7, d26, d27
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q2 // q0/q1 = last output row, as expected by the shared hpad code
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ sub r2, r2, #32 // the first load of each luma row already post-increments by 32 bytes
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r12, :128]!
+ vld1.16 {q12}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1 // first row, output values 0-3
+ vpadd.i16 d1, d2, d3 // first row, output values 4-7
+ vpadd.i16 d2, d24, d25 // first row, output values 8-11
+ vld1.16 {q12}, [r12, :128], r2
+ vpadd.i16 d4, d4, d5 // second row, output values 0-3
+ vpadd.i16 d5, d6, d7 // second row, output values 4-7
+ vpadd.i16 d6, d24, d25 // second row, output values 8-11
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vshl.i16 d2, d2, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 d6, d6, #2
+ vdup.16 d3, d2[3] // first row, output values 12-15 = last valid value
+ vdup.16 d7, d6[3] // second row, output values 12-15 = last valid value
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q2 // q0/q1 = last output row, as expected by the shared hpad code
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1 // first row, output values 0-3
+ vpadd.i16 d1, d2, d3 // first row, output values 4-7
+ vpadd.i16 d4, d4, d5 // second row, output values 0-3
+ vpadd.i16 d5, d6, d7 // second row, output values 4-7
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3] // first row, output values 8-15 = last valid value
+ vdup.16 q3, d5[3] // second row, output values 8-15 = last valid value
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q2 // q0/q1 = last output row, as expected by the shared hpad code
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1 // first row, output values 0-3
+ vpadd.i16 d1, d4, d5 // second row, output values 0-3
+ vshl.i16 q0, q0, #2 // 4 * (sum of 2) = 8 * average
+ vdup.16 q3, d1[3] // second row, output values 8-15 = last valid value
+ vdup.16 q1, d0[3] // first row, output values 8-15 = last valid value
+ vdup.16 d5, d1[3] // second row, output values 4-7
+ vmov d4, d1 // second row, output values 0-3
+ vdup.16 d1, d0[3] // first row, output values 4-7 (d1 was copied out just above)
+ subs r8, r8, #2 // two output rows done; flags consumed by the bgt below
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // sets the flags tested at the hpad label
+ vmov q0, q2 // q0/q1 = last output row, as expected by the shared hpad code
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24] // r4 = h_pad, r5 = cw (stack arguments, above the 24 bytes just pushed)
+ ldr r6, [sp, #32] // r6 = ch
+ clz r8, r5
+ lsl r4, r4, #2 // h_pad *= 4, the unit used for "height - h_pad" below
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26 // clz(cw) - 26: 0 for cw=32, 1 for 16, 2 for 8, 3 for 4
+ ldr r8, [r7, r8, lsl #2] // table entry = offset of the width-specific code from the table
+ vmov.i16 q8, #0 // q8-q11 accumulate the running sums of all stored values
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2 // r12 = pointer to the second input row
+ vdup.32 d31, lr
+ lsl r2, r2, #1 // stride *= 2: r1 and r12 each advance two rows at a time
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d1}, [r12, :64], r2
+ vld1.16 {d2}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vshl.i16 q0, q0, #3 // ac value = pixel << 3
+ vshl.i16 q1, q1, #3
+ subs r8, r8, #4 // four rows of 4 per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0 // flags consumed by the hpad code branched to below
+ vmov d0, d3 // replicate the last row into d0-d3 before jumping to the shared padding code
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #4 // four rows of 8 per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3 // replicate the last row into q0 and q1 before jumping to the shared padding code
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0 // r3 = w_pad
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #2 // two rows of 16 per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2 // q0-q1 = last row, for the shared padding code
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q2, q2, #3
+ vdup.16 q1, d1[3] // right half of the row = last loaded sample of q0
+ vdup.16 q3, d5[3] // same for the second row (last sample of q2)
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (r3>>1) << 2, with r3 = w_pad; the table only has entries for w_pad 0, 2, 4, 6
+ asr r2, r2, #1 // undo the stride doubling: the w32 loops read one row per iteration, via r1 only
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ sub r2, r2, #32 // compensate for the 32 byte post-increment of the first load in the loop
+1: // Copy and expand input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #1 // one row of 32 per iteration
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ sub r2, r2, #32 // compensate for the 32 byte post-increment of the first load in the loop
+1: // Copy and expand input, padding 8
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q3, d5[3] // last 8 outputs = last loaded sample (top lane of q2)
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q2, d3[3] // last 16 outputs = last loaded sample (top lane of q1)
+ vdup.16 q3, d3[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.16 {q0}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ subs r8, r8, #1
+ vdup.16 q1, d1[3] // last 24 outputs = last loaded sample (top lane of q0)
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0 // falls through into the hpad code, which tests these flags
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1 // one padded row per iteration
+ vst1.16 {q0, q1}, [r0, :128]! // q0-q3 still hold the last row written by the loops above
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl r6, r6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
new file mode 100644
index 0000000000..ceea025e45
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -0,0 +1,3343 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations, that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+const idct_coeffs, align=4
+ // idct4 (loaded into d0 by the 4 and 8 point inverse DCT functions)
+ .short 2896, 2896*8, 1567, 3784
+ // idct8 (loaded into d1, as the upper half of q0, by the 8 point inverse DCT functions)
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4 // four groups of 12 halfwords: eight values pre-multiplied by 8, then four unscaled ones
+ .short 101*8, 4095*8, 2967*8, -2824*8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ // Halfwords 4-5 (3344, 0) double as one 32 bit lane: the 4 point ADST macros use them as d1[0] in vmul.s32
+ .short 1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs (the 8 point ADST macros load only the first four of these, into d2)
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4 // presumably the 16 point inverse ADST constants (going by the name) - confirm against the code that loads them
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
+.macro vmull_vmlal d0, s0, s1, c0, c1 // d0 = s0*c0 + s1*c1, widening 16 bit lanes to 32 bit
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1 // eight lane form: d0 = s0*c0 + s2*c1, d1 = s1*c0 + s3*c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlal.s16 \d1, \s3, \c1
+.endm
+
+.macro vmull_vmlsl d0, s0, s1, c0, c1 // d0 = s0*c0 - s1*c1, widening 16 bit lanes to 32 bit
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 // eight lane form: d0 = s0*c0 - s2*c1, d1 = s1*c0 - s3*c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlsl.s16 \d1, \s3, \c1
+.endm
+
+.macro vqrshrn_8h d0, d1, s0, s1, shift // rounding, saturating narrow of two 32 bit q registers into two 16 bit d registers
+ vqrshrn.s32 \d0, \s0, \shift
+ vqrshrn.s32 \d1, \s1, \shift
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7 // vqrdmulh 2, 4 or 8 registers in place by the scalar \c
+ vqrdmulh.s16 \r0, \r0, \c
+ vqrdmulh.s16 \r1, \r1, \c
+.ifnb \r2 // r2 and r3 are optional, and are only processed as a pair
+ vqrdmulh.s16 \r2, \r2, \c
+ vqrdmulh.s16 \r3, \r3, \c
+.endif
+.ifnb \r4 // r4-r7 are optional, and are only processed as a group of four
+ vqrdmulh.s16 \r4, \r4, \c
+ vqrdmulh.s16 \r5, \r5, \c
+ vqrdmulh.s16 \r6, \r6, \c
+ vqrdmulh.s16 \r7, \r7, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 // one software pipelined step; every blank argument skips its stage
+.ifnb \load
+ vld1.8 {\load}, [\src, :64], r1 // stage 1: load one row of 8 pixels, r1 = stride
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits // stage 1: rounding downshift of a row of coefficients
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc // stage 2: coefficients += widened pixels
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc // stage 3: saturate back to unsigned 8 bit
+.endif
+.ifnb \store
+ vst1.8 {\store}, [\dst, :64], r1 // stage 4: store the finished row
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4 // add the 8 coefficient rows in q8-q15 to the 8x8 pixels at \dst
+ mov \src, \dst // \src is the load pointer, \dst the store pointer; both walk the same rows
+ load_add_store d2, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store d3, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits
+ load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits // from here on all stages are active, each on a different row
+ load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits
+ load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits
+ load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits
+ load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits // remaining steps drain the pipeline
+ load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits
+ load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits
+ load_add_store , , , , , , d3, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src // add the 4 coefficient rows in q8-q11 to the 8x4 pixels at \dst, default shift of 4
+ mov \src, \dst // \src is the load pointer, \dst the store pointer
+ load_add_store d2, q8, , , , , , \dst, \src
+ load_add_store d3, q9, , , , , , \dst, \src
+ load_add_store d4, q10, d2, q8, , , , \dst, \src
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src
+ load_add_store , , d4, q10, q9, d3, d2, \dst, \src // remaining steps drain the pipeline
+ load_add_store , , d5, q11, q10, d4, d3, \dst, \src
+ load_add_store , , , , q11, d5, d4, \dst, \src
+ load_add_store , , , , , , d5, \dst, \src
+.endm
+.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src // like load_add_store, but each d register holds two rows of 4 pixels
+.ifnb \load
+ vld1.32 {\load[0]}, [\src, :32], r1 // first of the two rows
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #4 // fixed rounding shift of 4 (no shiftbits argument here)
+.endif
+.ifnb \load
+ vld1.32 {\load[1]}, [\src, :32], r1 // second of the two rows
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[0]}, [\dst, :32], r1
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[1]}, [\dst, :32], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src // add the coefficients in q8-q15 (two rows of 4 per register) to the 4x16 pixels at \dst
+ mov \src, \dst // \src is the load pointer, \dst the store pointer
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src
+ load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src
+ load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src // remaining steps drain the pipeline
+ load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src
+ load_add_store4 , , , , q15, d7, d6, \dst, \src
+ load_add_store4 , , , , , , d7, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src // add the coefficients in q8-q11 (two rows of 4 per register) to the 4x8 pixels at \dst
+ mov \src, \dst // \src is the load pointer, \dst the store pointer
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src // remaining steps drain the pipeline
+ load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 , , , , q11, d3, d2, \dst, \src
+ load_add_store4 , , , , , , d3, \dst, \src
+.endm
+
+.macro idct_dc w, h, shift // DC-only fast path: taken when eob is 0, otherwise execution continues at the trailing 1: label
+ cmp r3, #0 // r3 = eob
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16] // broadcast the first coefficient to all lanes of d16
+ vdup.16 d0, r12
+ vqrdmulh.s16 d16, d16, d0[0] // dc * 2896/4096, as vqrdmulh computes (2*a*b + 0x8000) >> 16
+ vst1.16 {d30[0]}, [r2, :16] // clear the coefficient that was just read
+.if (\w == 2*\h) || (2*\w == \h)
+ vqrdmulh.s16 d16, d16, d0[0] // one more scaling step for 2:1 and 1:2 block shapes
+.endif
+.if \shift > 0
+ vrshr.s16 d16, d16, #\shift
+.endif
+ vqrdmulh.s16 d20, d16, d0[0]
+ mov r3, #\h // r3 is reused as the row counter of the idct_dc_w function branched to below
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4 // q8 now holds the final DC value in all eight lanes
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon // add q8 to r3 rows of 4 pixels: r0 = dst, r1 = stride, four rows per iteration
+1:
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ subs r3, r3, #4
+ sub r0, r0, r1, lsl #2 // rewind r0 by four rows for the stores
+ vaddw.u8 q10, q8, d0
+ vqmovun.s16 d0, q10 // saturate to unsigned 8 bit
+ vaddw.u8 q11, q8, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q11
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w8_neon // add q8 to r3 rows of 8 pixels: r0 = dst, r1 = stride, four rows per iteration
+1:
+ vld1.8 {d0}, [r0, :64], r1
+ vld1.8 {d1}, [r0, :64], r1
+ vld1.8 {d2}, [r0, :64], r1
+ vaddw.u8 q10, q8, d0
+ vld1.8 {d3}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2 // rewind r0 by four rows for the stores
+ subs r3, r3, #4
+ vaddw.u8 q11, q8, d1
+ vqmovun.s16 d0, q10
+ vaddw.u8 q12, q8, d2
+ vqmovun.s16 d1, q11
+ vaddw.u8 q13, q8, d3
+ vst1.8 {d0}, [r0, :64], r1
+ vqmovun.s16 d2, q12
+ vst1.8 {d1}, [r0, :64], r1
+ vqmovun.s16 d3, q13
+ vst1.8 {d2}, [r0, :64], r1
+ vst1.8 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w16_neon // add q8 to r3 rows of 16 pixels: r0 = dst, r1 = stride, four rows per iteration
+1:
+ vld1.8 {q0}, [r0, :128], r1
+ vld1.8 {q1}, [r0, :128], r1
+ vld1.8 {q2}, [r0, :128], r1
+ subs r3, r3, #4
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vld1.8 {q3}, [r0, :128], r1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #2 // rewind r0 by four rows for the stores
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6 // q10 and q11 are reused for the fourth row once narrowed above
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q1}, [r0, :128], r1
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w32_neon // add q8 to r3 rows of 32 pixels: r0 = dst, r1 = stride, two rows per iteration
+1:
+ vld1.8 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.8 {q2, q3}, [r0, :128], r1
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #1 // rewind r0 by two rows for the stores
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6 // q10 and q11 are reused once narrowed above
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w64_neon // add q8 to r3 rows of 64 pixels: r0 = dst, r1 = stride, one row per iteration
+ sub r1, r1, #32 // the first store of each row post-increments r0 by 32 bytes already
+1:
+ vld1.8 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.8 {q2, q3}, [r0, :128]
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, #32 // back to the start of the row for the stores
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6 // q10 and q11 are reused once narrowed above
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128]!
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+.macro iwht4 // 4 point inverse WHT, in place on d16-d19; d20 and d21 are scratch; plain wrapping adds and subs only
+ vadd.i16 d16, d16, d17
+ vsub.i16 d21, d18, d19
+ vsub.i16 d20, d16, d21
+ vshr.s16 d20, d20, #1 // arithmetic shift, truncating (no rounding)
+ vsub.i16 d18, d20, d17
+ vsub.i16 d17, d20, d19
+ vadd.i16 d19, d21, d18
+ vsub.i16 d16, d16, d17
+.endm
+
+.macro idct_4h_x4 r0, r1, r2, r3 // 4 point inverse DCT, in place on four d registers; coefficients expected in d0; clobbers q1-q3
+ vmull_vmlal q3, \r1, \r3, d0[3], d0[2] // r1*d0[3] + r3*d0[2]
+ vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] // r1*d0[2] - r3*d0[3]
+ vmull_vmlal q1, \r0, \r2, d0[0], d0[0] // (r0 + r2) * d0[0]
+ vqrshrn.s32 d6, q3, #12
+ vqrshrn.s32 d7, q2, #12
+ vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] // (r0 - r2) * d0[0]
+ vqrshrn.s32 d2, q1, #12
+ vqrshrn.s32 d3, q2, #12
+ vqadd.s16 \r0, d2, d6 // final butterflies, saturating
+ vqsub.s16 \r3, d2, d6
+ vqadd.s16 \r1, d3, d7
+ vqsub.s16 \r2, d3, d7
+.endm
+
+.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 // eight lane idct_4h_x4: \q0-\q3 in place, \r0-\r7 name their d halves; clobbers q2-q7
+ vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2]
+ vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3]
+ vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vqrshrn_8h d12, d13, q6, q7, #12 // narrowed into q6
+ vqrshrn_8h d14, d15, q4, q5, #12 // narrowed into q7
+ vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vqrshrn_8h d4, d5, q2, q3, #12 // narrowed into q2
+ vqrshrn_8h d6, d7, q4, q5, #12 // narrowed into q3
+ vqadd.s16 \q0, q2, q6 // final butterflies, saturating
+ vqsub.s16 \q3, q2, q6
+ vqadd.s16 \q1, q3, q7
+ vqsub.s16 \q2, q3, q7
+.endm
+
+function inv_dct_4h_x4_neon, export=1 // 4 point inverse DCT, in place on d16-d19; clobbers r12 and q0-q3
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64] // d0 = first four idct_coeffs entries (the idct4 row)
+ idct_4h_x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_dct_8h_x4_neon, export=1 // 4 point inverse DCT on eight lanes, in place on q8-q11; clobbers r12 and q0, q2-q7
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64] // d0 = first four idct_coeffs entries (the idct4 row)
+ idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3 // 4 point inverse ADST: inputs are always d16-d19, the arguments only pick the output registers
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128] // d0 = 1321, 3803, 2482, 3344; d1[0] as a 32 bit lane = 3344
+
+ vsubl.s16 q1, d16, d18
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmull.s16 q10, d17, d0[3]
+ vaddw.s16 q1, q1, d19 // q1 = d16 - d18 + d19, widened to 32 bit
+ vmull.s16 q3, d16, d0[2]
+ vmlsl.s16 q3, d18, d0[0]
+ vmlsl.s16 q3, d19, d0[1]
+
+ vadd.s32 q11, q2, q3
+ vmul.s32 q1, q1, d1[0] // 32 bit multiply by 3344
+ vadd.s32 q2, q2, q10
+ vadd.s32 q3, q3, q10
+ vsub.s32 q11, q11, q10
+
+ vqrshrn.s32 \o0, q2, #12
+ vqrshrn.s32 \o2, q1, #12
+ vqrshrn.s32 \o1, q3, #12
+ vqrshrn.s32 \o3, q11, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1 // 4 point inverse ADST, d16-d19 in and out
+ iadst_4x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1 // same transform as inv_adst_4h_x4_neon, outputs written in reverse order (d19 down to d16)
+ iadst_4x4 d19, d18, d17, d16
+ bx lr
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 // eight lane 4 point inverse ADST: inputs are always q8-q11; \o0:\o1 = out0, \o2:\o3 = out1, \o4:\o5 = out2, \o6:\o7 = out3
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128] // d0 = 1321, 3803, 2482, 3344; d1[0] as a 32 bit lane = 3344
+
+ vsubl.s16 q2, d16, d20
+ vsubl.s16 q3, d17, d21
+ vmull.s16 q4, d16, d0[0]
+ vmlal.s16 q4, d20, d0[1]
+ vmlal.s16 q4, d22, d0[2]
+ vmull.s16 q5, d17, d0[0]
+ vmlal.s16 q5, d21, d0[1]
+ vmlal.s16 q5, d23, d0[2]
+ vaddw.s16 q2, q2, d22
+ vaddw.s16 q3, q3, d23
+ vmull.s16 q6, d16, d0[2]
+ vmlsl.s16 q6, d20, d0[0]
+ vmlsl.s16 q6, d22, d0[1]
+ vmull.s16 q7, d17, d0[2]
+ vmlsl.s16 q7, d21, d0[0]
+ vmlsl.s16 q7, d23, d0[1]
+
+ vmul.s32 q10, q2, d1[0] // out2, still 32 bit (low four lanes)
+ vmul.s32 q11, q3, d1[0] // out2, still 32 bit (high four lanes)
+
+ vmull.s16 q2, d18, d0[3]
+ vmull.s16 q3, d19, d0[3]
+
+ vadd.s32 q8, q4, q2 // out0
+ vadd.s32 q9, q5, q3
+
+ vadd.s32 q4, q4, q6 // out3
+ vadd.s32 q5, q5, q7
+
+ vadd.s32 q6, q6, q2 // out1
+ vadd.s32 q7, q7, q3
+
+ vsub.s32 q4, q4, q2 // out3
+ vsub.s32 q5, q5, q3
+
+ vqrshrn.s32 d20, q10, #12 // out2 is always narrowed into q10 first
+ vqrshrn.s32 d21, q11, #12
+
+ vqrshrn.s32 \o0, q8, #12
+ vqrshrn.s32 \o1, q9, #12
+
+.ifc \o4, d18 // when out2 belongs in q9 (d18:d19), move it there before out1 overwrites q10
+ vmov q9, q10
+.endif
+
+ vqrshrn.s32 \o2, q6, #12
+ vqrshrn.s32 \o3, q7, #12
+
+ vqrshrn.s32 \o6, q4, #12
+ vqrshrn.s32 \o7, q5, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1 // eight lane 4 point inverse ADST, q8-q11 in and out
+ iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1 // same transform as inv_adst_8h_x4_neon, outputs written in reverse order (q11 down to q8)
+ iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
+function inv_identity_4h_x4_neon, export=1 // identity transform: scales q8-q9 in place by 5793/4096; clobbers r12, d0, q2, q3
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q2, q8, d0[0] // x * (5793-4096)/4096, as vqrdmulh computes (2*a*b + 0x8000) >> 16
+ vqrdmulh.s16 q3, q9, d0[0]
+ vqadd.s16 q8, q8, q2 // x + x*(5793-4096)/4096 = x * 5793/4096
+ vqadd.s16 q9, q9, q3
+ bx lr
+endfunc
+
+function inv_identity_8h_x4_neon, export=1 // identity transform: scales q8-q11 in place by 5793/4096; clobbers r12, d0, q1-q3
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q1, q8, d0[0] // x * (5793-4096)/4096
+ vqrdmulh.s16 q2, q9, d0[0]
+ vqrdmulh.s16 q3, q10, d0[0]
+ vqadd.s16 q8, q8, q1 // x + x*(5793-4096)/4096 = x * 5793/4096
+ vqrdmulh.s16 q1, q11, d0[0] // q1 is reused for the fourth register
+ vqadd.s16 q9, q9, q2
+ vqadd.s16 q10, q10, q3
+ vqadd.s16 q11, q11, q1
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c // for each of the four registers: i = (i + vqrdmulh(i, c) + 1) >> 1; clobbers q1
+.irp i, \r0, \r1, \r2, \r3
+ vqrdmulh.s16 q1, \i, \c
+ vrhadd.s16 \i, \i, q1 // rounding halving add merges the addition with the downshift by 1
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 // 4x4 WHT in both directions, result added to dst (r0 = dst, r1 = stride, r2 = coeff)
+ push {r4-r5,lr} // matches the pop at the end of L(itx_4x4_end)
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]! // clear the first half of the coefficient buffer
+
+ vshr.s16 q8, q8, #2
+ vshr.s16 q9, q9, #2
+
+ iwht4
+
+ vst1.16 {q15}, [r2, :128]! // clear the second half of the coefficient buffer
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ iwht4
+
+ vld1.32 {d0[]}, [r0, :32], r1 // load the four destination rows into d0 and d1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+
+ b L(itx_4x4_end) // shared add and store tail; no rounding shift is applied on this path
+endfunc
+
+function inv_txfm_add_4x4_neon // shared 4x4 body: r4 and r5 = first and second transform; entered by a branch after the caller pushed {r4-r5,lr}
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]! // clear the first half of the coefficient buffer
+
+ blx r4 // first transform, on d16-d19
+
+ vst1.16 {q15}, [r2, :128]! // clear the second half of the coefficient buffer
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5 // second transform, on the transposed data
+
+ vld1.32 {d0[]}, [r0, :32], r1 // load the four destination rows into d0 and d1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ vrshr.s16 q8, q8, #4 // final rounding downshift
+ vrshr.s16 q9, q9, #4
+
+L(itx_4x4_end): // also the tail of the wht_wht and DC-only paths, which arrive with q8-q9 and d0-d1 already set up
+ sub r0, r0, r1, lsl #2 // rewind r0 by four rows for the stores
+ vaddw.u8 q8, q8, d0
+ vqmovun.s16 d0, q8 // saturate to unsigned 8 bit
+ vaddw.u8 q9, q9, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q9
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+
+ pop {r4-r5,pc} // returns to the caller of the exported entry point that pushed these
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2 // emits the exported 4x4 entry point for one transform pair
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ push {r4-r5,lr} // popped again at the end of inv_txfm_add_4x4_neon
+
+.ifc \txfm1\()_\txfm2, dct_dct // only the dct_dct variant gets the DC-only shortcut below
+ cmp r3, #0 // r3 = eob
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16] // broadcast the first coefficient
+ vdup.16 d4, r12
+ vst1.16 {d30[0]}, [r2, :16] // clear the coefficient that was just read
+ vqrdmulh.s16 d16, d16, d4[0] // dc * 2896/4096
+ vld1.32 {d0[0]}, [r0, :32], r1 // destination row loads are interleaved with the arithmetic
+ vqrdmulh.s16 d20, d16, d4[0]
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vmov q9, q8 // all 16 outputs are the same DC value
+ vld1.32 {d1[1]}, [r0, :32], r1
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4h_x4_neon // r4 = first transform
+ movrel_local r5, inv_\txfm2\()_4h_x4_neon // r5 = second transform
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 // 8 point inverse DCT on eight lanes, in place on \q0-\q7 (\r0-\r15 name their d halves); coefficients in q0; clobbers q2-q7
+ idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 // even inputs go through the 4 point DCT
+
+ vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a
+ vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a
+ vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a
+ vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a
+ vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a
+ vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
+ vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a
+ vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a
+
+ vqadd.s16 q2, \q1, \q3 // t4
+ vqsub.s16 \q1, \q1, \q3 // t5a
+ vqadd.s16 q3, \q7, \q5 // t7
+ vqsub.s16 \q3, \q7, \q5 // t6a
+
+ vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5
+ vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6
+ vqrshrn_8h d8, d9, q4, q5, #12 // t5
+ vqrshrn_8h d10, d11, q6, q7, #12 // t6
+
+ vqsub.s16 \q7, \q0, q3 // out7
+ vqadd.s16 \q0, \q0, q3 // out0
+ vqadd.s16 \q1, \q2, q5 // out1
+ vqsub.s16 q6, \q2, q5 // out6
+ vqadd.s16 \q2, \q4, q4 // out2
+ vqsub.s16 \q5, \q4, q4 // out5
+ vqadd.s16 \q3, \q6, q2 // out3
+ vqsub.s16 \q4, \q6, q2 // out4
+ vmov \q6, q6 // out6
+.endm
+
+.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 // 8 point inverse DCT on four lanes, in place on eight d registers; coefficients in q0; clobbers q1-q3
+ idct_4h_x4 \r0, \r2, \r4, \r6 // even inputs go through the 4 point DCT
+
+ vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a
+ vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a
+ vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a
+ vqrshrn.s32 \r1, q1, #12 // t4a
+ vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a
+ vqrshrn.s32 \r7, q2, #12 // t7a
+ vqrshrn.s32 \r3, q3, #12 // t5a
+ vqrshrn.s32 \r5, q1, #12 // t6a
+
+ vqadd.s16 d2, \r1, \r3 // t4
+ vqsub.s16 \r1, \r1, \r3 // t5a
+ vqadd.s16 d3, \r7, \r5 // t7
+ vqsub.s16 \r3, \r7, \r5 // t6a
+
+ vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5
+ vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6
+ vqrshrn.s32 d4, q2, #12 // t5
+ vqrshrn.s32 d5, q3, #12 // t6
+
+ vqsub.s16 \r7, \r0, d3 // out7
+ vqadd.s16 \r0, \r0, d3 // out0
+ vqadd.s16 \r1, \r2, d5 // out1
+ vqsub.s16 d6, \r2, d5 // out6
+ vqadd.s16 \r2, \r4, d4 // out2
+ vqsub.s16 \r5, \r4, d4 // out5
+ vqadd.s16 \r3, \r6, d2 // out3
+ vqsub.s16 \r4, \r6, d2 // out4
+ vmov \r6, d6 // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1 // 8 point inverse DCT on eight lanes, in place on q8-q15; clobbers r12, q0 and q2-q7
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128] // d0 = idct4 row, d1 = idct8 row of idct_coeffs
+ idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_dct_4h_x8_neon, export=1 // 8 point inverse DCT on four lanes, in place on d16-d23; clobbers r12 and q0-q3
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128] // d0 = idct4 row, d1 = idct8 row of idct_coeffs
+ idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 // 8 point inverse ADST on eight lanes: inputs are always q8-q15, the arguments only pick the output registers
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64] // d0 and d1 = ADST constants, d2 = 2896, 0, 1567, 3784
+
+ vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1]
+ vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0]
+ vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3]
+ vqrshrn_8h d16, d17, q2, q3, #12 // t0a
+ vqrshrn_8h d30, d31, q4, q5, #12 // t1a
+ vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2]
+ vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1]
+ vqrshrn_8h d20, d21, q6, q7, #12 // t2a
+ vqrshrn_8h d26, d27, q2, q3, #12 // t3a
+ vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0]
+ vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3]
+ vqrshrn_8h d24, d25, q4, q5, #12 // t4a
+ vqrshrn_8h d22, d23, q6, q7, #12 // t5a
+ vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2]
+ vqrshrn_8h d28, d29, q2, q3, #12 // t6a
+ vqrshrn_8h d18, d19, q4, q5, #12 // t7a
+
+ vqadd.s16 q2, q8, q12 // t0
+ vqsub.s16 q3, q8, q12 // t4
+ vqadd.s16 q4, q15, q11 // t1
+ vqsub.s16 q5, q15, q11 // t5
+ vqadd.s16 q6, q10, q14 // t2
+ vqsub.s16 q7, q10, q14 // t6
+ vqadd.s16 q10, q13, q9 // t3
+ vqsub.s16 q11, q13, q9 // t7
+
+ vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2]
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3]
+ vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2]
+
+ vqrshrn_8h d6, d7, q8, q9, #12 // t4a
+ vqrshrn_8h d10, d11, q12, q13, #12 // t5a
+
+ vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3]
+
+ vqrshrn_8h d14, d15, q14, q15, #12 // t6a
+ vqrshrn_8h d22, d23, q8, q9, #12 // t7a
+
+ vqadd.s16 \q0, q2, q6 // out0
+ vqsub.s16 q2, q2, q6 // t2
+ vqadd.s16 \q7, q4, q10 // out7
+ vqsub.s16 q4, q4, q10 // t3
+ vqneg.s16 \q7, \q7 // out7
+
+ vqadd.s16 \q1, q3, q7 // out1
+ vqsub.s16 q3, q3, q7 // t6
+ vqadd.s16 \q6, q5, q11 // out6
+ vqsub.s16 q5, q5, q11 // t7
+ vqneg.s16 \q1, \q1 // out1
+
+ vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12)
+ vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11)
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
+ vqrshrn_8h d4, d5, q10, q11, #12 // out3
+ vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
+ vqrshrn_8h d6, d7, q12, q13, #12 // out5
+ vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+ vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
+
+ vqneg.s16 \q3, q2 // out3
+ vqneg.s16 \q5, q3 // out5
+.endm
+
+.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 // 8 point inverse ADST on four lanes: inputs are always d16-d23, the arguments only pick the output registers
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64] // d0 and d1 = ADST constants, d2 = 2896, 0, 1567, 3784
+
+ vmull_vmlal q2, d23, d16, d0[0], d0[1]
+ vmull_vmlsl q3, d23, d16, d0[1], d0[0]
+ vmull_vmlal q4, d21, d18, d0[2], d0[3]
+ vqrshrn.s32 d16, q2, #12 // t0a
+ vqrshrn.s32 d23, q3, #12 // t1a
+ vmull_vmlsl q5, d21, d18, d0[3], d0[2]
+ vmull_vmlal q6, d19, d20, d1[0], d1[1]
+ vqrshrn.s32 d18, q4, #12 // t2a
+ vqrshrn.s32 d21, q5, #12 // t3a
+ vmull_vmlsl q7, d19, d20, d1[1], d1[0]
+ vmull_vmlal q2, d17, d22, d1[2], d1[3]
+ vqrshrn.s32 d20, q6, #12 // t4a
+ vqrshrn.s32 d19, q7, #12 // t5a
+ vmull_vmlsl q3, d17, d22, d1[3], d1[2]
+ vqrshrn.s32 d22, q2, #12 // t6a
+ vqrshrn.s32 d17, q3, #12 // t7a
+
+ vqadd.s16 d4, d16, d20 // t0
+ vqsub.s16 d5, d16, d20 // t4
+ vqadd.s16 d6, d23, d19 // t1
+ vqsub.s16 d7, d23, d19 // t5
+ vqadd.s16 d8, d18, d22 // t2
+ vqsub.s16 d9, d18, d22 // t6
+ vqadd.s16 d18, d21, d17 // t3
+ vqsub.s16 d19, d21, d17 // t7
+
+ vmull_vmlal q8, d5, d7, d2[3], d2[2]
+ vmull_vmlsl q10, d5, d7, d2[2], d2[3]
+ vmull_vmlsl q11, d19, d9, d2[3], d2[2]
+
+ vqrshrn.s32 d5, q8, #12 // t4a
+ vqrshrn.s32 d7, q10, #12 // t5a
+
+ vmull_vmlal q8, d19, d9, d2[2], d2[3]
+
+ vqrshrn.s32 d9, q11, #12 // t6a
+ vqrshrn.s32 d19, q8, #12 // t7a
+
+ vqadd.s16 \r0, d4, d8 // out0
+ vqsub.s16 d4, d4, d8 // t2
+ vqadd.s16 \r7, d6, d18 // out7
+ vqsub.s16 d6, d6, d18 // t3
+ vqneg.s16 \r7, \r7 // out7
+
+ vqadd.s16 \r1, d5, d9 // out1
+ vqsub.s16 d5, d5, d9 // t6
+ vqadd.s16 \r6, d7, d19 // out6
+ vqsub.s16 d7, d7, d19 // t7
+ vqneg.s16 \r1, \r1 // out1
+
+ vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20)
+ vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19)
+ vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18)
+ vqrshrn.s32 d4, q9, #12 // out3
+ vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21)
+ vqrshrn.s32 d5, q10, #12 // out5
+ vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21)
+ vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19)
+
+ vqneg.s16 \r3, d4 // out3
+ vqneg.s16 \r5, d5 // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1 // 8-point inverse ADST on 8-lane vectors, in place on q8-q15
+ iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr // outputs are in natural order: out0 in q8 ... out7 in q15
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1 // 8-point inverse flipped ADST on 8-lane vectors: same macro as the adst, with the output registers passed in reverse
+ iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr // outputs are in reversed order: out0 in q15 ... out7 in q8
+endfunc
+
+function inv_adst_4h_x8_neon, export=1 // 8-point inverse ADST on 4-lane vectors, in place on d16-d23
+ iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr // outputs are in natural order: out0 in d16 ... out7 in d23
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1 // 8-point inverse flipped ADST on 4-lane vectors: same macro as the adst, with the output registers passed in reverse
+ iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr // outputs are in reversed order: out0 in d23 ... out7 in d16
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+ // 8-point identity transform on 8-lane vectors: every 16-bit lane of
+ // q8-q15 is doubled in place with a saturating left shift by one.
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqshl.s16 \i, \i, #1
+.endr
+ bx lr
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+ // 8-point identity transform on 4-lane vectors: the eight d registers
+ // d16-d23 are handled pairwise as q8-q11, each 16-bit lane being doubled
+ // in place with a saturating left shift by one.
+.irp i, q8, q9, q10, q11
+ vqshl.s16 \i, \i, #1
+.endr
+ bx lr
+endfunc
+
+.macro def_fn_8x8_base variant // emits the shared 8x8 body: inv_txfm_add_8x8_neon (variant empty) or inv_txfm_identity_add_8x8_neon (variant = identity_)
+function inv_txfm_\variant\()add_8x8_neon // entered by a branch from the def_fn_8x8 stubs; uses r0 (destination for load_add_store), r2 (coefficients), r4/r5 (first/second 1-D transform)
+ vmov.i16 q0, #0 // q0/q1 = zeros used to clear the coefficient buffer
+ vmov.i16 q1, #0
+ vld1.16 {q8, q9}, [r2, :128] // load the 64 coefficients into q8-q15, overwriting each 32-byte chunk with zeros right after it is read
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128] // last chunk: no writeback, r2 is not needed afterwards in this function
+
+.ifc \variant, identity_
+ // The identity vqshl #1 (see inv_identity_8h_x8_neon) and the vrshr #1 downshift below cancel out, so neither is emitted
+.else
+ blx r4 // first pass: 1-D transform chosen by the stub
+
+ vrshr.s16 q8, q8, #1 // rounding downshift by 1 between the two passes
+ vrshr.s16 q9, q9, #1
+ vrshr.s16 q10, q10, #1
+ vrshr.s16 q11, q11, #1
+ vrshr.s16 q12, q12, #1
+ vrshr.s16 q13, q13, #1
+ vrshr.s16 q14, q14, #1
+ vrshr.s16 q15, q15, #1
+.endif
+ // transpose helper defined elsewhere; the trailing d-register list is part of its calling convention
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ blx r5 // second pass
+
+ load_add_store_8x8 r0, r7 // helper defined elsewhere; presumably adds q8-q15 to the 8x8 pixels at r0 and stores them, with r7 as a second pointer (confirm)
+ vpop {q4-q7} // undo the vpush/push done by the def_fn_8x8 stub and return to the caller of the stub
+ pop {r4-r5,r7,pc}
+endfunc
+.endm
+ // Instantiate both bodies: the generic one and the one that skips the first pass for identity
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2 // emits the exported 8x8 8bpc entry point for first-pass transform txfm1 and second-pass transform txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1 // dct_dct only: helper macro defined elsewhere, presumably a DC-only fast path that returns early (confirm)
+.endif
+ push {r4-r5,r7,lr} // restored by the pop {..., pc} at the end of the shared body
+ vpush {q4-q7}
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon // r5 = second-pass transform
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon // identity first pass: shared body that never calls r4
+.else
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon // r4 = first-pass transform
+ b inv_txfm_add_8x8_neon // tail-branch into the shared body, which returns for us
+.endif
+endfunc
+.endm
+ // Instantiate all 16 combinations of dct / adst / flipadst / identity
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon // shared 8x4 body, entered by a branch from the def_fn_48 stubs; uses r0, r2, r4 (first pass), r5 (second pass)
+ vmov.i16 q14, #0 // q14/q15 = zeros used to clear the coefficient buffer
+ vmov.i16 q15, #0
+ movw r12, #2896*8 // constant handed to scale_input; 2896/4096 is about 1/sqrt(2)
+ vdup.16 d0, r12
+ vld1.16 {d16, d17, d18, d19}, [r2, :128] // load the 32 coefficients into d16-d23 and zero them in memory
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11 // helper defined elsewhere; presumably multiplies q8-q11 by d0[0] (rectangular-block scaling) - confirm
+
+ blx r4 // first pass
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20 // regroup the d registers after the two 4x4 transposes into the layout the second pass expects (presumably one 8-wide row per q register)
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5 // second pass
+
+ load_add_store_8x4 r0, r7 // helper defined elsewhere; presumably adds the result to the 8x4 pixels at r0 and stores them (confirm)
+ vpop {q4-q7} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r5,r7,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon // shared 4x8 body, entered by a branch from the def_fn_48 stubs; uses r0, r2, r4 (first pass), r5 (second pass)
+ vmov.i16 q14, #0 // q14/q15 = zeros used to clear the coefficient buffer
+ vmov.i16 q15, #0
+ movw r12, #2896*8 // constant handed to scale_input; 2896/4096 is about 1/sqrt(2)
+ vdup.16 d0, r12
+ vld1.16 {q8, q9}, [r2, :128] // load the 32 coefficients into q8-q11 and zero them in memory
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11 // helper defined elsewhere; presumably multiplies q8-q11 by d0[0] (rectangular-block scaling) - confirm
+
+ blx r4 // first pass
+
+ transpose_4x8h q8, q9, q10, q11
+ vswp d17, d20 // regroup the d registers after the transpose into the layout the second pass expects (presumably one 4-wide row per d register, d16-d23)
+ vswp d19, d21
+ vswp d17, d18
+ vswp d19, d22
+
+ blx r5 // second pass
+
+ load_add_store_4x8 r0, r7 // helper defined elsewhere; presumably adds the result to the 4x8 pixels at r0 and stores them (confirm)
+ vpop {q4-q7} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r5,r7,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2 // emits the exported w x h (4x8 or 8x4) 8bpc entry point for first-pass txfm1 and second-pass txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0 // dct_dct only: helper macro defined elsewhere, presumably a DC-only fast path that returns early (confirm)
+.endif
+ push {r4-r5,r7,lr} // restored by the pop {..., pc} at the end of the shared body
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon // r4 = first pass: w-point transform on h-lane vectors
+ movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon // r5 = second pass: h-point transform on w-lane vectors
+ b inv_txfm_add_\w\()x\h\()_neon // tail-branch into the shared body, which returns for us
+endfunc
+.endm
+
+.macro def_fns_48 w, h // instantiates def_fn_48 for all 16 combinations of dct / adst / flipadst / identity at size w x h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+ // Emit the entry points for both rectangular sizes
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+function inv_dct_4h_x16_neon, export=1 // 16-point inverse DCT on 4-lane vectors, in place on d16-d31; clobbers r12, q0-q4 (plus whatever idct_4h_x8 uses)
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0, q1}, [r12, :128] // first 16 idct coefficients: q0 = d0/d1, q1 = d2/d3
+ // Odd inputs (d17, d19, ..., d31): four rotations with the coefficients in q1 give t8a..t15a
+ vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a
+ vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a
+ vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a
+ vqrshrn.s32 d17, q2, #12 // t8a
+ vqrshrn.s32 d31, q3, #12 // t15a
+ vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a
+ vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a
+ vqrshrn.s32 d23, q4, #12 // t9a
+ vqrshrn.s32 d25, q2, #12 // t14a
+ vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a
+ vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a
+ vqrshrn.s32 d21, q3, #12 // t10a
+ vqrshrn.s32 d27, q4, #12 // t13a
+ vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a
+ vqrshrn.s32 d19, q2, #12 // t11a
+ vqrshrn.s32 d29, q3, #12 // t12a
+ // Even inputs (d16, d18, ..., d30) go through the 8-point inverse DCT macro defined elsewhere
+ idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30
+ // Odd half, second stage: saturating butterflies
+ vqsub.s16 d4, d17, d23 // t9
+ vqadd.s16 d17, d17, d23 // t8
+ vqsub.s16 d5, d31, d25 // t14
+ vqadd.s16 d31, d31, d25 // t15
+ vqsub.s16 d23, d19, d21 // t10
+ vqadd.s16 d19, d19, d21 // t11
+ vqadd.s16 d25, d29, d27 // t12
+ vqsub.s16 d29, d29, d27 // t13
+
+ vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a
+ vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a
+ vqrshrn.s32 d21, q3, #12 // t9a
+ vqrshrn.s32 d27, q4, #12 // t14a
+
+ vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a
+ vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a
+ vqrshrn.s32 d29, q3, #12 // t13a
+ vneg.s32 q4, q4 // t10a needs the negated sum; negate in 32 bits before narrowing
+ vqrshrn.s32 d23, q4, #12 // t10a
+
+ vqsub.s16 d4, d17, d19 // t11a
+ vqadd.s16 d17, d17, d19 // t8a
+ vqsub.s16 d5, d31, d25 // t12a
+ vqadd.s16 d31, d31, d25 // t15a
+ vqadd.s16 d19, d21, d23 // t9
+ vqsub.s16 d21, d21, d23 // t10
+ vqsub.s16 d25, d27, d29 // t13
+ vqadd.s16 d27, d27, d29 // t14
+ // Same coefficient d0[0] on both operands: scaled sum and scaled difference
+ vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11
+ vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12
+ vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a
+
+ vqrshrn.s32 d6, q3, #12 // t11
+ vqrshrn.s32 d7, q4, #12 // t12
+ vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
+ vqrshrn.s32 d4, q2, #12 // t10a
+ vqrshrn.s32 d5, q4, #12 // t13a
+ // Final butterflies between the 8-point results (d16, d18, ..., d30) and the odd half; d8, d9 and d5 stage outputs whose destination register is still needed as an input
+ vqadd.s16 d8, d16, d31 // out0
+ vqsub.s16 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s16 d23, d30, d17 // out7
+ vqsub.s16 d9, d30, d17 // out8
+ vqadd.s16 d17, d18, d27 // out1
+ vqsub.s16 d30, d18, d27 // out14
+ vqadd.s16 d18, d20, d5 // out2
+ vqsub.s16 d29, d20, d5 // out13
+ vqadd.s16 d5, d28, d19 // out6
+ vqsub.s16 d25, d28, d19 // out9
+ vqadd.s16 d19, d22, d7 // out3
+ vqsub.s16 d28, d22, d7 // out12
+ vqadd.s16 d20, d24, d6 // out4
+ vqsub.s16 d27, d24, d6 // out11
+ vqadd.s16 d21, d26, d4 // out5
+ vqsub.s16 d26, d26, d4 // out10
+ vmov d24, d9 // out8 into place
+ vmov d22, d5 // out6 into place
+ // out0..out15 are now in d16..d31, in order
+ bx lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 // 16-point inverse ADST on 4-lane d registers: reads d16-d31, writes out0..out15 to \o0..\o15 (d16..d31 for adst, d31..d16 for flipadst); clobbers r12, q0-q4
+ movrel_local r12, iadst16_coeffs
+ vld1.16 {q0, q1}, [r12, :128] // 16 ADST coefficients in d0-d3
+ movrel_local r12, idct_coeffs // r12 is repointed now; the idct coefficients are loaded after stage 1
+ // Stage 1: rotate the input pairs (d31,d16) (d29,d18) (d27,d20) (d25,d22) (d23,d24) (d21,d26) (d19,d28) (d17,d30)
+ vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0
+ vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1
+ vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2
+ vqrshrn.s32 d16, q2, #12 // t0
+ vqrshrn.s32 d31, q3, #12 // t1
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3
+ vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4
+ vqrshrn.s32 d18, q4, #12 // t2
+ vqrshrn.s32 d29, q2, #12 // t3
+ vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5
+ vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6
+ vqrshrn.s32 d20, q3, #12 // t4
+ vqrshrn.s32 d27, q4, #12 // t5
+ vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7
+ vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8
+ vqrshrn.s32 d22, q2, #12 // t6
+ vqrshrn.s32 d25, q3, #12 // t7
+ vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9
+ vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10
+ vqrshrn.s32 d23, q4, #12 // t8
+ vqrshrn.s32 d24, q2, #12 // t9
+ vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11
+ vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12
+ vqrshrn.s32 d21, q3, #12 // t10
+ vqrshrn.s32 d26, q4, #12 // t11
+ vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13
+ vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14
+ vqrshrn.s32 d19, q2, #12 // t12
+ vqrshrn.s32 d28, q3, #12 // t13
+ vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15
+ vqrshrn.s32 d17, q4, #12 // t14
+ vqrshrn.s32 d30, q2, #12 // t15
+ // The ADST coefficients are consumed; q0 is reused for the first 8 idct coefficients and q1 (d2/d3) becomes scratch
+ vld1.16 {q0}, [r12, :128]
+ // Stage 2: butterflies between t(n) and t(n+8)
+ vqsub.s16 d2, d16, d23 // t8a
+ vqadd.s16 d16, d16, d23 // t0a
+ vqsub.s16 d3, d31, d24 // t9a
+ vqadd.s16 d31, d31, d24 // t1a
+ vqadd.s16 d23, d18, d21 // t2a
+ vqsub.s16 d18, d18, d21 // t10a
+ vqadd.s16 d24, d29, d26 // t3a
+ vqsub.s16 d29, d29, d26 // t11a
+ vqadd.s16 d21, d20, d19 // t4a
+ vqsub.s16 d20, d20, d19 // t12a
+ vqadd.s16 d26, d27, d28 // t5a
+ vqsub.s16 d27, d27, d28 // t13a
+ vqadd.s16 d19, d22, d17 // t6a
+ vqsub.s16 d22, d22, d17 // t14a
+ vqadd.s16 d28, d25, d30 // t7a
+ vqsub.s16 d25, d25, d30 // t15a
+ // Stage 3: rotate t8a..t15a with the coefficients in d1
+ vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8
+ vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9
+ vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10
+ vqrshrn.s32 d17, q2, #12 // t8
+ vqrshrn.s32 d30, q3, #12 // t9
+ vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11
+ vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12
+ vqrshrn.s32 d18, q4, #12 // t10
+ vqrshrn.s32 d29, q2, #12 // t11
+ vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13
+ vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14
+ vqrshrn.s32 d27, q3, #12 // t12
+ vqrshrn.s32 d20, q4, #12 // t13
+ vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15
+ vqrshrn.s32 d25, q2, #12 // t14
+ vqrshrn.s32 d22, q3, #12 // t15
+ // Stage 4: butterflies between t(n) and t(n+4)
+ vqsub.s16 d2, d16, d21 // t4
+ vqadd.s16 d16, d16, d21 // t0
+ vqsub.s16 d3, d31, d26 // t5
+ vqadd.s16 d31, d31, d26 // t1
+ vqadd.s16 d21, d23, d19 // t2
+ vqsub.s16 d23, d23, d19 // t6
+ vqadd.s16 d26, d24, d28 // t3
+ vqsub.s16 d24, d24, d28 // t7
+ vqadd.s16 d19, d17, d27 // t8a
+ vqsub.s16 d17, d17, d27 // t12a
+ vqadd.s16 d28, d30, d20 // t9a
+ vqsub.s16 d30, d30, d20 // t13a
+ vqadd.s16 d27, d18, d25 // t10a
+ vqsub.s16 d18, d18, d25 // t14a
+ vqadd.s16 d20, d29, d22 // t11a
+ vqsub.s16 d29, d29, d22 // t15a
+ // Stage 5: rotate t4..t7 and t12a..t15a with the d0[2]/d0[3] coefficient pair
+ vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a
+ vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a
+ vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a
+ vqrshrn.s32 d22, q2, #12 // t4a
+ vqrshrn.s32 d25, q3, #12 // t5a
+ vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a
+ vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12
+ vqrshrn.s32 d24, q4, #12 // t6a
+ vqrshrn.s32 d23, q2, #12 // t7a
+ vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14
+ vqrshrn.s32 d17, q3, #12 // t12
+ vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15
+ vqrshrn.s32 d29, q4, #12 // t13
+ vqrshrn.s32 d30, q2, #12 // t14
+ vqrshrn.s32 d18, q3, #12 // t15
+ // Stage 6: final butterflies; from here on results go straight to the caller-chosen output registers
+ vqsub.s16 d2, d16, d21 // t2a
+.ifc \o0, d16
+ vqadd.s16 \o0, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+.else
+ vqadd.s16 d4, d16, d21 // out0, staged in d4: with reversed outputs \o0 is d31, which the next two instructions still read
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+ vqneg.s16 \o15, \o15 // out15
+
+ vqsub.s16 d3, d29, d18 // t15a
+ vqadd.s16 \o13,d29, d18 // out13
+ vqadd.s16 \o2, d17, d30 // out2
+ vqsub.s16 d26, d17, d30 // t14a
+ vqneg.s16 \o13,\o13 // out13
+
+ vqadd.s16 \o1, d19, d27 // out1
+ vqsub.s16 d27, d19, d27 // t10
+ vqadd.s16 \o14,d28, d20 // out14
+ vqsub.s16 d20, d28, d20 // t11
+ vqneg.s16 \o1, \o1 // out1
+
+ vqadd.s16 \o3, d22, d24 // out3
+ vqsub.s16 d22, d22, d24 // t6
+ vqadd.s16 \o12,d25, d23 // out12
+ vqsub.s16 d23, d25, d23 // t7
+ vqneg.s16 \o3, \o3 // out3
+ // Stage 7: the middle outputs use d0[0] on both operands; the two register names in each note are the adst / flipadst destinations
+ vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vqrshrn.s32 d24, q12, #12 // out8
+ vqrshrn.s32 d4, q2, #12 // out7
+ vqrshrn.s32 d5, q3, #12 // out5
+ vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vqrshrn.s32 d26, q4, #12 // out10
+
+ vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vqrshrn.s32 \o4, q1, #12 // out4
+ vqrshrn.s32 d7, q3, #12 // out9
+ vqrshrn.s32 d6, q4, #12 // out11
+ vqrshrn.s32 \o6, q11, #12 // out6
+
+.ifc \o8, d23
+ vmov \o8, d24 // reversed outputs only: out8/out10 were narrowed into d24/d26 and must be moved to \o8/\o10
+ vmov \o10,d26
+.endif
+ // the remaining odd outputs are negated while moving from scratch d4-d7 to their destination
+ vqneg.s16 \o7, d4 // out7
+ vqneg.s16 \o5, d5 // out5
+ vqneg.s16 \o11,d6 // out11
+ vqneg.s16 \o9, d7 // out9
+.endm
+
+function inv_adst_4h_x16_neon, export=1 // 16-point inverse ADST on 4-lane vectors, in place on d16-d31
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr // outputs are in natural order: out0 in d16 ... out15 in d31
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1 // 16-point inverse flipped ADST on 4-lane vectors: same macro as the adst, with the output registers passed in reverse
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr // outputs are in reversed order: out0 in d31 ... out15 in d16
+endfunc
+
+function inv_identity_4h_x16_neon, export=1 // 16-point identity transform, in place on q8-q15 (d16-d31): each lane x becomes about x * 2*5793/4096; clobbers r12, d0, q1
+ movw r12, #2*(5793-4096)*8 // vqrdmulh multiplier: x * this / 32768 = x * 2*(5793-4096)/4096
+ vdup.16 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q1, \i, d0[0] // q1 = fractional part: x * 2*(5793-4096)/4096, rounded
+ vqadd.s16 \i, \i, \i // 2*x, saturating
+ vqadd.s16 \i, \i, q1 // 2*x + fractional part, saturating
+.endr
+ bx lr
+endfunc
+
+.macro identity_4x16_shift2 c // q8-q15 in place: x -> (x + (vqrdmulh(x, c) >> 1) + 1) >> 1; clobbers q2. With c = 2*(5793-4096)*8 (as passed by def_horz_16) this is the 16-point identity scaling with a 2-bit downshift folded in
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vshr.s16 q2, q2, #1 // truncating halving of the fractional part
+ vrhadd.s16 \i, \i, q2 // rounding halving add: (x + q2 + 1) >> 1
+.endr
+.endm
+
+.macro identity_4x16_shift1 c // q8-q15 in place: x -> x + round(vqrdmulh(x, c) / 2), saturating; clobbers q2. With c = 2*(5793-4096)*8 this is the 16-point identity scaling with a 1-bit downshift folded in
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vrshr.s16 q2, q2, #1 // rounding halving of the fractional part
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x8_shift1 c // alias: the same eight q registers are processed, so the 4x16 variant is reused as is
+ identity_4x16_shift1 \c
+.endm
+
+.macro identity_8x8 c // q8-q15 in place: x -> 2*x + vqrdmulh(x, c), saturating; clobbers q2 (same arithmetic as inv_identity_4h_x16_neon, with the multiplier supplied by the caller)
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vqadd.s16 \i, \i, \i // 2*x
+ vqadd.s16 \i, \i, q2 // 2*x + fractional part
+.endr
+.endm
+
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix // emits inv_txfm_horz<suffix>_16x4_neon: first (horizontal) pass over a 16x4 group of coefficients; scale/identity select optional steps, shift is the downshift after the transform
+function inv_txfm_horz\suffix\()_16x4_neon // in: r7 = coefficients, r8 = byte stride between loads, r6 = output pointer (advanced), r4 = 1-D transform (unused by identity variants)
+ push {lr} // lr is needed because the non-identity variants call through r4
+ vmov.i16 d7, #0 // d7 = zeros used to clear each coefficient chunk after loading
+.if \identity
+ movw r12, #2*(5793-4096)*8 // multiplier for the identity_4x16_shift* macros
+ vdup.16 d0, r12
+.endif
+.if \scale
+ movw r12, #2896*8 // constant handed to scale_input; 2896/4096 is about 1/sqrt(2)
+ vdup.16 d1, r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64] // load 4 coefficients ...
+ vst1.16 {d7}, [r7, :64], r8 // ... zero them in memory and step r7 by the stride
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 // helper defined elsewhere; presumably multiplies all inputs by d1[0] (confirm)
+.endif
+.if \identity
+.if \shift == -2
+ identity_4x16_shift2 d0[0] // identity scaling with the 2-bit downshift folded in
+.else
+ identity_4x16_shift1 d0[0] // identity scaling with the 1-bit downshift folded in
+.endif
+.else
+ blx r4 // 16-point transform on d16-d31
+.endif
+.if \shift > 0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #\shift // rounding downshift; skipped for the identity variants, which pass a negative shift
+.endr
+.endif
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+ // Store 4 groups of 16 values contiguously at r6: group n is gathered from d(16+n), d(20+n), d(24+n), d(28+n)
+.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
+ vst1.16 {\i}, [r6, :64]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+ // Four variants: plain, scaled, identity, and scaled identity (the negative shift values only select which fused identity macro is used)
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+
+function inv_txfm_add_vert_4x16_neon // second (vertical) pass for a 4-wide, 16-tall column; in: r7 = intermediate buffer, r8 = byte stride, r5 = 1-D transform, r6 = destination for load_add_store
+ push {lr} // lr is needed because of the call through r5
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8 // 16 loads of 4 values each, stepping by the stride
+.endr
+ blx r5 // 16-point transform on d16-d31
+ load_add_store_4x16 r6, r7 // helper defined elsewhere; presumably adds the result to the 4x16 pixels at r6 and stores them (confirm)
+ pop {pc}
+endfunc
+
+function inv_txfm_add_16x16_neon // shared 16x16 body, entered by a branch from the def_fn_16x16 stubs; in: r0 = destination, r2 = coefficients, r3 = value compared against the thresholds (presumably eob), r4/r5 = 1-D transforms, r9 = horizontal pass function, r10 = threshold table
+ sub_sp_align 512 // helper defined elsewhere; reserves 512 bytes (16*16 int16) of scratch on the stack, presumably aligned
+ ldrh r11, [r10], #2 // r11 = first threshold
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*16*2) // output position of this 16x4 group in the scratch buffer
+.if \i > 0
+ mov r8, #(16 - \i) // number of 16-value lines still to fill, used only if we bail out to 1:
+ cmp r3, r11
+ blt 1f // r3 below the threshold: skip the remaining groups and zero-fill them
+.if \i < 12
+ ldrh r11, [r10], #2 // next threshold
+.endif
+.endif
+ add r7, r2, #(\i*2) // input position of this group
+ mov r8, #16*2 // input stride in bytes
+ blx r9 // horizontal pass; writes 128 bytes at r6
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4 // each iteration below clears 4 lines (4 * 32 bytes)
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i) // destination column offset i (bytes)
+ add r7, sp, #(\i*2) // matching column in the scratch buffer (int16 elements)
+ mov r8, #32 // scratch stride in bytes
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512 // release the scratch buffer
+ vpop {q4} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r11,pc}
+endfunc
+
+const eob_16x16 // thresholds read through r10 by inv_txfm_add_16x16_neon: r3 is compared against entries 0..2 before the 2nd, 3rd and 4th group of four lines; that function never reads the last entry
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity // same layout; selected by def_fn_16x16 when exactly one of the two transforms is identity
+ .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2 // emits the exported 16x16 8bpc entry point for first-pass txfm1 and second-pass txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2 // dct_dct only: helper macro defined elsewhere, presumably a DC-only fast path that returns early (confirm)
+.endif
+ push {r4-r11,lr} // restored by the pop {..., pc} at the end of inv_txfm_add_16x16_neon
+ vpush {q4}
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_identity_16x4_neon // identity first pass is done inline by the horz function; r4 is not needed
+.else
+ movrel_local r9, inv_txfm_horz_16x4_neon // r9 = horizontal pass driver
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon // r4 = first-pass transform called by the driver
+.endif
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon // r5 = second-pass transform
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16 // identity in both passes: regular thresholds
+.else
+ movrel_local r10, eob_16x16_identity // identity in exactly one pass: identity thresholds
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon // tail-branch into the shared body, which returns for us
+endfunc
+.endm
+ // Only these 12 combinations are emitted at 16x16; adst/flipadst paired with identity are not instantiated here
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+.macro def_fn_416_base variant // emits the shared 16x4 and 4x16 bodies, either generic (variant empty, first pass via r4) or with an inline identity first pass (variant = identity_)
+function inv_txfm_\variant\()add_16x4_neon // entered by a branch from the def_fn_416 stubs; uses r0 (destination), r2 (coefficients), r4/r5 (first/second pass)
+
+.ifc \variant, identity_
+ vmov.i16 d4, #0 // d4 = zeros used to clear the coefficient buffer
+.irp i, d16, d18, d20, d22
+ vld1.16 {\i}, [r2, :64] // identity path: load 4 values at a time straight into the interleaved register order, so no vswp shuffle is needed afterwards
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d17, d19, d21, d23
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+ movw r12, #2*(5793-4096)*8 // multiplier for identity_4x16_shift1
+ vdup.16 d0, r12
+.irp i, d24, d26, d28, d30
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d25, d27, d29, d31
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+
+ identity_4x16_shift1 d0[0] // 16-point identity with the 1-bit downshift folded in, on q8-q15
+.else
+ vmov.i16 q2, #0 // q2/q3 = zeros used to clear the coefficient buffer
+ vmov.i16 q3, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128] // load all 64 coefficients into d16-d31 in order, zeroing them in memory
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+
+ blx r4 // first pass: 16-point transform on d16-d31
+
+ vswp d17, d20 // interleave the first eight outputs (d16-d23) into the order expected by transpose_4x8h
+ vswp d19, d22
+ vswp d18, d20
+ vswp d19, d21
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1 // rounding downshift by 1 between the passes (first half)
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5 // second pass on the left 8 columns
+ mov r6, r0
+ load_add_store_8x4 r6, r7 // helper defined elsewhere; presumably adds to the 8x4 pixels at r6 and stores them (confirm)
+
+.ifc \variant, identity_
+ vmov q8, q12 // second half is already scaled and interleaved; just move it into q8-q11
+ vmov q9, q13
+ vmov q10, q14
+ vmov q11, q15
+.else
+ vswp d25, d28 // same interleave as above for the second eight outputs (d24-d31)
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+ vrshr.s16 q8, q12, #1 // downshift while moving into q8-q11
+ vrshr.s16 q9, q13, #1
+ vrshr.s16 q10, q14, #1
+ vrshr.s16 q11, q15, #1
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5 // second pass on the right 8 columns
+ add r6, r0, #8 // destination of the right half: 8 bytes further
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon // entered by a branch from the def_fn_416 stubs; uses r0, r2, r3 (compared with r10), r4/r5 (first/second pass), r10 (threshold set by the stub)
+ vmov.i16 q2, #0 // q2 = zeros used to clear the coefficient buffer
+
+ mov r11, #32 // byte stride between consecutive 8-value loads of one half
+ cmp r3, r10
+ blt 1f // r3 below the threshold: the second half of the coefficients is treated as all zero
+
+ add r6, r2, #16 // second half starts 16 bytes (8 coefficients) in
+.ifc \variant, identity_
+.irp i, q12, q13, q14, q15
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q12, q13, q14, q15, d0[0] // helper defined elsewhere; presumably the 4-point identity with the downshift folded in (confirm)
+.else
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ blx r4 // first pass on the second half (the transform works on q8-q11)
+ vrshr.s16 q12, q8, #1 // downshift while moving the result out of the way into q12-q15
+ vrshr.s16 q13, q9, #1
+ vrshr.s16 q14, q10, #1
+ vrshr.s16 q15, q11, #1
+.endif
+ transpose_4x8h q12, q13, q14, q15
+ vswp d27, d29 // reorder d24-d31 into the layout expected by the second pass
+ vswp d26, d28
+ vswp d27, d30
+ vswp d25, d28
+
+ b 2f
+1:
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0 // skipped half: feed zeros to the second pass
+.endr
+2:
+ vmov.i16 q2, #0 // q2 may have been clobbered above (presumably by the identity macro or the call through r4); re-zero it
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128] // first half of the coefficients
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+.ifc \variant, identity_
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+.else
+ blx r4 // first pass on the first half
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1 // rounding downshift by 1 between the passes
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ vswp d19, d21 // reorder d16-d23 into the layout expected by the second pass
+ vswp d18, d20
+ vswp d19, d22
+ vswp d17, d20
+
+ blx r5 // second pass: 16-point transform on d16-d31
+
+ load_add_store_4x16 r0, r6 // helper defined elsewhere; presumably adds to the 4x16 pixels at r0 and stores them (confirm)
+
+ vpop {q4-q7} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r11,pc}
+endfunc
+.endm
+ // Instantiate the generic bodies and the identity-first-pass bodies
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half // emits the exported w x h (4x16 or 16x4) 8bpc entry point; eob_half is the threshold handed to the 4x16 body in r10
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1 // dct_dct only: helper macro defined elsewhere, presumably a DC-only fast path that returns early (confirm)
+.endif
+ push {r4-r11,lr} // restored by the pop {..., pc} at the end of the shared body
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon // 4x16: first pass is a 4-point transform on 8-lane vectors (one half at a time)
+ movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon // second pass is a 16-point transform on 4-lane vectors
+ mov r10, #\eob_half // threshold below which the 4x16 body skips the second half
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon // 16x4: first pass is a 16-point transform on 4-lane vectors
+ movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon // second pass is a 4-point transform on 8-lane vectors; r10 is not used by the 16x4 body
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon // identity first pass is done inline by this body
+.else
+ b inv_txfm_add_\w\()x\h\()_neon // tail-branch into the shared body, which returns for us
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h // instantiates def_fn_416 for all 16 transform pairs; the last argument is the eob_half threshold (29 by default, 8 when the second transform is identity, 32 when only the first is)
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+ // Emit the entry points for both sizes
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant // emits the shared 16x8 and 8x16 bodies, either generic (variant empty) or for an identity first pass (variant = identity_)
+function inv_txfm_\variant\()add_16x8_neon // entered by a branch from the def_fn_816 stubs; uses r0, r2, r3 (compared with r10), r5 (second pass), r9 (horizontal pass driver), r10 (threshold)
+ sub_sp_align 256 // helper defined elsewhere; reserves 256 bytes (16*8 int16) of scratch on the stack, presumably aligned
+
+.irp i, 0, 4
+ add r6, sp, #(\i*16*2) // output position of this 16x4 group in the scratch buffer
+.if \i > 0
+ cmp r3, r10
+ blt 1f // r3 below the threshold: skip the second group and zero-fill it
+.endif
+ add r7, r2, #(\i*2) // input position of this group
+ mov r8, #8*2 // input stride in bytes
+ blx r9 // horizontal pass; writes 128 bytes at r6
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]! // clear the 128 bytes the skipped group would have written
+.endr
+2:
+
+.irp i, 0, 8
+ add r7, sp, #(\i*2) // left (i = 0) or right (i = 8) half of the scratch buffer
+ mov r8, #32 // scratch stride in bytes
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8 // eight loads of 8 values, one per scratch line
+.endr
+ blx r5 // second pass: 8-point transform on q8-q15
+
+ add r6, r0, #(\i) // destination offset i (bytes)
+ load_add_store_8x8 r6, r7 // helper defined elsewhere; presumably adds to the 8x8 pixels at r6 and stores them (confirm)
+.endr
+
+ add_sp_align 256 // release the scratch buffer
+ vpop {q4-q7} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon // entered by a branch from the def_fn_816 stubs; uses r0, r2, r3 (compared with r10), r4/r5 (first/second pass), r10 (threshold)
+ sub_sp_align 256 // 256 bytes (8*16 int16) of scratch on the stack
+
+.irp i, 0, 8
+ add r6, sp, #(\i*8*2) // output position of this 8x8 group in the scratch buffer
+.if \i > 0
+ cmp r3, r10
+ blt 1f // r3 below the threshold: skip the second group and zero-fill it
+.endif
+ add r7, r2, #(\i*2) // input position of this group
+ mov r8, #16*2 // input stride in bytes
+
+ vmov.i16 q2, #0 // q2 = zeros used to clear the coefficient buffer
+ movw r12, #2896*8 // constant handed to scale_input; 2896/4096 is about 1/sqrt(2)
+ vdup.16 d0, r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128] // load 8 coefficients ...
+ vst1.16 {q2}, [r7, :128], r8 // ... zero them in memory and step by the stride
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 // helper defined elsewhere; presumably multiplies all inputs by d0[0] (confirm)
+.ifc \variant, identity_
+ // The identity shl #1 and downshift vrshr #1 cancel out
+.else
+ blx r4 // first pass: 8-point transform on q8-q15
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \j, \j, #1 // rounding downshift by 1 between the passes
+.endr
+.endif
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+ vst1.16 {q8, q9}, [r6, :128]! // write the transposed 8x8 group (128 bytes) to the scratch buffer
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]! // clear the 128 bytes the skipped group would have written
+.endr
+2:
+
+.irp i, 0, 4
+ add r6, r0, #(\i) // destination column offset i (bytes)
+ add r7, sp, #(\i*2) // matching column in the scratch buffer (int16 elements)
+ mov r8, #16 // scratch stride in bytes
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256 // release the scratch buffer
+ vpop {q4-q7} // undo the stub's vpush/push and return to the caller of the stub
+ pop {r4-r11,pc}
+endfunc
+.endm
+ // Instantiate the generic bodies and the identity-first-pass bodies
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 // emits the exported w x h (8x16 or 16x8) 8bpc entry point; eob_8x8 is the r10 threshold for 8x16, eob_4x4 the one for 16x8
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1 // dct_dct only: helper macro defined elsewhere, presumably a DC-only fast path that returns early (confirm)
+.endif
+ push {r4-r11,lr} // restored by the pop {..., pc} at the end of the shared body
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon // 8x16: first pass is an 8-point transform on 8-lane vectors
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon // second pass is a 16-point transform on 4-lane vectors
+.else
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon // 16x8 with identity first pass: the driver does everything inline, r4 is not needed
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon // 16x8: first pass is a 16-point transform called by the driver in r9
+ movrel_local r9, inv_txfm_horz_scale_16x4_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon // second pass is an 8-point transform on 8-lane vectors
+.endif
+.if \w == 8
+ mov r10, #\eob_8x8 // threshold for skipping the second 8x8 group
+.else
+ mov r10, #\eob_4x4 // threshold for skipping the second 16x4 group
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon // tail-branch into the shared body, which returns for us
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h // instantiates def_fn_816 for all 16 transform pairs; the last two arguments are the eob_8x8 and eob_4x4 thresholds
+def_fn_816 \w, \h, dct, dct, 43, 10
+def_fn_816 \w, \h, identity, identity, 43, 10
+def_fn_816 \w, \h, dct, adst, 43, 10
+def_fn_816 \w, \h, dct, flipadst, 43, 10
+def_fn_816 \w, \h, dct, identity, 8, 4
+def_fn_816 \w, \h, adst, dct, 43, 10
+def_fn_816 \w, \h, adst, adst, 43, 10
+def_fn_816 \w, \h, adst, flipadst, 43, 10
+def_fn_816 \w, \h, flipadst, dct, 43, 10
+def_fn_816 \w, \h, flipadst, adst, 43, 10
+def_fn_816 \w, \h, flipadst, flipadst, 43, 10
+def_fn_816 \w, \h, identity, dct, 64, 4
+def_fn_816 \w, \h, adst, identity, 8, 4
+def_fn_816 \w, \h, flipadst, identity, 8, 4
+def_fn_816 \w, \h, identity, adst, 64, 4
+def_fn_816 \w, \h, identity, flipadst, 64, 4
+.endm
+ // Emit the entry points for both sizes
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+function inv_dct32_odd_4h_x16_neon, export=1 // odd half of a 32-point inverse DCT on 4-lane vectors: reads 16 inputs from d16-d31 and leaves t16..t31 (see the notes at the end) in d16..d31; clobbers r12, q0-q4
+ movrel_local r12, idct_coeffs, 2*16 // r12 = idct_coeffs + 32 bytes
+ vld1.16 {q0, q1}, [r12, :128] // 16 coefficients stored 32 bytes into the table
+ sub r12, r12, #2*16 // step r12 back to the start of idct_coeffs for the reload of q0 below
+ // Stage 1: rotate the input pairs (d16,d31) (d24,d23) (d20,d27) (d28,d19) (d18,d29) (d26,d21) (d22,d25) (d30,d17)
+ vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a
+ vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a
+ vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a
+ vqrshrn.s32 d16, q2, #12 // t16a
+ vqrshrn.s32 d31, q3, #12 // t31a
+ vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a
+ vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a
+ vqrshrn.s32 d24, q4, #12 // t17a
+ vqrshrn.s32 d23, q2, #12 // t30a
+ vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a
+ vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a
+ vqrshrn.s32 d20, q3, #12 // t18a
+ vqrshrn.s32 d27, q4, #12 // t29a
+ vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a
+ vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a
+ vqrshrn.s32 d28, q2, #12 // t19a
+ vqrshrn.s32 d19, q3, #12 // t28a
+ vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a
+ vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a
+ vqrshrn.s32 d18, q4, #12 // t20a
+ vqrshrn.s32 d29, q2, #12 // t27a
+ vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a
+ vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a
+ vqrshrn.s32 d26, q3, #12 // t21a
+ vqrshrn.s32 d21, q4, #12 // t26a
+ vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a
+ vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a
+ vqrshrn.s32 d22, q2, #12 // t22a
+ vqrshrn.s32 d25, q3, #12 // t25a
+ vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a
+ vqrshrn.s32 d30, q4, #12 // t23a
+ vqrshrn.s32 d17, q2, #12 // t24a
+ // Stage 1 coefficients are consumed; reload q0 with the first 8 idct coefficients, q1 (d2/d3) becomes scratch
+ vld1.16 {q0}, [r12, :128]
+ // Stage 2: saturating butterflies
+ vqsub.s16 d2, d16, d24 // t17
+ vqadd.s16 d16, d16, d24 // t16
+ vqsub.s16 d3, d31, d23 // t30
+ vqadd.s16 d31, d31, d23 // t31
+ vqsub.s16 d24, d28, d20 // t18
+ vqadd.s16 d28, d28, d20 // t19
+ vqadd.s16 d23, d18, d26 // t20
+ vqsub.s16 d18, d18, d26 // t21
+ vqsub.s16 d20, d30, d22 // t22
+ vqadd.s16 d30, d30, d22 // t23
+ vqadd.s16 d26, d17, d25 // t24
+ vqsub.s16 d17, d17, d25 // t25
+ vqsub.s16 d22, d29, d21 // t26
+ vqadd.s16 d29, d29, d21 // t27
+ vqadd.s16 d25, d19, d27 // t28
+ vqsub.s16 d19, d19, d27 // t29
+ // Stage 3: rotations with the coefficients in d1; results that need a negated sum are negated in 32 bits before narrowing
+ vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a
+ vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a
+ vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a
+ vqrshrn.s32 d21, q2, #12 // t17a
+ vqrshrn.s32 d27, q3, #12 // t30a
+ vneg.s32 q4, q4 // -> t18a
+ vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a
+ vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a
+ vqrshrn.s32 d19, q4, #12 // t18a
+ vqrshrn.s32 d24, q1, #12 // t29a
+ vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a
+ vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a
+ vqrshrn.s32 d22, q2, #12 // t21a
+ vqrshrn.s32 d18, q3, #12 // t26a
+ vneg.s32 q4, q4 // -> t22a
+ vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a
+ vqrshrn.s32 d17, q4, #12 // t22a
+ vqrshrn.s32 d20, q1, #12 // t25a
+ // Stage 4: saturating butterflies
+ vqsub.s16 d2, d27, d24 // t29
+ vqadd.s16 d27, d27, d24 // t30
+ vqsub.s16 d3, d21, d19 // t18
+ vqadd.s16 d21, d21, d19 // t17
+ vqsub.s16 d24, d16, d28 // t19a
+ vqadd.s16 d16, d16, d28 // t16a
+ vqsub.s16 d19, d30, d23 // t20a
+ vqadd.s16 d30, d30, d23 // t23a
+ vqsub.s16 d28, d17, d22 // t21
+ vqadd.s16 d17, d17, d22 // t22
+ vqadd.s16 d23, d26, d29 // t24a
+ vqsub.s16 d26, d26, d29 // t27a
+ vqadd.s16 d22, d20, d18 // t25
+ vqsub.s16 d20, d20, d18 // t26
+ vqsub.s16 d29, d31, d25 // t28a
+ vqadd.s16 d31, d31, d25 // t31a
+ // Stage 5: rotations with the d0[2]/d0[3] coefficient pair
+ vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a
+ vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a
+ vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19
+ vqrshrn.s32 d18, q2, #12 // t18a
+ vqrshrn.s32 d25, q3, #12 // t29a
+ vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28
+ vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20
+ vqrshrn.s32 d29, q4, #12 // t19
+ vqrshrn.s32 d24, q1, #12 // t28
+ vneg.s32 q2, q2 // -> t20
+ vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27
+ vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a
+ vqrshrn.s32 d26, q2, #12 // t20
+ vqrshrn.s32 d19, q3, #12 // t27
+ vneg.s32 q4, q4 // -> t21a
+ vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a
+ vqrshrn.s32 d20, q4, #12 // t21a
+ vqrshrn.s32 d28, q1, #12 // t26a
+ // Stage 6: butterflies; the outer eight values (16-19, 28-31) are final here and land in d16-d19 / d28-d31
+ vqsub.s16 d2, d16, d30 // t23
+ vqadd.s16 d16, d16, d30 // t16 = out16
+ vqsub.s16 d3, d31, d23 // t24
+ vqadd.s16 d31, d31, d23 // t31 = out31
+ vqsub.s16 d23, d21, d17 // t22a
+ vqadd.s16 d17, d21, d17 // t17a = out17
+ vqadd.s16 d30, d27, d22 // t30a = out30
+ vqsub.s16 d21, d27, d22 // t25a
+ vqsub.s16 d27, d18, d20 // t21
+ vqadd.s16 d18, d18, d20 // t18 = out18
+ vqadd.s16 d4, d29, d26 // t19a = out19
+ vqsub.s16 d26, d29, d26 // t20a
+ vqadd.s16 d29, d25, d28 // t29 = out29
+ vqsub.s16 d25, d25, d28 // t26
+ vqadd.s16 d28, d24, d19 // t28a = out28
+ vqsub.s16 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+ // Stage 7: the inner eight values use d0[0] on both operands (scaled sum and scaled difference)
+ vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20
+ vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27
+ vqrshrn.s32 d20, q2, #12 // t20
+ vqrshrn.s32 d22, q3, #12 // t27
+
+ vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a
+ vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vqrshrn.s32 d26, q2, #12 // t26a
+ // q12 is d24/d25: both were consumed above, so it can serve as a 32-bit accumulator here
+ vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22
+ vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25
+ vqrshrn.s32 d21, q3, #12 // t21a
+ vqrshrn.s32 d22, q12, #12 // t22
+ vqrshrn.s32 d25, q2, #12 // t25
+
+ vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a
+ vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a
+ vqrshrn.s32 d23, q2, #12 // t23a
+ vqrshrn.s32 d24, q3, #12 // t24a
+ // Result: d16..d31 hold values 16..31 of the odd half, in order
+ bx lr
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix // emits inv_txfm_horz<suffix>_dct_32x4_neon; scale=1 pre-scales the input, shift is the final rounding right shift
+function inv_txfm_horz\suffix\()_dct_32x4_neon // 32-point DCT over 4 lines: in r7 = coefficients, r8 = stride in bytes, r6 = output (64 bytes per line)
+ push {lr}
+ vmov.i16 d7, #0 // zero vector written back over every coefficient that is read
+ lsl r8, r8, #1 // double the stride: the even-half DCT consumes every second entry
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d0, r12 // scale constant for scale_input (macro defined outside this chunk)
+.endif
+// Load the 16 even-indexed inputs (4 coefficients each) and clear them in memory.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4 // rewind the 16 doubled strides just walked
+ add r7, r7, r8, lsr #1 // advance by one original stride, i.e. to the first odd entry
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_4h_x16_neon // even half of the 32-point transform (defined outside this chunk)
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+// store1 writes 4 x 8 bytes into the first half of a 64-byte output line, then skips the second half.
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+ add r6, r6, #32 // skip the 32 bytes that store2 fills in later
+.endm
+ store1 d16, d20, d24, d28
+ store1 d17, d21, d25, d29
+ store1 d18, d22, d26, d30
+ store1 d19, d23, d27, d31
+.purgem store1
+ sub r6, r6, #64*4 // back to the start of the four output lines
+
+ vmov.i16 d7, #0 // set up the zero vector again for the second load/clear pass
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_4h_x16_neon // odd half of the 32-point transform
+ transpose_4x4h q15, q14, d31, d30, d29, d28 // transposed in reversed register order
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+ transpose_4x4h q11, q10, d23, d22, d21, d20
+ transpose_4x4h q9, q8, d19, d18, d17, d16
+.macro store2 r0, r1, r2, r3, shift
+ vld1.16 {q0, q1}, [r6, :128] // reload the 16 even-half results of this line
+ vqsub.s16 d7, d0, \r0 // differences are placed into d7..d4, i.e. in reversed register order
+ vqadd.s16 d0, d0, \r0 // sums stay in place (d0..d3)
+ vqsub.s16 d6, d1, \r1
+ vqadd.s16 d1, d1, \r1
+ vqsub.s16 d5, d2, \r2
+ vqadd.s16 d2, d2, \r2
+ vqsub.s16 d4, d3, \r3
+ vqadd.s16 d3, d3, \r3
+ vrev64.16 q2, q2 // reverse the 4 lanes inside each d register to finish the mirroring
+ vrev64.16 q3, q3
+ vrshr.s16 q0, q0, #\shift // rounding right shift of all 32 outputs of the line
+ vrshr.s16 q1, q1, #\shift
+ vrshr.s16 q2, q2, #\shift
+ vrshr.s16 q3, q3, #\shift
+ vst1.16 {q0, q1}, [r6, :128]! // first 16 outputs (sums)
+ vst1.16 {q2, q3}, [r6, :128]! // last 16 outputs (mirrored differences)
+.endm
+
+ store2 d31, d27, d23, d19, \shift
+ store2 d30, d26, d22, d18, \shift
+ store2 d29, d25, d21, d17, \shift
+ store2 d28, d24, d20, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_4x32_neon // vertical 32-point DCT on a 4-wide strip, added to dst: r6 = dst, r1 = dst stride, r7 = input, r8 = input stride in bytes
+ push {r10-r11,lr}
+ lsl r8, r8, #1 // doubled stride: step over every second input row
+// Even rows feed the 16-point DCT.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4 // rewind to the first even row
+
+ bl inv_dct_4h_x16_neon
+// Park the even-half results in the rows they were loaded from.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1 // one original row further: first odd row
+// Odd rows feed the odd half of the 32-point DCT.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1 // r7 is back at the first even row (parked even-half results)
+ bl inv_dct32_odd_4h_x16_neon
+
+ neg r9, r8 // negative stride for walking the even-half results backwards
+ mov r10, r6 // r10 reads dst pixels ahead of r6, which writes them
+.macro combine r0, r1, r2, r3, op, stride // note: an unescaped r1 in the body is the CPU register (dst stride), not the macro argument
+ vld1.16 {d4}, [r7, :64], \stride // even-half result row
+ vld1.32 {d2[0]}, [r10, :32], r1 // 4 dst pixels of one row
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.32 {d2[1]}, [r10, :32], r1
+ \op\().s16 d4, d4, \r0 // even +/- odd
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.32 {d3[0]}, [r10, :32], r1
+ \op\().s16 d5, d5, \r1
+ vld1.32 {d3[1]}, [r10, :32], r1
+ vrshr.s16 q2, q2, #4 // final rounding shift
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vaddw.u8 q2, q2, d2 // add the widened 8-bit pixels
+ \op\().s16 d7, d7, \r3
+ vqmovun.s16 d2, q2 // saturate back to unsigned 8 bit
+ vrshr.s16 q3, q3, #4
+ vst1.32 {d2[0]}, [r6, :32], r1
+ vaddw.u8 q3, q3, d3
+ vst1.32 {d2[1]}, [r6, :32], r1
+ vqmovun.s16 d3, q3
+ vst1.32 {d3[0]}, [r6, :32], r1
+ vst1.32 {d3[1]}, [r6, :32], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8 // first 16 output rows: even results walked forwards, odd results taken from d31 down to d16
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8 // step back onto the last even-half result
+ combine d16, d17, d18, d19, vqsub, r9 // last 16 output rows: even results walked backwards, odd results subtracted
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
+
+const eob_32x32 // ascending thresholds compared against eob (r3) to decide how much of the block still needs transforming
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32 // thresholds for the 16x32 / 32x16 transforms (also read by the 64x16 and 16x64 functions)
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside // thresholds selected through the wshort/hshort arguments of def_identity_1632
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ // Contrary to the others, this one is only ever used in increments of 8x8
+ .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 // identity/identity 32x32 in 8x8 tiles: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ push {r4-r7,lr}
+ vmov.i16 q0, #0 // zeros written over each coefficient row after it is loaded
+ movrel_local r5, eob_32x32, 2 // outer-loop threshold cursor (third argument is presumably a byte offset into the table - confirm in asm.S)
+
+ mov r6, #2*32 // coefficient stride in bytes
+1:
+ mov r12, #0 // r12 accumulates the inner-loop advance (8 per tile)
+ movrel_local r4, eob_32x32, 2 // inner-loop threshold cursor, restarted for every outer pass
+2:
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r6
+.endr
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ load_add_store_8x8 r0, r7, shiftbits=2 // macro defined outside this chunk; presumably shifts q8-q15, adds them to the 8x8 dst tile and leaves r0 8 rows further down
+ ldrh lr, [r4], #4 // next threshold, stepping over every second table entry
+ sub r0, r0, r1, lsl #3 // dst back up by 8 rows
+ cmp r3, lr
+ add r0, r0, #8 // dst 8 pixels to the right
+ bge 2b // eob >= threshold: process the next tile
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f // eob below the outer threshold: nothing left to do
+
+ sub r0, r0, r12 // dst back to the left edge
+ add r0, r0, r1, lsl #3 // and down by 8 rows
+ mls r2, r6, r12, r2 // r2 -= r6 * r12: rewind the coefficient rows consumed by the inner loop
+ add r2, r2, #2*8 // then move 8 coefficients (16 bytes) along
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift // apply the shift instruction op with the immediate shift, in place, to each of q8-q15
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort // emits the identity/identity w x h function; wshort/hshort select the eob table suffix for the inner/outer loop
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 // r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ push {r4-r7,lr}
+ movw r6, #2896*8
+ movw r7, #2*(5793-4096)*8
+ vdup.i16 d0, r6 // d0[0] = 2896*8, consumed by scale_input below
+ movrel_local r5, eob_16x32\hshort, 2 // outer-loop threshold cursor
+ vmov.16 d0[1], r7 // d0[1] = 2*(5793-4096)*8, passed to the identity macros below
+
+ mov r6, #2*\h // coefficient stride in bytes (r6 is reused once d0 has been set up)
+1:
+ mov r12, #0 // r12 accumulates the inner-loop advance (8 per tile)
+ movrel_local r4, eob_16x32\wshort, 2 // inner-loop threshold cursor
+2:
+ vmov.i16 q1, #0 // zero vector for clearing; set up again on every iteration
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q1}, [r2, :128], r6
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s16, 1
+ identity_8x8 d0[1]
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+.if \w == 16
+ load_add_store_8x8 r0, r7, shiftbits=2
+.else
+ load_add_store_8x8 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #4 // next inner threshold (every second table entry)
+ sub r0, r0, r1, lsl #3 // dst back up by 8 rows
+ cmp r3, lr
+ add r0, r0, #8 // dst 8 pixels to the right
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f // eob below the outer threshold: done
+
+ sub r0, r0, r12 // dst back to the left edge
+ add r0, r0, r1, lsl #3 // and down by 8 rows
+ mls r2, r6, r12, r2 // r2 -= r6 * r12: rewind the coefficient rows consumed by the inner loop
+ add r2, r2, #2*8 // then move 8 coefficients along
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h // emits the identity/identity 8x32 or 32x8 function
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 // r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ push {r4-r5,lr}
+ vmov.i16 q0, #0 // zeros written over each coefficient row after it is loaded
+ movrel_local r4, eob_8x32
+
+ mov r12, #2*\h // coefficient stride in bytes
+1:
+ ldrh lr, [r4], #2 // threshold for continuing after this 8x8 tile
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r12
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs vrshr.s16, 1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ cmp r3, lr // flags are consumed by the blt after the macro below, which therefore must not modify them
+.if \w == 8
+ load_add_store_8x8 r0, r5, shiftbits=2
+.else
+ load_add_store_8x8 r0, r5, shiftbits=3
+.endif
+
+ blt 9f // eob < threshold: this was the last tile
+.if \w == 8
+ sub r2, r2, r12, lsl #3 // 8x32: rewind the 8 coefficient rows just read
+ add r2, r2, #2*8 // and move 8 coefficients along; dst is left where the macro put it
+.else
+ sub r0, r0, r1, lsl #3 // 32x8: dst back up by 8 rows
+ add r0, r0, #8 // and 8 pixels to the right; r2 simply continues
+.endif
+ b 1b
+
+9:
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 // dct/dct 32x32: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 32, 32, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 2048 // 32*32 int16 intermediate buffer on the stack
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2 // first eob threshold
+// First pass: 4 lines per iteration, written to the stack buffer.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*32*2) // output position for this group of 4 lines (64 bytes each)
+.if \i > 0
+ mov r8, #(32 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2 // input stride in bytes
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2 // each iteration zeroes 128 bytes = 2 lines
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: eight 4-pixel-wide strips, transformed vertically and added to dst.
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 // dct/dct 16x32: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 16, 32, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024 // 16*32 int16 intermediate buffer on the stack
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2 // first eob threshold
+ movrel_local r4, inv_dct_4h_x16_neon // r4 = first transform; presumably called through r4 by inv_txfm_horz_scale_16x4_neon
+// First pass: 4 lines of 16 coefficients per iteration.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*16*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(32 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*32 // input stride in bytes
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4 // each iteration zeroes 128 bytes = 4 lines of 16 coefficients
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: four 4-pixel-wide strips, 32-point vertical DCT added to dst.
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16*2 // intermediate stride in bytes
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 // dct/dct 32x16: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 32, 16, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024 // 32*16 int16 intermediate buffer on the stack
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2 // first eob threshold
+ movrel_local r5, inv_dct_4h_x16_neon // r5 = second transform; presumably called through r5 by inv_txfm_add_vert_4x16_neon
+// First pass: 4 lines of 32 coefficients per iteration, with input scaling.
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(16 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*16 // input stride in bytes
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2 // each iteration zeroes 128 bytes = 2 lines of 32 coefficients
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: eight 4-pixel-wide strips, 16-point vertical transform added to dst.
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2 // intermediate stride in bytes
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 // dct/dct 8x32: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 8, 32, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512 // 8*32 int16 intermediate buffer on the stack
+
+ movrel_local r10, eob_8x32
+
+ mov r8, #2*32 // input stride in bytes
+ mov r9, #32 // lines of the intermediate buffer still to be produced
+ mov r6, sp // write cursor into the intermediate buffer
+1:
+ vmov.i16 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r8 // clear the coefficients as they are consumed
+.endr
+ ldrh r11, [r10], #2 // threshold deciding whether another 8x8 block follows
+ sub r2, r2, r8, lsl #3 // rewind the 8 strides just read
+ sub r9, r9, #8
+ add r2, r2, #2*8 // next group of 8 coefficients
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #2 // intermediate rounding shift
+.endr
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11 // interleaved with the stores; the flags are consumed by the bge below
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+
+ bge 1b // eob >= threshold: transform the next 8x8 block
+ cmp r9, #0
+ beq 3f // all 32 lines produced, nothing to zero-fill
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #8 // each iteration zeroes 128 bytes = 8 lines of 8 coefficients
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: two 4-pixel-wide strips, 32-point vertical DCT added to dst.
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #8*2 // intermediate stride in bytes
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 // dct/dct 32x8: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 32, 8, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512 // 32*8 int16 intermediate buffer on the stack
+// First pass: two groups of 4 lines; the second group is skipped when eob < 10.
+.irp i, 0, 4
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ cmp r3, #10
+ blt 1f
+.endif
+ mov r8, #8*2 // input stride in bytes
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]! // zero 8 * 32 = 256 bytes, i.e. the 4 lines that were skipped
+.endr
+
+2:
+ mov r8, #2*32 // intermediate stride in bytes
+ mov r9, #0 // column offset in pixels
+1:
+ add r6, r0, r9
+ add r7, sp, r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp r9, #32 // flags are consumed by the blt after the macro below, which therefore must not modify them
+
+ load_add_store_8x8 r6, r7 // macro defined outside this chunk; presumably adds the 8x8 result to dst at r6
+
+ blt 1b // loop over the four 8-pixel-wide columns
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon // one quarter of the first dct64 odd-part stage: in d16-d19 = 4 inputs, r12 = coefficient cursor, r6 = output cursor; uses q4 as scratch
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ vld1.16 {d0, d1, d2}, [r12, :64]! // 12 coefficients for this call; r12 advances so the next call reads the next set
+// Each input is multiplied by two coefficients (saturating rounding doubling multiply, high half).
+ vqrdmulh.s16 d23, d16, d0[1] // t63a
+ vqrdmulh.s16 d16, d16, d0[0] // t32a
+ vqrdmulh.s16 d22, d17, d0[2] // t62a
+ vqrdmulh.s16 d17, d17, d0[3] // t33a
+ vqrdmulh.s16 d21, d18, d1[1] // t61a
+ vqrdmulh.s16 d18, d18, d1[0] // t34a
+ vqrdmulh.s16 d20, d19, d1[2] // t60a
+ vqrdmulh.s16 d19, d19, d1[3] // t35a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t33
+ vqsub.s16 d26, d19, d18 // t34
+ vqadd.s16 d27, d19, d18 // t35
+ vqadd.s16 d28, d20, d21 // t60
+ vqsub.s16 d29, d20, d21 // t61
+ vqsub.s16 d30, d23, d22 // t62
+ vqadd.s16 d31, d23, d22 // t63
+// Rotations with the d2[0]/d2[1] coefficient pair; 32-bit products are narrowed with a rounding shift by 12.
+ vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a
+ vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a
+ vneg.s32 q2, q2 // t34a
+ vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a
+ vqrshrn.s32 d26, q2, #12 // t34a
+ vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a
+ vqrshrn.s32 d29, q3, #12 // t61a
+ vqrshrn.s32 d25, q4, #12 // t33a
+ vqrshrn.s32 d30, q2, #12 // t62a
+
+ vqadd.s16 d16, d24, d27 // t32a
+ vqsub.s16 d19, d24, d27 // t35a
+ vqadd.s16 d17, d25, d26 // t33
+ vqsub.s16 d18, d25, d26 // t34
+ vqsub.s16 d20, d31, d28 // t60a
+ vqadd.s16 d23, d31, d28 // t63a
+ vqsub.s16 d21, d30, d29 // t61
+ vqadd.s16 d22, d30, d29 // t62
+// Rotations with the d2[2]/d2[3] coefficient pair.
+ vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a
+ vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a
+ vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60
+ vqrshrn.s32 d21, q2, #12 // t61a
+ vqrshrn.s32 d18, q3, #12 // t34a
+ vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35
+ vqrshrn.s32 d20, q4, #12 // t60
+ vqrshrn.s32 d19, q2, #12 // t35
+
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]! // 8 results of 4 lanes each = 64 bytes appended at r6
+ vst1.16 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
+function inv_dct64_step2_neon // second dct64 odd-part stage, in place: r6 walks up and r9 walks down through the step1 results until they meet; uses q4 as scratch
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64] // first four idct coefficients, used as d0[0], d0[2] and d0[3] below
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #2*4*0] // t32a
+ vldr d17, [r9, #2*4*8] // t39a
+ vldr d18, [r9, #2*4*0] // t63a
+ vldr d19, [r6, #2*4*8] // t56a
+ vldr d20, [r6, #2*4*16] // t40a
+ vldr d21, [r9, #2*4*24] // t47a
+ vldr d22, [r9, #2*4*16] // t55a
+ vldr d23, [r6, #2*4*24] // t48a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t39
+ vqadd.s16 d26, d18, d19 // t63
+ vqsub.s16 d27, d18, d19 // t56
+ vqsub.s16 d28, d21, d20 // t40
+ vqadd.s16 d29, d21, d20 // t47
+ vqadd.s16 d30, d23, d22 // t48
+ vqsub.s16 d31, d23, d22 // t55
+// Rotations with the d0[2]/d0[3] pair; 32-bit products are narrowed with a rounding shift by 12.
+ vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a
+ vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a
+ vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a
+ vqrshrn.s32 d25, q2, #12 // t56a
+ vqrshrn.s32 d27, q3, #12 // t39a
+ vneg.s32 q4, q4 // t40a
+ vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a
+ vqrshrn.s32 d31, q4, #12 // t40a
+ vqrshrn.s32 d28, q2, #12 // t55a
+
+ vqadd.s16 d16, d24, d29 // t32a
+ vqsub.s16 d19, d24, d29 // t47a
+ vqadd.s16 d17, d27, d31 // t39
+ vqsub.s16 d18, d27, d31 // t40
+ vqsub.s16 d20, d26, d30 // t48a
+ vqadd.s16 d23, d26, d30 // t63a
+ vqsub.s16 d21, d25, d28 // t55
+ vqadd.s16 d22, d25, d28 // t56
+// Final butterflies, all with the same coefficient d0[0].
+ vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a
+ vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a
+ vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47
+ vqrshrn.s32 d18, q2, #12 // t40a
+ vqrshrn.s32 d21, q3, #12 // t55a
+ vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48
+ vqrshrn.s32 d19, q4, #12 // t47
+ vqrshrn.s32 d20, q2, #12 // t48
+
+ vstr d16, [r6, #2*4*0] // t32a
+ vstr d17, [r9, #2*4*0] // t39
+ vstr d18, [r6, #2*4*8] // t40a
+ vstr d19, [r9, #2*4*8] // t47
+ vstr d20, [r6, #2*4*16] // t48
+ vstr d21, [r9, #2*4*16] // t55a
+ vstr d22, [r6, #2*4*24] // t56
+ vstr d23, [r9, #2*4*24] // t63a
+
+ add r6, r6, #2*4 // one entry (4 lanes) up
+ sub r9, r9, #2*4 // one entry down
+ cmp r6, r9
+ blo 1b // unsigned compare: r6 and r9 are addresses, a signed test would break if the buffer straddled 0x80000000
+ bx lr
+endfunc
+
+.macro load8 src, strd, zero, clear // load d16-d23 from src, advancing by strd; when clear is set, each source slot is overwritten with the zero register
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.16 {\i}, [\src, :64]
+ vst1.16 {\zero}, [\src, :64], \strd
+.else
+ vld1.16 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst // store q8-q15 (128 bytes) at dst with post-increment
+ vst1.16 {q8, q9}, [\dst, :128]!
+ vst1.16 {q10, q11}, [\dst, :128]!
+ vst1.16 {q12, q13}, [\dst, :128]!
+ vst1.16 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8 // zero q12-q15, i.e. the d24-d31 inputs that load8 does not fill
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond // vmov.i16 reg, val - assembled only when cond is non-zero
+.if \cond
+ vmov.i16 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond // load val into gpr and broadcast it to all 16-bit lanes of reg - only when cond is non-zero
+.if \cond
+ movw \gpr, \val
+ vdup.16 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond // vst1.16 store - assembled only when cond is non-zero
+.if \cond
+ vst1.16 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 // forward to scale_input (defined outside this chunk) - only when cond is non-zero
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0 // emits inv_txfm_dct<suffix>_4h_x64_neon; clear=1 zeroes the input as it is read, scale=1 pre-scales it by 2896*8
+function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 // 64-point DCT on 4 lanes: r7 = input, r8 = input stride in bytes; results are written to the scratch area at the sp of the caller
+ mov r6, sp // taken before the push: r6 points at the scratch area reserved by the caller
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2 // stride * 4: the first load takes every fourth entry
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3 // rewind the 8 strides just walked
+ add r7, r7, r8, lsr #1 // advance by half the quadrupled stride, i.e. two original entries
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_4h_x16_neon
+
+ store16 r6 // park the 16 results (128 bytes) at the start of the scratch area
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ lsr r8, r8, #1 // r8 is now twice the original stride
+ sub r7, r7, r8, lsr #1 // back by one original entry: r7 now addresses in1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_4h_x16_neon
+
+ add r10, r6, #8*15 // r10 = entry 31 of the scratch area, written downwards
+ sub r6, r6, #8*16 // r6 = entry 0 of the scratch area again
+
+ mov r9, #-8 // post-decrement step for r10
+// store_addsub: entry[i] = even + odd stored upwards through r6, entry[31-i] = even - odd stored downwards through r10.
+.macro store_addsub r0, r1, r2, r3
+ vld1.16 {d2}, [r6, :64]!
+ vld1.16 {d3}, [r6, :64]!
+ vqadd.s16 d6, d2, \r0
+ vqsub.s16 \r0, d2, \r0
+ vld1.16 {d4}, [r6, :64]!
+ vqadd.s16 d7, d3, \r1
+ vqsub.s16 \r1, d3, \r1
+ vld1.16 {d5}, [r6, :64]!
+ vqadd.s16 d2, d4, \r2
+ sub r6, r6, #8*4 // back to the first of the four entries just loaded
+ vqsub.s16 \r2, d4, \r2
+ vst1.16 {d6}, [r6, :64]!
+ vst1.16 {\r0}, [r10, :64], r9
+ vqadd.s16 d3, d5, \r3
+ vqsub.s16 \r3, d5, \r3
+ vst1.16 {d7}, [r6, :64]!
+ vst1.16 {\r1}, [r10, :64], r9
+ vst1.16 {d2}, [r6, :64]!
+ vst1.16 {\r2}, [r10, :64], r9
+ vst1.16 {d3}, [r6, :64]!
+ vst1.16 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16 // skip entries 16-31: r6 now points just past the first 32 results
+
+ movrel_local r12, idct64_coeffs // coefficient cursor consumed by the four inv_dct64_step1_neon calls
+ movdup_if d0, lr, #2896*8, \scale // lr serves as a scratch register here; it was saved by the push above
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.16 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.16 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.16 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.16 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale // d0 was overwritten by the coefficient load in step1
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.16 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.16 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.16 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.16 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.16 d16, [r10, :64] // in5 (offset 2)
+ vld1.16 d17, [r11, :64] // in27 (offset 13)
+ vld1.16 d18, [r9, :64] // in21 (offset 10)
+ vld1.16 d19, [r7, :64] // in11 (offset 5)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.16 d16, [r10, :64] // in3 (offset 1)
+ vld1.16 d17, [r11, :64] // in29 (offset 14)
+ vld1.16 d18, [r9, :64] // in19 (offset 9)
+ vld1.16 d19, [r7, :64] // in13 (offset 6)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32 // back to the first of the 32 entries written by the four step1 calls
+ add r9, r6, #2*4*7 // r9 = entry 7 of that region, the downward cursor for step2
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+function inv_txfm_horz_dct_64x4_neon // final dct64 butterflies for the horizontal pass: scratch at sp -> 4 lines of 64 coefficients at r6; r9 = shift count (negative = right shift)
+ vdup.16 q3, r9 // per-lane shift count for vrshl; r9 is reused as a pointer below
+
+ mov r7, sp // r7 walks the scratch area upwards (sp is read before the push)
+ add r8, sp, #2*4*(64 - 4) // r8 walks it downwards, starting at the last four entries
+ add r9, r6, #2*56 // r9 = last group of 8 coefficients of the output line
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64 // output line stride in bytes
+ mov r11, #-2*4*4 // post-decrement step for r8
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+// store_addsub: sums go to the left part of a line (r6), lane-reversed differences to the mirrored right part (r9).
+.macro store_addsub src0, src1, src2, src3
+ vqsub.s16 d3, \src0, \src1
+ vqsub.s16 d2, \src2, \src3
+ vqadd.s16 d0, \src0, \src1
+ vqadd.s16 d1, \src2, \src3
+ vrshl.s16 q1, q1, q3 // rounding shift by the count passed in r9
+ vrshl.s16 q0, q0, q3
+ vrev64.16 q1, q1
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q1}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d20, d27
+ store_addsub d17, d30, d21, d26
+ store_addsub d18, d29, d22, d25
+ store_addsub d19, d28, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #2 // back up the 4 lines just written
+ sub r9, r9, r10, lsl #2
+ add r6, r6, #16 // left cursor 8 coefficients to the right
+ sub r9, r9, #16 // right cursor 8 coefficients to the left
+
+ cmp r7, r8
+ blo 1b // unsigned compare: r7 and r8 are addresses, a signed test would break if the buffer straddled 0x80000000
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_vert_dct_4x64_neon // final dct64 butterflies for the vertical pass, added to a 4-pixel-wide, 64-row dst strip: r6 = dst, r1 = dst stride, scratch at sp
+ lsl r8, r8, #1 // NOTE(review): dead - r8 is overwritten two instructions below before it is read
+
+ mov r7, sp // r7 walks the scratch area upwards (sp is read before the push)
+ add r8, sp, #2*4*(64 - 4) // r8 walks it downwards, starting at the last four entries
+ add r9, r6, r1, lsl #6
+ sub r9, r9, r1 // r9 = dst row 63
+
+ push {r10-r11,lr}
+
+ neg r10, r1 // negative dst stride for the bottom cursor
+ mov r11, #-2*4*4 // post-decrement step for r8
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+// add_dest_addsub: two rows from the top cursor (r6) receive the sums, two rows from the bottom cursor (r9) the differences.
+.macro add_dest_addsub src0, src1, src2, src3
+ vld1.32 {d0[0]}, [r6, :32], r1
+ vld1.32 {d1[0]}, [r9, :32], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.32 {d0[1]}, [r6, :32]
+ vqadd.s16 d5, \src2, \src3
+ vld1.32 {d1[1]}, [r9, :32]
+ vqsub.s16 d6, \src0, \src1
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1 // both cursors back to the first of the two rows just loaded
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4 // final rounding shift
+ vrshr.s16 q3, q3, #4
+ vaddw.u8 q2, q2, d0 // add the widened 8-bit pixels
+ vaddw.u8 q3, q3, d1
+ vqmovun.s16 d0, q2 // saturate back to unsigned 8 bit
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r6, :32], r1
+ vst1.32 {d1[0]}, [r9, :32], r10
+ vst1.32 {d0[1]}, [r6, :32], r1
+ vst1.32 {d1[1]}, [r9, :32], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blo 1b // unsigned compare: r7 and r8 are addresses, a signed test would break if the buffer straddled 0x80000000
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 // dct/dct 64x64: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 64, 64, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2 // intermediate buffer of 32 lines x 64 coefficients plus 64*4 int16 of scratch at sp for the dct64 helpers
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, located above the helper scratch
+
+ movrel_local r10, eob_32x32
+// First pass: 4 lines per iteration; the threshold for the next group is loaded at the end of each iteration.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2 // input stride in bytes
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r5, #(\i*64*2) // r6 was modified by the call; recompute the output position
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2 // each iteration zeroes 256 bytes = 2 lines of 64 coefficients
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: sixteen 4-pixel-wide strips, 64-point vertical DCT added to dst.
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2)
+ mov r8, #64*2 // intermediate stride in bytes
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 // dct/dct 64x32: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 64, 32, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2 // intermediate buffer of 32 lines x 64 coefficients plus 64*4 int16 of scratch at sp for the dct64 helpers
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, located above the helper scratch
+
+ movrel_local r10, eob_32x32
+// First pass with input scaling: 4 lines per iteration; the next threshold is loaded at the end of each iteration.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2 // input stride in bytes
+ bl inv_txfm_dct_clear_scale_4h_x64_neon
+ add r6, r5, #(\i*64*2) // r6 was modified by the call; recompute the output position
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2 // each iteration zeroes 256 bytes = 2 lines of 64 coefficients
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: sixteen 4-pixel-wide strips, 32-point vertical DCT added to dst.
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r5, #(\i*2)
+ mov r8, #64*2 // intermediate stride in bytes
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 // dct/dct 32x64: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 32, 64, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 32*32*2+64*4*2 // intermediate buffer of 32 lines x 32 coefficients plus 64*4 int16 of scratch at sp for the dct64 helpers
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, located above the helper scratch
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2 // first eob threshold
+// First pass with input scaling: 4 lines of 32 coefficients per iteration.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2 // input stride in bytes
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2 // each iteration zeroes 128 bytes = 2 lines of 32 coefficients
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: eight 4-pixel-wide strips, 64-point vertical DCT added to dst.
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2)
+ mov r8, #32*2 // intermediate stride in bytes
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 // dct/dct 64x16: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 64, 16, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*16*2+64*4*2 // intermediate buffer of 16 lines x 64 coefficients plus 64*4 int16 of scratch at sp for the dct64 helpers
+ add r4, sp, #64*4*2 // r4 = intermediate buffer here (r5 is needed for the function pointer below)
+
+ movrel_local r10, eob_16x32
+// First pass: 4 lines per iteration; the next threshold is loaded at the end of each iteration.
+.irp i, 0, 4, 8, 12
+ add r6, r4, #(\i*64*2)
+.if \i > 0
+ mov r8, #(16 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2 // input stride in bytes
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r4, #(\i*64*2) // r6 was modified by the call; recompute the output position
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2 // each iteration zeroes 256 bytes = 2 lines of 64 coefficients
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: sixteen 4-pixel-wide strips, 16-point vertical transform added to dst.
+3:
+ movrel_local r5, inv_dct_4h_x16_neon // r5 = second transform; presumably called through r5 by inv_txfm_add_vert_4x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r4, #(\i*2)
+ mov r8, #64*2 // intermediate stride in bytes
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 // dct/dct 16x64: r0 = dst, r1 = dst stride, r2 = coeff, r3 = eob
+ idct_dc 16, 64, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 16*32*2+64*4*2 // intermediate buffer of 32 lines x 16 coefficients plus 64*4 int16 of scratch at sp for the dct64 helpers
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, located above the helper scratch
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2 // first eob threshold
+
+ movrel_local r4, inv_dct_4h_x16_neon // r4 = first transform; presumably called through r4 by inv_txfm_horz_16x4_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*16*2)
+.if \i > 0
+ mov r8, #(32 - \i) // lines still missing, used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // eob below the threshold: the remaining lines are all zero
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2 // input stride in bytes
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4 // each iteration zeroes 128 bytes = 4 lines of 16 coefficients
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+// Second pass: four 4-pixel-wide strips, 64-point vertical DCT added to dst.
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2)
+ mov r8, #16*2 // intermediate stride in bytes
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S
new file mode 100644
index 0000000000..aa6c272e71
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/itx16.S
@@ -0,0 +1,3625 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations, that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+// Load the 32-bit constant \val into \reg.
+// Intended for constants that a single Thumb mov.w can express as an
+// immediate; in ARM mode the same constant is built with one movw+movt
+// pair (low 16 bits, then high 16 bits).
+.macro mov_const reg, val
+#if CONFIG_THUMB
+ mov.w \reg, #\val
+#else
+ movw \reg, #((\val) & 0xffff)
+ movt \reg, #(((\val) >> 16) & 0xffff)
+#endif
+.endm
+
+// Multiplication coefficients for the inverse DCTs, stored as 32-bit integers.
+// The transform code in this file multiplies by these and then applies a
+// rounding right shift by 12, so they act as fractions scaled by 4096
+// (e.g. 2896 ~= 4096/sqrt(2)). The second entry, 2896*8*(1<<16), is the same
+// factor in the pre-scaled form this file uses with vqrdmulh.s32.
+const idct_coeffs, align=4
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+// Coefficients for the 64-point inverse DCT, in four groups of twelve 32-bit
+// integers. The code consuming this table is not in this part of the file.
+// NOTE(review): entries written as x*8*(1<<16) look like the pre-scaled form
+// used with vqrdmulh.s32 elsewhere in this file, and the plain entries like
+// the fractions scaled by 4096 used with vmul/vrshr #12 - confirm against the
+// idct64 code.
+const idct64_coeffs, align=4
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+// The four 32-bit multipliers loaded into d0-d1 by the iadst_4x4 macro;
+// products are rounded with a right shift by 12 there.
+const iadst4_coeffs, align=4
+ .int 1321, 3803, 2482, 3344
+endconst
+
+// Multipliers for the iadst_4s_x8 macro: the first two rows are loaded into
+// q0-q1 for the first stage, the last row is loaded into q0 afterwards for
+// the remaining stages.
+const iadst8_coeffs, align=4
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784
+endconst
+
+// Multipliers for the iadst_16 macro, which loads them 32 bytes (two rows)
+// at a time into q0-q1.
+const iadst16_coeffs, align=4
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
+// \d0 = \s0 * \c0 + \s1 * \c1 on 32-bit lanes (no rounding, no shift).
+// \d0 is overwritten first, so it must not alias \s1.
+.macro vmul_vmla d0, s0, s1, c0, c1
+ vmul.i32 \d0, \s0, \c0
+ vmla.i32 \d0, \s1, \c1
+.endm
+
+// \d0 = \s0 * \c0 - \s1 * \c1 on 32-bit lanes (no rounding, no shift).
+// \d0 is overwritten first, so it must not alias \s1.
+.macro vmul_vmls d0, s0, s1, c0, c1
+ vmul.i32 \d0, \s0, \c0
+ vmls.i32 \d0, \s1, \c1
+.endm
+
+// Scale 2, 4 or 8 registers in place by the scalar \c using vqrdmulh.s32
+// (saturating rounding doubling multiply, high half).
+// \r0 and \r1 are mandatory; \r2-\r3 are processed only if \r2 is given and
+// \r4-\r7 only if \r4 is given.
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+ vqrdmulh.s32 \r0, \r0, \c
+ vqrdmulh.s32 \r1, \r1, \c
+.ifnb \r2
+ vqrdmulh.s32 \r2, \r2, \c
+ vqrdmulh.s32 \r3, \r3, \c
+.endif
+.ifnb \r4
+ vqrdmulh.s32 \r4, \r4, \c
+ vqrdmulh.s32 \r5, \r5, \c
+ vqrdmulh.s32 \r6, \r6, \c
+ vqrdmulh.s32 \r7, \r7, \c
+.endif
+.endm
+
+// One step of a software-pipelined "add residual to destination" sequence for
+// rows of eight 16-bit pixels. Every stage is optional and is skipped when its
+// argument is left blank:
+//   \load             load a destination row from [\src], advance \src by r1
+//   \shift            rounding right shift of a residual row by \shiftbits
+//   \addsrc, \adddst  \adddst += \addsrc (saturating)
+//   \max              clamp from below against q6
+//   \min              clamp from above against q7
+//   \store            store a finished row to [\dst], advance \dst by r1
+// The callers in this file set q6 = 0 and q7 = 0x3ff before the first step.
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+ vld1.16 {\load}, [\src, :128], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ vqadd.s16 \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ vmax.s16 \max, \max, q6
+.endif
+.ifnb \min
+ vmin.s16 \min, \min, q7
+.endif
+.ifnb \store
+ vst1.16 {\store}, [\dst, :128], r1
+.endif
+.endm
+// Add the eight rows of 16-bit residuals in q8-q15 (after a rounding right
+// shift by \shiftbits) to the 8x8 pixel block at \dst with stride r1, clamping
+// the result to [0, 0x3ff].
+// \src is a scratch register used as the read pointer; \dst ends up advanced
+// by eight rows. Clobbers q0-q7 and q8-q15.
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store q0, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store q1, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits
+ load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits
+ load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits
+ load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits
+ load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits
+ load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits
+ load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits
+ load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits
+ load_add_store , , , , , q15, q14, \dst, \src, \shiftbits
+ load_add_store , , , , , , q15, \dst, \src, \shiftbits
+.endm
+// Add the four rows of 16-bit residuals in q8-q11 (after a rounding right
+// shift by \shiftbits) to the 8x4 pixel block at \dst with stride r1, clamping
+// the result to [0, 0x3ff].
+// \src is a scratch register used as the read pointer; \dst ends up advanced
+// by four rows. Clobbers q0-q3, q6-q7 and q8-q11.
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store q0, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store q1, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits
+ load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits
+ load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits
+ load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits
+ load_add_store , , , , , q11, q10, \dst, \src, \shiftbits
+ load_add_store , , , , , , q11, \dst, \src, \shiftbits
+.endm
+// One step of a software-pipelined "add residual to destination" sequence for
+// rows of four 16-bit pixels, two rows per q register. Every stage is optional
+// and is skipped when its argument is left blank:
+//   \load1, \load2    load two destination rows (d registers) from [\src]
+//   \shift            rounding right shift of a residual register by \shiftbits
+//   \addsrc, \adddst  \adddst += \addsrc (saturating)
+//   \max              clamp from below against q6
+//   \min              clamp from above against q7
+//   \store1, \store2  store two finished rows (d registers) to [\dst]
+// Loads and stores advance their pointer by r1 per row.
+.macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4
+.ifnb \load1
+ vld1.16 {\load1}, [\src, :64], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \load2
+ vld1.16 {\load2}, [\src, :64], r1
+.endif
+.ifnb \addsrc
+ vqadd.s16 \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ vmax.s16 \max, \max, q6
+.endif
+.ifnb \store1
+ vst1.16 {\store1}, [\dst, :64], r1
+.endif
+.ifnb \min
+ vmin.s16 \min, \min, q7
+.endif
+.ifnb \store2
+ vst1.16 {\store2}, [\dst, :64], r1
+.endif
+.endm
+// Add the sixteen rows of 16-bit residuals in q8-q15 (two 4-pixel rows per
+// register, after a rounding right shift by \shiftbits) to the 4x16 pixel
+// block at \dst with stride r1, clamping the result to [0, 0x3ff].
+// \src is a scratch register used as the read pointer; \dst ends up advanced
+// by sixteen rows. Clobbers q0-q7 and q8-q15.
+// \shiftbits is optional and defaults to 4, matching the sibling macros.
+.macro load_add_store_4x16 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, \dst, \src, \shiftbits
+ load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src, \shiftbits
+ load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src, \shiftbits
+ load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q15, d28, d29, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d30, d31, \dst, \src, \shiftbits
+.endm
+// Add the eight rows of 16-bit residuals in q8-q11 (two 4-pixel rows per
+// register, after a rounding right shift by \shiftbits) to the 4x8 pixel
+// block at \dst with stride r1, clamping the result to [0, 0x3ff].
+// \src is a scratch register used as the read pointer; \dst ends up advanced
+// by eight rows. Clobbers q0-q3, q6-q7 and q8-q11.
+.macro load_add_store_4x8 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits
+.endm
+// Add the four rows of 16-bit residuals in q8-q9 (two 4-pixel rows per
+// register, after a rounding right shift by \shiftbits) to the 4x4 pixel
+// block at \dst with stride r1, clamping the result to [0, 0x3ff].
+// \src is a scratch register used as the read pointer; \dst ends up advanced
+// by four rows. Clobbers q0-q1, q6-q7 and q8-q9.
+.macro load_add_store_4x4 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits
+.endm
+
+// DC-only shortcut for dct_dct functions, placed before the prologue.
+// If r3 (eob) is non-zero, execution continues at the local label 1 at the
+// end of the macro. Otherwise the first 32-bit coefficient is read from [r2]
+// and cleared, scaled, and the macro tail-branches to idct_dc_w\w\()_neon with
+// q12 = value to add to every pixel, q14 = 0 and r3 = \h.
+// Clobbers r12, d0, q12, q13, q14.
+.macro idct_dc w, h, shift
+ cmp r3, #0
+ bne 1f
+ vmov.i16 q14, #0
+ mov_const r12, 2896*8*(1<<16)
+ vld1.32 {d24[], d25[]}, [r2, :32]
+ vdup.32 d0, r12
+ // vqrdmulh.s32 by 2896*8*(1<<16) amounts to a rounded multiply by 2896/4096.
+ vqrdmulh.s32 q13, q12, d0[0]
+ vst1.32 {d28[0]}, [r2, :32]
+.if (\w == 2*\h) || (2*\w == \h)
+ // Blocks with a 2:1 aspect ratio get one more scaling by 2896/4096.
+ vqrdmulh.s32 q13, q13, d0[0]
+.endif
+.if \shift > 0
+ vqrshrn.s32 d24, q13, #\shift
+ vqrshrn.s32 d25, q13, #\shift
+.else
+ vqmovn.s32 d24, q13
+ vqmovn.s32 d25, q13
+.endif
+ // As a 16-bit lane, d0[1] is the upper half of the constant, i.e. 2896*8.
+ vqrdmulh.s16 q12, q12, d0[1]
+ mov r3, #\h
+ vrshr.s16 q12, q12, #4
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+// Add q12 to every pixel of a 4 pixel wide block and clamp to [q14, 0x3ff].
+// In: r0 = dst, r1 = stride in bytes, r3 = number of rows (handled four at a
+// time), q12 = value to add, q14 = lower clamp (zero when entered via idct_dc).
+// Clobbers q0-q1, q15; r0 ends up past the last row, r3 at zero or below.
+function idct_dc_w4_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vld1.16 {d2}, [r0, :64], r1
+ vld1.16 {d3}, [r0, :64], r1
+ subs r3, r3, #4
+ vqadd.s16 q0, q0, q12
+ // Rewind r0 to the first of the four rows just loaded.
+ sub r0, r0, r1, lsl #2
+ vqadd.s16 q1, q1, q12
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmin.s16 q0, q0, q15
+ vst1.16 {d0}, [r0, :64], r1
+ vmin.s16 q1, q1, q15
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+// Add q12 to every pixel of an 8 pixel wide block and clamp to [q14, 0x3ff].
+// In: r0 = dst, r1 = stride in bytes, r3 = number of rows (handled four at a
+// time), q12 = value to add, q14 = lower clamp (zero when entered via idct_dc).
+// Clobbers q0-q3, q15; r0 ends up past the last row, r3 at zero or below.
+function idct_dc_w8_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0}, [r0, :128], r1
+ subs r3, r3, #4
+ vld1.16 {q1}, [r0, :128], r1
+ vqadd.s16 q0, q0, q12
+ vld1.16 {q2}, [r0, :128], r1
+ vqadd.s16 q1, q1, q12
+ vld1.16 {q3}, [r0, :128], r1
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ // Rewind r0 to the first of the four rows just loaded.
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vst1.16 {q0}, [r0, :128], r1
+ vmin.s16 q2, q2, q15
+ vst1.16 {q1}, [r0, :128], r1
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+// Add q12 to every pixel of a 16 pixel wide block and clamp to [q14, 0x3ff].
+// In: r0 = dst, r1 = stride in bytes, r3 = number of rows (handled two at a
+// time), q12 = value to add, q14 = lower clamp (zero when entered via idct_dc).
+// Clobbers q0-q3, q15; r0 ends up past the last row, r3 at zero or below.
+function idct_dc_w16_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.16 {q2, q3}, [r0, :128], r1
+ vqadd.s16 q0, q0, q12
+ vqadd.s16 q1, q1, q12
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ // Rewind r0 to the first of the two rows just loaded.
+ sub r0, r0, r1, lsl #1
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+// Add q12 to every pixel of a 32 pixel wide block and clamp to [q14, 0x3ff].
+// In: r0 = dst, r1 = stride in bytes, r3 = number of rows (one per iteration),
+// q12 = value to add, q14 = lower clamp (zero when entered via idct_dc).
+// Clobbers q0-q3, q15 and r1 (reduced by 32); r0 ends up past the last row.
+function idct_dc_w32_neon
+ // The first 32 bytes of each row are stepped over with post-increment,
+ // so the final store only needs to advance by stride - 32.
+ sub r1, r1, #32
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q0, q12
+ vqadd.s16 q1, q1, q12
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ // Rewind r0 to the start of the row.
+ sub r0, r0, #32
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vst1.16 {q0, q1}, [r0, :128]!
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+// Add q12 to every pixel of a 64 pixel wide block and clamp to [q14, 0x3ff].
+// In: r0 = dst, r1 = stride in bytes, r3 = number of rows (one per iteration),
+// q12 = value to add, q14 = lower clamp (zero when entered via idct_dc).
+// Clobbers q0-q3, q8-q11, q15 and r1 (reduced by 96); r0 ends up past the
+// last row.
+function idct_dc_w64_neon
+ // The first 96 bytes of each row are stepped over with post-increment,
+ // so the final store only needs to advance by stride - 96.
+ sub r1, r1, #96
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.16 {q2, q3}, [r0, :128]!
+ vqadd.s16 q0, q0, q12
+ vld1.16 {q8, q9}, [r0, :128]!
+ vqadd.s16 q1, q1, q12
+ vld1.16 {q10, q11}, [r0, :128]
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ vqadd.s16 q8, q8, q12
+ vqadd.s16 q9, q9, q12
+ vqadd.s16 q10, q10, q12
+ vqadd.s16 q11, q11, q12
+ // Rewind r0 to the start of the row.
+ sub r0, r0, #96
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmax.s16 q8, q8, q14
+ vmax.s16 q9, q9, q14
+ vmax.s16 q10, q10, q14
+ vmax.s16 q11, q11, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vmin.s16 q8, q8, q15
+ vst1.16 {q0, q1}, [r0, :128]!
+ vmin.s16 q9, q9, q15
+ vst1.16 {q2, q3}, [r0, :128]!
+ vmin.s16 q10, q10, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vmin.s16 q11, q11, q15
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+// One 4-point inverse Walsh-Hadamard pass on 32-bit lanes.
+// Operates in place on q8-q11 (inputs and outputs); q12 and q13 are used as
+// temporaries. Plain wrapping adds/subtracts, no saturation.
+.macro iwht4
+ vadd.i32 q8, q8, q9
+ vsub.i32 q13, q10, q11
+ vsub.i32 q12, q8, q13
+ vshr.s32 q12, q12, #1
+ vsub.i32 q10, q12, q9
+ vsub.i32 q9, q12, q11
+ vadd.i32 q11, q13, q10
+ vsub.i32 q8, q8, q9
+.endm
+
+// 4-point inverse DCT on q registers (four 32-bit lanes each), in place on
+// \r0-\r3. Expects the first row of idct_coeffs in d0-d1 (d0[0] = 2896,
+// d1[0] = 1567, d1[1] = 3784). Products are rounded with a right shift by 12
+// and the final butterflies saturate. Clobbers q2-q5.
+.macro idct_4s_x4 r0, r1, r2, r3
+ vmul_vmla q4, \r1, \r3, d1[1], d1[0]
+ vmul_vmla q2, \r0, \r2, d0[0], d0[0]
+ vmul_vmls q3, \r1, \r3, d1[0], d1[1]
+ vmul_vmls q5, \r0, \r2, d0[0], d0[0]
+ vrshr.s32 q4, q4, #12
+ vrshr.s32 q2, q2, #12
+ vrshr.s32 q3, q3, #12
+ vrshr.s32 q5, q5, #12
+ vqadd.s32 \r0, q2, q4
+ vqsub.s32 \r3, q2, q4
+ vqadd.s32 \r1, q5, q3
+ vqsub.s32 \r2, q5, q3
+.endm
+
+// 4-point inverse DCT on d registers (two 32-bit lanes each), in place on
+// \r0-\r3. Expects the first row of idct_coeffs in d0-d1 (d0[0] = 2896,
+// d1[0] = 1567, d1[1] = 3784). Products are rounded with a right shift by 12
+// and the final butterflies saturate. Clobbers d4-d7.
+.macro idct_2s_x4 r0, r1, r2, r3
+ vmul_vmla d6, \r1, \r3, d1[1], d1[0]
+ vmul_vmla d4, \r0, \r2, d0[0], d0[0]
+ vmul_vmls d5, \r1, \r3, d1[0], d1[1]
+ vmul_vmls d7, \r0, \r2, d0[0], d0[0]
+ vrshr.s32 d6, d6, #12
+ vrshr.s32 d4, d4, #12
+ vrshr.s32 d5, d5, #12
+ vrshr.s32 d7, d7, #12
+ vqadd.s32 \r0, d4, d6
+ vqsub.s32 \r3, d4, d6
+ vqadd.s32 \r1, d7, d5
+ vqsub.s32 \r2, d7, d5
+.endm
+
+// 4-point inverse DCT, in place on q8-q11 (four 32-bit lanes each).
+// Clobbers r12, d0-d1 (coefficients) and q2-q5 (scratch of idct_4s_x4).
+function inv_dct_4s_x4_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {d0, d1}, [r12, :128]
+ idct_4s_x4 q8, q9, q10, q11
+ bx lr
+endfunc
+
+// 4-point inverse ADST on four 32-bit lanes per register.
+// Inputs are always q8-q11; the four results are written to \o0-\o3, which
+// lets the caller pick normal or reversed (flipadst) output order.
+// Results are rounded with a right shift by 12; no saturation is applied.
+// Clobbers r12, d0-d1 (coefficients) and q1-q4.
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel_local r12, iadst4_coeffs
+ vld1.32 {d0, d1}, [r12, :128]
+
+ vsub.i32 q1, q8, q10
+ vmul.i32 q2, q8, d0[0]
+ vmla.i32 q2, q10, d0[1]
+ vmla.i32 q2, q11, d1[0]
+ vmul.i32 q4, q9, d1[1]
+ vadd.i32 q1, q1, q11
+ vmul.i32 q3, q8, d1[0]
+ vmls.i32 q3, q10, d0[0]
+ vmls.i32 q3, q11, d0[1]
+
+ vadd.i32 \o3, q2, q3
+ vmul.i32 \o2, q1, d1[1]
+ vadd.i32 \o0, q2, q4
+ vadd.i32 \o1, q3, q4
+ vsub.i32 \o3, \o3, q4
+
+ vrshr.s32 \o0, \o0, #12
+ vrshr.s32 \o2, \o2, #12
+ vrshr.s32 \o1, \o1, #12
+ vrshr.s32 \o3, \o3, #12
+.endm
+
+// 4-point inverse ADST: input in q8-q11, output in q8-q11 in the same order.
+function inv_adst_4s_x4_neon
+ iadst_4x4 q8, q9, q10, q11
+ bx lr
+endfunc
+
+// 4-point inverse flipped ADST: input in q8-q11, output in q8-q11 with the
+// order of the four results reversed relative to inv_adst_4s_x4_neon.
+function inv_flipadst_4s_x4_neon
+ iadst_4x4 q11, q10, q9, q8
+ bx lr
+endfunc
+
+// 4-point identity transform, in place on q8-q11 (32-bit lanes).
+// Each value x becomes x + x*(5793-4096)/4096, i.e. x*5793/4096, computed
+// with a rounding vqrdmulh and a saturating add.
+// Clobbers r12, d0 and q1-q4.
+function inv_identity_4s_x4_neon
+ // r12 = ((5793-4096)*8) << 16, the pre-scaled form for vqrdmulh.s32.
+ mov r12, #0
+ movt r12, #(5793-4096)*8
+ vdup.32 d0, r12
+ vqrdmulh.s32 q1, q8, d0[0]
+ vqrdmulh.s32 q2, q9, d0[0]
+ vqrdmulh.s32 q3, q10, d0[0]
+ vqrdmulh.s32 q4, q11, d0[0]
+ vqadd.s32 q8, q8, q1
+ vqadd.s32 q9, q9, q2
+ vqadd.s32 q10, q10, q3
+ vqadd.s32 q11, q11, q4
+ bx lr
+endfunc
+
+// 4x4 inverse Walsh-Hadamard transform with add, 16 bpc.
+// r0 = dst, r1 = dst stride, r2 = coefficients (sixteen 32-bit values, zeroed
+// here), r3 = eob (unused).
+// The register and stack layout on exit must match L(itx_4x4_end) in
+// inv_txfm_add_4x4_neon, which performs the add, clamp, store and the
+// epilogue: q8-q9 = residuals, q0-q1 = destination rows, q14 = 0.
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q5}
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.32 {q14, q15}, [r2, :128]!
+ // The coefficients are 32-bit, so the initial >> 2 must operate on 32-bit
+ // lanes; a 16-bit shift would treat the two halves of each coefficient
+ // separately and corrupt values outside the int16 range.
+ vshr.s32 q8, q8, #2
+ vld1.32 {q10, q11}, [r2, :128]
+ vshr.s32 q9, q9, #2
+ vshr.s32 q10, q10, #2
+ vshr.s32 q11, q11, #2
+
+ iwht4
+
+ vst1.32 {q14, q15}, [r2, :128]
+ transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+
+ iwht4
+
+ // Narrow the result to 16 bit while loading the four destination rows.
+ vld1.16 {d0}, [r0, :64], r1
+ vqmovn.s32 d16, q8
+ vld1.16 {d1}, [r0, :64], r1
+ vqmovn.s32 d17, q9
+ vld1.16 {d2}, [r0, :64], r1
+ vqmovn.s32 d18, q10
+ vld1.16 {d3}, [r0, :64], r1
+ vqmovn.s32 d19, q11
+
+ b L(itx_4x4_end)
+endfunc
+
+// Shared body of the 4x4 16 bpc transforms; entered by a branch from the
+// def_fn_4x4 functions after they pushed {r4-r5,lr} and {q4-q5}.
+// In: r0 = dst, r1 = dst stride, r2 = coefficients (32-bit, zeroed here),
+// r4 = first transform (works on 32-bit lanes in q8-q11),
+// r5 = second transform (works on the 16-bit data in q8-q9).
+// L(itx_4x4_end) is also the common tail for the WHT and DC-only paths; it
+// expects residuals in q8-q9, destination rows in q0-q1 and q14 = 0.
+function inv_txfm_add_4x4_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.32 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ blx r4
+
+ // Narrow the first pass output to 16 bit with saturation and transpose.
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5
+
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vrshr.s16 q8, q8, #4
+ vld1.16 {d2}, [r0, :64], r1
+ vrshr.s16 q9, q9, #4
+ vld1.16 {d3}, [r0, :64], r1
+
+L(itx_4x4_end):
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+ // Rewind r0 to the first of the four rows loaded above.
+ sub r0, r0, r1, lsl #2
+ vqadd.s16 q8, q8, q0
+ vqadd.s16 q9, q9, q1
+ vmax.s16 q8, q8, q14
+ vmax.s16 q9, q9, q14
+ vmin.s16 q8, q8, q15
+ vmin.s16 q9, q9, q15
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r0, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r0, :64], r1
+
+ vpop {q4-q5}
+ pop {r4-r5,pc}
+endfunc
+
+// Define the exported function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon.
+// It saves registers, points r4 at the local 32-bit first transform and r5 at
+// the external inv_\txfm2\()_4h_x4_neon second transform, then branches to the
+// shared inv_txfm_add_4x4_neon body.
+// For dct_dct an inline DC-only path is taken when r3 (eob) is zero; it
+// scales the single coefficient and joins the shared tail L(itx_4x4_end).
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q5}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cmp r3, #0
+ bne 1f
+ vmov.i16 q14, #0
+ mov_const r12, 2896*8*(1<<16)
+ vld1.32 {d16[], d17[]}, [r2, :32]
+ vdup.32 d4, r12
+ vst1.32 {d28[0]}, [r2, :32]
+ // Rounded multiply by 2896/4096 on the 32-bit coefficient ...
+ vqrdmulh.s32 q8, q8, d4[0]
+ vld1.16 {d0}, [r0, :64], r1
+ vqmovn.s32 d20, q8
+ vqmovn.s32 d21, q8
+ vld1.16 {d1}, [r0, :64], r1
+ // ... and once more on the narrowed 16-bit value (d4[1] = 2896*8).
+ vqrdmulh.s16 q10, q10, d4[1]
+ vld1.16 {d2}, [r0, :64], r1
+ vrshr.s16 q8, q10, #4
+ vld1.16 {d3}, [r0, :64], r1
+ vrshr.s16 q9, q10, #4
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4s_x4_neon
+ movrel r5, X(inv_\txfm2\()_4h_x4_neon)
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+// Instantiate the 4x4 entry points for all sixteen combinations of
+// dct, adst, flipadst and identity as first and second transform.
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+// 8-point inverse DCT on q registers (four 32-bit lanes each), in place on
+// \r0-\r7. Expects the first two rows of idct_coeffs in q0-q1.
+// The even half is computed by idct_4s_x4 on \r0, \r2, \r4, \r6; intermediate
+// values are clamped to [-0x20000, 0x1ffff] before being reused.
+// Clobbers q2-q7. q4 still holds the lower clamp bound on exit, but q5 is
+// reused for t6, so the upper bound is not preserved.
+.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4s_x4 \r0, \r2, \r4, \r6
+
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
+ vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
+ vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
+ vrshr.s32 \r1, q2, #12 // t4a
+ vrshr.s32 \r7, q3, #12 // t7a
+ vrshr.s32 \r3, q6, #12 // t5a
+ vrshr.s32 \r5, q7, #12 // t6a
+
+ vqadd.s32 q2, \r1, \r3 // t4
+ vqsub.s32 \r1, \r1, \r3 // t5a
+ vqadd.s32 q3, \r7, \r5 // t7
+ vqsub.s32 \r3, \r7, \r5 // t6a
+
+.irp r, q2, \r1, q3, \r3
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
+ vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshr.s32 q7, q7, #12 // t5
+ vrshr.s32 q5, q6, #12 // t6
+
+ vqsub.s32 \r7, \r0, q3 // out7
+ vqadd.s32 \r0, \r0, q3 // out0
+ vqadd.s32 \r1, \r2, q5 // out1
+ vqsub.s32 q6, \r2, q5 // out6
+ vqadd.s32 \r2, \r4, q7 // out2
+ vqsub.s32 \r5, \r4, q7 // out5
+ vqadd.s32 \r3, \r6, q2 // out3
+ vqsub.s32 \r4, \r6, q2 // out4
+ vmov \r6, q6 // out6
+.endm
+
+// 8-point inverse DCT on d registers (two 32-bit lanes each), in place on
+// \r0-\r7. Expects the first two rows of idct_coeffs in q0-q1.
+// The even half is computed by idct_2s_x4 on \r0, \r2, \r4, \r6; intermediate
+// values are clamped to [-0x20000, 0x1ffff] before being reused.
+// Clobbers d4-d9. On exit d9 still holds the upper and d8 the lower clamp
+// bound, which inv_dct_2s_x16_neon relies on.
+.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_2s_x4 \r0, \r2, \r4, \r6
+
+ vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
+ vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
+ vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a
+ vrshr.s32 \r1, d4, #12 // t4a
+ vrshr.s32 \r7, d5, #12 // t7a
+ vrshr.s32 \r3, d6, #12 // t5a
+ vrshr.s32 \r5, d7, #12 // t6a
+
+ vqadd.s32 d4, \r1, \r3 // t4
+ vqsub.s32 \r1, \r1, \r3 // t5a
+ vqadd.s32 d5, \r7, \r5 // t7
+ vqsub.s32 \r3, \r7, \r5 // t6a
+
+.irp r, d4, \r1, d5, \r3
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
+ vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshr.s32 d6, d6, #12 // t5
+ vrshr.s32 d7, d7, #12 // t6
+
+ vqsub.s32 \r7, \r0, d5 // out7
+ vqadd.s32 \r0, \r0, d5 // out0
+ vqadd.s32 \r1, \r2, d7 // out1
+ vqsub.s32 d7, \r2, d7 // out6
+ vqadd.s32 \r2, \r4, d6 // out2
+ vqsub.s32 \r5, \r4, d6 // out5
+ vqadd.s32 \r3, \r6, d4 // out3
+ vqsub.s32 \r4, \r6, d4 // out4
+ vmov \r6, d7 // out6
+.endm
+
+// 8-point inverse DCT, in place on q8-q15 (four 32-bit lanes each).
+// Clobbers r12, q0-q1 (coefficients) and q2-q7 (scratch of idct_4s_x8).
+function inv_dct_4s_x8_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0, q1}, [r12, :128]
+ idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15
+ bx lr
+endfunc
+
+// 8-point inverse ADST on q registers (four 32-bit lanes each).
+// Inputs are always q8-q15; the eight results are written to \r0-\r7, which
+// the callers pass as q8..q15 (adst) or q15..q8 (flipadst).
+// Intermediate values are clamped to [-0x20000, 0x1ffff]; products are rounded
+// with a right shift by 12.
+// Clobbers r12, q0-q7 and all of q8-q15.
+.macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ movrel_local r12, iadst8_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmla q2, q15, q8, d0[0], d0[1]
+ vmul_vmls q3, q15, q8, d0[1], d0[0]
+ vmul_vmla q4, q13, q10, d1[0], d1[1]
+ vrshr.s32 q8, q2, #12 // t0a
+ vrshr.s32 q15, q3, #12 // t1a
+ vmul_vmls q5, q13, q10, d1[1], d1[0]
+ vmul_vmla q6, q11, q12, d2[0], d2[1]
+ vrshr.s32 q10, q4, #12 // t2a
+ vrshr.s32 q13, q5, #12 // t3a
+ vmul_vmls q7, q11, q12, d2[1], d2[0]
+ vmul_vmla q2, q9, q14, d3[0], d3[1]
+ vrshr.s32 q12, q6, #12 // t4a
+ vrshr.s32 q11, q7, #12 // t5a
+ vmul_vmls q3, q9, q14, d3[1], d3[0]
+ vrshr.s32 q14, q2, #12 // t6a
+ vrshr.s32 q9, q3, #12 // t7a
+
+ // Third row of iadst8_coeffs: 2896, 0, 1567, 3784.
+ vld1.32 {q0}, [r12]
+
+ vqadd.s32 q2, q8, q12 // t0
+ vqsub.s32 q3, q8, q12 // t4
+ vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vqadd.s32 q4, q15, q11 // t1
+ vqsub.s32 q5, q15, q11 // t5
+ vqadd.s32 q6, q10, q14 // t2
+ vqsub.s32 q7, q10, q14 // t6
+ vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ vqadd.s32 q10, q13, q9 // t3
+ vqsub.s32 q11, q13, q9 // t7
+
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmax.s32 \r, \r, q14
+.endr
+
+ vmul_vmla q8, q3, q5, d1[1], d1[0]
+ vmul_vmls q13, q3, q5, d1[0], d1[1]
+ vmul_vmls q14, q11, q7, d1[1], d1[0]
+
+ vrshr.s32 q3, q8, #12 // t4a
+ vrshr.s32 q5, q13, #12 // t5a
+
+ vmul_vmla q8, q11, q7, d1[0], d1[1]
+
+ vrshr.s32 q7, q14, #12 // t6a
+ vrshr.s32 q11, q8, #12 // t7a
+
+ vqadd.s32 \r0, q2, q6 // out0
+ vqsub.s32 q2, q2, q6 // t2
+ vqadd.s32 \r7, q4, q10 // out7
+ vqsub.s32 q4, q4, q10 // t3
+
+ // q14 was overwritten above, so the lower clamp bound is rebuilt in q10.
+ vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqadd.s32 \r1, q3, q7 // out1
+ vqsub.s32 q3, q3, q7 // t6
+ vqadd.s32 \r6, q5, q11 // out6
+ vqsub.s32 q5, q5, q11 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, q2, q4, q3, q5
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+ vmax.s32 \r, \r, q10
+.endr
+
+ vqneg.s32 \r7, \r7 // out7
+ vqneg.s32 \r1, \r1 // out1
+
+ vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
+ vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11)
+ vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10)
+ vrshr.s32 q2, q10, #12 // out3
+ vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13)
+ vrshr.s32 q3, q12, #12 // out5
+ vrshr.s32 \r2, q10, #12 // out2 (q10 or q13)
+ vrshr.s32 \r4, q6, #12 // out4 (q12 or q11)
+
+ vqneg.s32 \r3, q2 // out3
+ vqneg.s32 \r5, q3 // out5
+.endm
+
+// 8-point inverse ADST: input in q8-q15, output in q8-q15 in the same order.
+function inv_adst_4s_x8_neon
+ iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15
+ bx lr
+endfunc
+
+// 8-point inverse flipped ADST: input in q8-q15, output in q8-q15 with the
+// order of the eight results reversed relative to inv_adst_4s_x8_neon.
+function inv_flipadst_4s_x8_neon
+ iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8
+ bx lr
+endfunc
+
+// 8-point identity transform: doubles every 32-bit lane of q8-q15 in place
+// with a saturating left shift by one.
+function inv_identity_4s_x8_neon
+ vqshl.s32 q8, q8, #1
+ vqshl.s32 q9, q9, #1
+ vqshl.s32 q10, q10, #1
+ vqshl.s32 q11, q11, #1
+ vqshl.s32 q12, q12, #1
+ vqshl.s32 q13, q13, #1
+ vqshl.s32 q14, q14, #1
+ vqshl.s32 q15, q15, #1
+ bx lr
+endfunc
+
+// Shared body of the 8x8 16 bpc transforms; entered by a branch from the
+// def_fn_8x8 functions after they pushed {r4-r5,r7,r10,lr} and {q4-q7}.
+// In: r0 = dst, r1 = dst stride, r2 = coefficients (32-bit, zeroed here),
+// r3 = eob, r10 = eob threshold, r4 = first transform (32-bit lanes in
+// q8-q15), r5 = second transform (16-bit data in q8-q15).
+// The first pass runs on four coefficients per row at a time; the second
+// group of four (16 bytes further into each row) is skipped and replaced by
+// zeros when eob is below the threshold.
+function inv_txfm_add_8x8_neon
+ vmov.i32 q0, #0
+ // r7 = distance in bytes between consecutive loads (8 coefficients * 4).
+ mov r7, #8*4
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ blx r4
+
+ // Narrow to 16 bit with a rounding shift by 1, pairing output n with
+ // output n+4 in each q register.
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q12, #1
+ vqrshrn.s32 d18, q9, #1
+ vqrshrn.s32 d19, q13, #1
+ vqrshrn.s32 d20, q10, #1
+ vqrshrn.s32 d21, q14, #1
+ vqrshrn.s32 d22, q11, #1
+ vqrshrn.s32 d23, q15, #1
+
+ cmp r3, r10
+ transpose_4x8h q8, q9, q10, q11
+
+ blt 1f
+
+ // Rewind r2 by the eight steps taken above and keep the first half of the
+ // result on the stack while the second half is computed.
+ sub r2, r2, r7, lsl #3
+ vpush {q8-q11}
+
+ add r2, r2, #16
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ blx r4
+
+ // Narrow into q12-q15; done from the top down so that no source register
+ // is overwritten before it has been read.
+ vqrshrn.s32 d31, q15, #1
+ vqrshrn.s32 d30, q11, #1
+ vqrshrn.s32 d29, q14, #1
+ vqrshrn.s32 d28, q10, #1
+ vqrshrn.s32 d27, q13, #1
+ vqrshrn.s32 d26, q9, #1
+ vqrshrn.s32 d25, q12, #1
+ vqrshrn.s32 d24, q8, #1
+ vpop {q8-q11}
+
+ transpose_4x8h q12, q13, q14, q15
+
+ b 2f
+
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+
+2:
+ blx r5
+
+ load_add_store_8x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+// Define the exported function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon.
+// dct_dct gets the idct_dc shortcut for eob == 0. Otherwise the function
+// saves registers, loads \eob_half into r10 (the threshold below which
+// inv_txfm_add_8x8_neon skips the second half of the first pass), points r4
+// at the local 32-bit first transform and r5 at the external
+// inv_\txfm2\()_8h_x8_neon second transform, and branches to the shared body.
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ push {r4-r5,r7,r10,lr}
+ vpush {q4-q7}
+ mov r10, #\eob_half
+ movrel_local r4, inv_\txfm1\()_4s_x8_neon
+ movrel r5, X(inv_\txfm2\()_8h_x8_neon)
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+// Instantiate the 8x8 entry points; the third argument is the eob threshold
+// (eob_half) passed to inv_txfm_add_8x8_neon in r10.
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+// Shared body of the 8x4 16 bpc transforms; entered by a branch from the
+// def_fn_48 functions after they pushed {r4-r5,r7,r10,lr} and {q4-q7}.
+// In: r0 = dst, r1 = dst stride, r2 = coefficients (thirty-two 32-bit values,
+// zeroed here), r4 = first transform (32-bit lanes in q8-q15),
+// r5 = second transform (16-bit data in q8-q11).
+// The eob value is not examined here; all coefficients are always processed.
+function inv_txfm_add_8x4_neon
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vdup.32 d4, r12
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+
+ // Rounded multiply of every coefficient by 2896/4096 before the first pass.
+ scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ vqmovn.s32 d20, q12
+ vqmovn.s32 d21, q13
+ vqmovn.s32 d22, q14
+ vqmovn.s32 d23, q15
+ // Transpose the two 4x4 halves, then rearrange the d registers so that
+ // each of q8-q11 holds the data the second pass expects.
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5
+
+ load_add_store_8x4 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+// Shared body of the 4x8 16 bpc transforms; entered by a branch from the
+// def_fn_48 functions after they pushed {r4-r5,r7,r10,lr} and {q4-q7}.
+// In: r0 = dst, r1 = dst stride, r2 = coefficients (32-bit, zeroed here),
+// r3 = eob, r10 = eob threshold, r4 = first transform (32-bit lanes in
+// q8-q11), r5 = second transform (16-bit data in q8-q11).
+// The first pass runs on four coefficients per row at a time. The group at
+// byte offset 16 is processed first into q12-q13, or replaced by zeros when
+// eob is below the threshold; the group at offset 0 is always processed.
+function inv_txfm_add_4x8_neon
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ cmp r3, r10
+ // r7 = distance in bytes between consecutive loads.
+ mov r7, #32
+ blt 1f
+
+ add r2, r2, #16
+ vdup.32 d2, r12
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ // Rounded multiply by 2896/4096, then rewind r2 by the four steps above.
+ scale_input d2[0], q8, q9, q10, q11
+ sub r2, r2, r7, lsl #2
+
+ blx r4
+
+ sub r2, r2, #16
+
+ vqmovn.s32 d24, q8
+ vqmovn.s32 d25, q9
+ vqmovn.s32 d26, q10
+ vqmovn.s32 d27, q11
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ b 2f
+
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+
+2:
+ // The constants are set up again here since r12, q0 and d2 may have been
+ // overwritten by the first transform call above.
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ vdup.32 d2, r12
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+ scale_input d2[0], q8, q9, q10, q11
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ // Place the result of the other group behind the one just computed.
+ vmov q10, q12
+ vmov q11, q13
+
+ blx r5
+
+ load_add_store_4x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+// Define the exported function
+// inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon for 4x8 and 8x4.
+// dct_dct gets the idct_dc shortcut for eob == 0. Otherwise the function
+// saves registers, points r4 at the local 32-bit first transform and r5 at
+// the external second transform, and branches to inv_txfm_add_\w\()x\h\()_neon.
+// \eob_half is loaded into r10 only for \w == 4, since only
+// inv_txfm_add_4x8_neon compares eob against r10.
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ push {r4-r5,r7,r10,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov r10, #\eob_half
+.endif
+ movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+// Instantiate def_fn_48 for all sixteen transform combinations of one block
+// size; the last argument of each line is the eob_half threshold.
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+// Emit the 4x8 and 8x4 entry points.
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+// 16-point inverse DCT on d registers (two 32-bit lanes each), in place on
+// d16-d31. The even-indexed inputs (d16, d18, ..., d30) go through
+// idct_2s_x8; the odd-indexed ones are combined with the idct16 rows of
+// idct_coeffs. Intermediate values are clamped to [-0x20000, 0x1ffff];
+// products are rounded with a right shift by 12.
+// Clobbers r12 and d0-d9.
+function inv_dct_2s_x16_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
+
+ // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmax.s32 \r, \r, d8
+.endr
+
+ // Load the two idct16 rows, then point r12 back at the start of the table.
+ vld1.32 {q0, q1}, [r12, :128]
+ sub r12, r12, #32
+
+ vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a
+ vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a
+ vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a
+ vrshr.s32 d17, d4, #12 // t8a
+ vrshr.s32 d31, d5, #12 // t15a
+ vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a
+ vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a
+ vrshr.s32 d23, d6, #12 // t9a
+ vrshr.s32 d25, d4, #12 // t14a
+ vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a
+ vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a
+ vrshr.s32 d21, d5, #12 // t10a
+ vrshr.s32 d27, d6, #12 // t13a
+ vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a
+ vrshr.s32 d19, d4, #12 // t11a
+ vrshr.s32 d29, d5, #12 // t12a
+
+ // Reload the idct4 row (2896, ..., 1567, 3784) for the remaining stages.
+ vld1.32 {q0}, [r12, :128]
+
+ vqsub.s32 d4, d17, d23 // t9
+ vqadd.s32 d17, d17, d23 // t8
+ vqsub.s32 d5, d31, d25 // t14
+ vqadd.s32 d31, d31, d25 // t15
+ vqsub.s32 d23, d19, d21 // t10
+ vqadd.s32 d19, d19, d21 // t11
+ vqadd.s32 d25, d29, d27 // t12
+ vqsub.s32 d29, d29, d27 // t13
+
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
+ vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
+ vrshr.s32 d21, d6, #12 // t9a
+ vrshr.s32 d27, d7, #12 // t14a
+
+ vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a
+ vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a
+ vrshr.s32 d29, d6, #12 // t13a
+ vneg.s32 d7, d7
+ vrshr.s32 d23, d7, #12 // t10a
+
+ vqsub.s32 d4, d17, d19 // t11a
+ vqadd.s32 d17, d17, d19 // t8a
+ vqsub.s32 d5, d31, d25 // t12a
+ vqadd.s32 d31, d31, d25 // t15a
+ vqadd.s32 d19, d21, d23 // t9
+ vqsub.s32 d21, d21, d23 // t10
+ vqsub.s32 d25, d27, d29 // t13
+ vqadd.s32 d27, d27, d29 // t14
+
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
+ vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
+ vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
+
+ vrshr.s32 d6, d6, #12 // t11
+ vrshr.s32 d7, d7, #12 // t12
+ vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a
+ vrshr.s32 d4, d4, #12 // t10a
+ vrshr.s32 d5, d5, #12 // t13a
+
+ // Final butterflies. The clamp constants in d8/d9 are no longer needed, so
+ // those registers and d5 serve as temporaries for out0, out8 and out6.
+ vqadd.s32 d8, d16, d31 // out0
+ vqsub.s32 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s32 d23, d30, d17 // out7
+ vqsub.s32 d9, d30, d17 // out8
+ vqadd.s32 d17, d18, d27 // out1
+ vqsub.s32 d30, d18, d27 // out14
+ vqadd.s32 d18, d20, d5 // out2
+ vqsub.s32 d29, d20, d5 // out13
+ vqadd.s32 d5, d28, d19 // out6
+ vqsub.s32 d25, d28, d19 // out9
+ vqadd.s32 d19, d22, d7 // out3
+ vqsub.s32 d28, d22, d7 // out12
+ vqadd.s32 d20, d24, d6 // out4
+ vqsub.s32 d27, d24, d6 // out11
+ vqadd.s32 d21, d26, d4 // out5
+ vqsub.s32 d26, d26, d4 // out10
+ vmov d24, d9
+ vmov d22, d5
+
+ bx lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 // 16-point inverse ADST on d16-d31 (two 32-bit lanes each); o0-o15 name the d register receiving each output. Clobbers r12, q0-q3, d8, d10, d11.
+ movrel_local r12, iadst16_coeffs
+ vld1.32 {q0, q1}, [r12, :128]! // first 8 coefficients; r12 advances to the next 8
+ // Stage 1: combine mirrored input pairs (d31/d16, d29/d18, ...) into t0-t15. vmul_vmla/vmul_vmls are helper macros defined elsewhere (presumably a*c0 +/- b*c1); each result is rounded down by 12 bits.
+ vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0
+ vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1
+ vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2
+ vrshr.s32 d16, d4, #12 // t0
+ vrshr.s32 d31, d6, #12 // t1
+ vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3
+ vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4
+ vrshr.s32 d18, d8, #12 // t2
+ vrshr.s32 d29, d4, #12 // t3
+ vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5
+ vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6
+ vrshr.s32 d20, d6, #12 // t4
+ vrshr.s32 d27, d8, #12 // t5
+ vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7
+ vld1.32 {q0, q1}, [r12, :128] // second 8 iadst16 coefficients
+ movrel_local r12, idct_coeffs // r12 is re-pointed for the later stages
+ vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8
+ vrshr.s32 d22, d4, #12 // t6
+ vrshr.s32 d25, d6, #12 // t7
+ vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9
+ vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10
+ vrshr.s32 d23, d8, #12 // t8
+ vrshr.s32 d24, d4, #12 // t9
+ vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11
+ vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12
+ vrshr.s32 d21, d6, #12 // t10
+ vrshr.s32 d26, d8, #12 // t11
+ vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13
+ vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14
+ vrshr.s32 d19, d4, #12 // t12
+ vrshr.s32 d28, d6, #12 // t13
+ vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15
+ vrshr.s32 d17, d8, #12 // t14
+ vrshr.s32 d30, d4, #12 // t15
+
+ vld1.32 {q0, q1}, [r12, :128] // first 8 entries of idct_coeffs, used by all remaining stages
+ // Clamp limits live in d11 (max) and d10 (min) for the rest of the macro.
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ // Stage 2: saturating butterflies between t0-t7 and t8-t15.
+ vqsub.s32 d5, d16, d23 // t8a
+ vqadd.s32 d16, d16, d23 // t0a
+ vqsub.s32 d7, d31, d24 // t9a
+ vqadd.s32 d31, d31, d24 // t1a
+ vqadd.s32 d23, d18, d21 // t2a
+ vqsub.s32 d18, d18, d21 // t10a
+ vqadd.s32 d24, d29, d26 // t3a
+ vqsub.s32 d29, d29, d26 // t11a
+ vqadd.s32 d21, d20, d19 // t4a
+ vqsub.s32 d20, d20, d19 // t12a
+ vqadd.s32 d26, d27, d28 // t5a
+ vqsub.s32 d27, d27, d28 // t13a
+ vqadd.s32 d19, d22, d17 // t6a
+ vqsub.s32 d22, d22, d17 // t14a
+ vqadd.s32 d28, d25, d30 // t7a
+ vqsub.s32 d25, d25, d30 // t15a
+ // Clamp all 16 intermediates to [row_clip_min, row_clip_max].
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmax.s32 \r, \r, d10
+.endr
+ // Stage 3: rotate the t8a-t15a half with the coefficients in q1 (d2/d3).
+ vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
+ vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
+ vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
+ vrshr.s32 d17, d4, #12 // t8
+ vrshr.s32 d30, d6, #12 // t9
+ vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11
+ vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12
+ vrshr.s32 d18, d8, #12 // t10
+ vrshr.s32 d29, d4, #12 // t11
+ vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13
+ vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14
+ vrshr.s32 d27, d6, #12 // t12
+ vrshr.s32 d20, d8, #12 // t13
+ vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15
+ vrshr.s32 d25, d4, #12 // t14
+ vrshr.s32 d22, d6, #12 // t15
+ // Stage 4 butterflies. d2/d3 are reused as temporaries from here on; only d0/d1 still hold coefficients.
+ vqsub.s32 d2, d16, d21 // t4
+ vqadd.s32 d16, d16, d21 // t0
+ vqsub.s32 d3, d31, d26 // t5
+ vqadd.s32 d31, d31, d26 // t1
+ vqadd.s32 d21, d23, d19 // t2
+ vqsub.s32 d23, d23, d19 // t6
+ vqadd.s32 d26, d24, d28 // t3
+ vqsub.s32 d24, d24, d28 // t7
+ vqadd.s32 d19, d17, d27 // t8a
+ vqsub.s32 d17, d17, d27 // t12a
+ vqadd.s32 d28, d30, d20 // t9a
+ vqsub.s32 d30, d30, d20 // t13a
+ vqadd.s32 d27, d18, d25 // t10a
+ vqsub.s32 d18, d18, d25 // t14a
+ vqadd.s32 d20, d29, d22 // t11a
+ vqsub.s32 d29, d29, d22 // t15a
+ // Clamp again before the next multiplies.
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmax.s32 \r, \r, d10
+.endr
+ // Stage 5: rotations that all use the coefficient pair in d1.
+ vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
+ vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
+ vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
+ vrshr.s32 d22, d4, #12 // t4a
+ vrshr.s32 d25, d6, #12 // t5a
+ vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a
+ vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12
+ vrshr.s32 d24, d8, #12 // t6a
+ vrshr.s32 d23, d4, #12 // t7a
+ vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13
+ vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14
+ vrshr.s32 d17, d6, #12 // t12
+ vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15
+ vrshr.s32 d29, d8, #12 // t13
+ vrshr.s32 d30, d4, #12 // t14
+ vrshr.s32 d18, d6, #12 // t15
+ // Stage 6: final butterflies; half of the results are already outputs.
+ vqsub.s32 d2, d16, d21 // t2a
+.ifc \o0, d16 // natural order: out0 can be written in place
+ vqadd.s32 \o0, d16, d21 // out0
+ vqsub.s32 d21, d31, d26 // t3a
+ vqadd.s32 \o15,d31, d26 // out15
+.else // other orders: stage out0 in d4 so the o0 register is not overwritten before d31 has been read below
+ vqadd.s32 d4, d16, d21 // out0
+ vqsub.s32 d21, d31, d26 // t3a
+ vqadd.s32 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+
+ vqsub.s32 d3, d29, d18 // t15a
+ vqadd.s32 \o13,d29, d18 // out13
+ vqadd.s32 \o2, d17, d30 // out2
+ vqsub.s32 d26, d17, d30 // t14a
+
+ vqadd.s32 \o1, d19, d27 // out1
+ vqsub.s32 d27, d19, d27 // t10
+ vqadd.s32 \o14,d28, d20 // out14
+ vqsub.s32 d20, d28, d20 // t11
+
+ vqadd.s32 \o3, d22, d24 // out3
+ vqsub.s32 d22, d22, d24 // t6
+ vqadd.s32 \o12,d25, d23 // out12
+ vqsub.s32 d23, d25, d23 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmax.s32 \r, \r, d10
+.endr
+ // These four outputs are negated (saturating).
+ vqneg.s32 \o15, \o15 // out15
+ vqneg.s32 \o13,\o13 // out13
+ vqneg.s32 \o1, \o1 // out1
+ vqneg.s32 \o3, \o3 // out3
+ // Remaining outputs: sum/difference pairs scaled by d0[0]. They are computed into fixed registers and moved or negated into place at the end.
+ vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vrshr.s32 d24, d24, #12 // out8
+ vrshr.s32 d4, d4, #12 // out7
+ vrshr.s32 d5, d6, #12 // out5
+ vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vrshr.s32 d26, d8, #12 // out10
+
+ vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vrshr.s32 \o4, d2, #12 // out4
+ vrshr.s32 d7, d6, #12 // out9
+ vrshr.s32 d6, d8, #12 // out11
+ vrshr.s32 \o6, d22, #12 // out6
+ // out8/out10 were computed in d24/d26, which are already o8/o10 in natural order; move them when o8 is d23 (the flipped order used by inv_flipadst_2s_x16_neon).
+.ifc \o8, d23
+ vmov \o8, d24
+ vmov \o10,d26
+.endif
+ // The last four outputs are negated while being moved into place.
+ vqneg.s32 \o7, d4 // out7
+ vqneg.s32 \o5, d5 // out5
+ vqneg.s32 \o11,d6 // out11
+ vqneg.s32 \o9, d7 // out9
+.endm
+
+function inv_adst_2s_x16_neon // 16-point inverse ADST on d16-d31 (two 32-bit lanes each); out0..out15 are left in d16..d31
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_2s_x16_neon // same transform as iadst_16 with the output order reversed: out0..out15 are left in d31..d16
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_2s_x16_neon // scales every 32-bit lane of q8-q15 by 2*5793/4096 with saturation; clobbers r12, d0 and q1
+ mov r12, #0
+ movt r12, #2*(5793-4096)*8 // r12 = (2*(5793-4096)*8) << 16
+ vdup.32 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q1, \i, d0[0] // q1 = x * 2*(5793-4096)/4096 (rounding, doubling, high half)
+ vqadd.s32 \i, \i, \i // 2*x
+ vqadd.s32 \i, \i, q1 // 2*x + q1 = x * 2*5793/4096
+.endr
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 c // for each of q8-q15: x += round(vqrdmulh(x, c) / 2); c is a 32-bit scalar lane; clobbers q2
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q2, \i, \c // rounding doubling multiply, high half
+ vrshr.s32 q2, q2, #1 // halve with rounding
+ vqadd.s32 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x4 c // for each of q8-q15: x = 2*x + vqrdmulh(x, c), all saturating; c is a 32-bit scalar lane; clobbers q2
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q2, \i, \c // must read x before it is doubled below
+ vqadd.s32 \i, \i, \i
+ vqadd.s32 \i, \i, q2
+.endr
+.endm
+
+.macro def_horz_16 scale=0, shift=2, suffix // emits inv_txfm_horz<suffix>_16x2_neon: first pass over two rows of 16 coefficients. In: r7 = source, r8 = byte step between loads, r6 = output, r4 = transform to call.
+function inv_txfm_horz\suffix\()_16x2_neon
+ push {lr}
+ vmov.i32 d7, #0
+.if \scale
+ mov_const r12, 2896*8*(1<<16)
+ vdup.32 d1, r12 // scale factor handed to scale_input below
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64] // two 32-bit coefficients per register
+ vst1.32 {d7}, [r7, :64], r8 // zero the source behind us, then step by r8
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ blx r4 // transform selected by the caller, operating on d16-d31
+ vqrshrn.s32 d16, q8, #\shift // round, shift and narrow the results to 16 bit
+ vqrshrn.s32 d17, q9, #\shift
+ vqrshrn.s32 d18, q10, #\shift
+ vqrshrn.s32 d19, q11, #\shift
+ vqrshrn.s32 d20, q12, #\shift
+ vqrshrn.s32 d21, q13, #\shift
+ vqrshrn.s32 d22, q14, #\shift
+ vqrshrn.s32 d23, q15, #\shift
+ vuzp.16 q8, q9 // de-interleave the two lanes: q8/q10 get one row, q9/q11 the other
+ vuzp.16 q10, q11
+ // Store row 0 (q8, q10) then row 1 (q9, q11): 64 bytes, r6 post-incremented.
+.irp i, q8, q10, q9, q11
+ vst1.16 {\i}, [r6, :128]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_4x16_neon // second pass for a 4-column strip. In: r7 = source, r8 = byte step per row, r5 = transform to call, r6 = destination argument for load_add_store_4x16.
+ push {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8 // 16 rows of four 16-bit values
+.endr
+ blx r5 // transform selected by the caller
+ load_add_store_4x16 r6, r7 // helper macro defined elsewhere; presumably adds the result to the pixels at r6
+ pop {pc}
+endfunc
+
+function inv_txfm_add_16x16_neon // shared 16x16 body; reached by branch from the def_fn_16x16 wrappers, which pushed r4-r11,lr and q4-q7 and set r4, r5 and r10
+ sub_sp_align 512 // scratch for the first-pass output (16x16 16-bit values)
+ ldrh r11, [r10], #2 // first threshold from the table in r10
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14 // first pass, two scratch rows per iteration
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(16 - \i) // rows still to be zero-filled if we leave the loop here
+ cmp r3, r11 // r3 is presumably the eob argument
+ blt 1f // below the threshold: skip the remaining slices
+.if \i < 14
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4 // byte step between loads inside inv_txfm_horz_16x2_neon
+ bl inv_txfm_horz_16x2_neon
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2: // zero the rest of the scratch buffer, 64 bytes (two rows) per iteration
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12 // second pass, one 4-column strip per iteration
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32 // scratch row size in bytes
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7} // undo the pushes done by the wrapper
+ pop {r4-r11,pc}
+endfunc
+
+const eob_16x16 // thresholds that inv_txfm_add_16x16_neon compares against r3, one per two-row slice after the first
+ .short 3, 10, 21, 36, 55, 78, 105, 256
+endconst
+
+const eob_16x16_identity // variant selected by def_fn_16x16 when exactly one of the two transforms is identity
+ .short 2, 4, 6, 8, 10, 12, 14, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2 // emits the exported 16x16 entry point for one transform pair; it sets up r4, r5 and r10 and branches to inv_txfm_add_16x16_neon
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2 // helper macro defined elsewhere; only used for dct_dct
+.endif
+ push {r4-r11,lr} // popped again at the end of inv_txfm_add_16x16_neon
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_2s_x16_neon // first-pass transform
+ movrel r5, X(inv_\txfm2\()_4h_x16_neon) // second-pass transform
+.ifc \txfm1, identity // threshold table: eob_16x16_identity when exactly one transform is identity, eob_16x16 otherwise
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16
+.else
+ movrel_local r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon // tail call; the shared body returns to our caller
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon // shared 16x4 body; reached by branch from the def_fn_416 wrappers with r4/r5 = first/second pass transforms and r10 = threshold
+ cmp r3, r10
+ mov r11, #16 // byte step between successive coefficient loads
+ blt 1f // r3 below the threshold: skip the first pass over the second half
+ // First pass over the second 8 bytes of every 16-byte line.
+ add r6, r2, #8
+ vmov.i32 d4, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r6, :64]
+ vst1.32 {d4}, [r6, :64], r11 // zero the coefficients as they are consumed
+.endr
+ blx r4
+ // Round, halve and narrow to 16 bit.
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ vuzp.16 q8, q9 // split the two lanes into separate rows
+ mov r6, sp
+ vuzp.16 q10, q11
+ vpush {q8-q11} // park this half (64 bytes) on the stack
+
+ b 2f
+
+1: // second half skipped: park 64 bytes of zeros instead
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ mov r6, sp
+ vpush {q8-q9}
+ vpush {q8-q9}
+
+2: // first pass over the first 8 bytes of every line (always done)
+ vmov.i32 d4, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r2, :64]
+ vst1.32 {d4}, [r2, :64], r11
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ vuzp.16 q8, q9
+ mov r6, sp
+ vuzp.16 q10, q11
+ // q8/q9 now hold outputs 0-7 of this half; keep outputs 8-15 (q10/q11) aside for the second call.
+ vmov q12, q10
+ vmov q13, q11
+ // Second pass, left 8 columns: q8/q9 from this half plus the first 32 parked bytes.
+ vpop {q10-q11}
+ blx r5
+ mov r6, r0
+ load_add_store_8x4 r6, r7
+ // Second pass, right 8 columns: the kept q12/q13 plus the remaining 32 parked bytes.
+ vpop {q10-q11}
+ vmov q8, q12
+ vmov q9, q13
+ blx r5
+ add r6, r0, #16 // 16 bytes further along the destination row
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7} // undo the pushes done by the wrapper
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_4x16_neon // shared 4x16 body; reached by branch from the def_fn_416 wrappers with r4/r5 = first/second pass transforms and r10 = threshold table
+ ldrh r9, [r10, #4] // third table entry: gate for the slice at byte offset 48
+ // The four 16-byte slices of each 64-byte line are handled last-to-first; each but the first is skipped (zeros used) when r3 is below its threshold.
+ mov r11, #64
+ cmp r3, r9
+ ldrh r9, [r10, #2] // preload the next threshold; ldrh leaves the flags from cmp intact
+ blt 1f
+
+ add r6, r2, #48
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11 // zero the coefficients as they are consumed
+.endr
+ blx r4
+ vqrshrn.s32 d28, q8, #1
+ vqrshrn.s32 d29, q9, #1
+ vqrshrn.s32 d30, q10, #1
+ vqrshrn.s32 d31, q11, #1
+ transpose_4x4h q14, q15, d28, d29, d30, d31 // result of this slice stays in q14/q15
+
+ b 2f
+1:
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2: // slice at byte offset 32 -> q12/q13
+ cmp r3, r9
+ ldrh r9, [r10] // preload the first table entry
+ blt 1f
+
+ add r6, r2, #32
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d24, q8, #1
+ vqrshrn.s32 d25, q9, #1
+ vqrshrn.s32 d26, q10, #1
+ vqrshrn.s32 d27, q11, #1
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ b 2f
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+2: // slice at byte offset 16 -> q8/q9 for now, moved to q10/q11 via the stack below
+ cmp r3, r9
+ blt 1f
+
+ add r6, r2, #16
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ b 2f
+1:
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+2: // slice at byte offset 0 is always processed
+ vmov.i16 q2, #0
+ vpush {q8-q9} // q8-q11 are needed as inputs again, so park the previous slice
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ vpop {q10-q11} // parked slice returns as q10/q11, completing q8-q15
+ // Second pass over the assembled q8-q15, then hand over to load_add_store_4x16.
+ blx r5
+
+ load_add_store_4x16 r0, r6
+
+ vpop {q4-q7} // undo the pushes done by the wrapper
+ pop {r4-r11,pc}
+endfunc
+
+const eob_4x16 // threshold table for inv_txfm_add_4x16_neon, which reads the first three entries (byte offsets 0, 2 and 4)
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1 // selected by def_fn_416 when only the first transform is identity
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2 // selected by def_fn_416 when only the second transform is identity
+ .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4 // emits the exported 4x16 or 16x4 entry point for one transform pair; eob_16x4 is only used when w is not 4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1 // helper macro defined elsewhere; only used for dct_dct
+.endif
+ push {r4-r11,lr} // popped again at the end of the shared body
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon // first-pass transform
+ movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon) // second-pass transform
+.ifc \txfm1, identity // 4x16: r10 points at a threshold table chosen by which transform is identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_4x16
+.else
+ movrel_local r10, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_4x16_identity2
+.else
+ movrel_local r10, eob_4x16
+.endif
+.endif
+.else
+ mov r10, #\eob_16x4 // 16x4: r10 is a single immediate threshold, not a table pointer
+ movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon
+ movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon // tail call; the shared body returns to our caller
+endfunc
+.endm
+
+.macro def_fns_416 w, h // instantiates def_fn_416 for all 16 transform pairs; the last argument is the 16x4 threshold (ignored by def_fn_416 when w is 4)
+def_fn_416 \w, \h, dct, dct, 3
+def_fn_416 \w, \h, identity, identity, 3
+def_fn_416 \w, \h, dct, adst, 3
+def_fn_416 \w, \h, dct, flipadst, 3
+def_fn_416 \w, \h, dct, identity, 2
+def_fn_416 \w, \h, adst, dct, 3
+def_fn_416 \w, \h, adst, adst, 3
+def_fn_416 \w, \h, adst, flipadst, 3
+def_fn_416 \w, \h, flipadst, dct, 3
+def_fn_416 \w, \h, flipadst, adst, 3
+def_fn_416 \w, \h, flipadst, flipadst, 3
+def_fn_416 \w, \h, identity, dct, 2
+def_fn_416 \w, \h, adst, identity, 2
+def_fn_416 \w, \h, flipadst, identity, 2
+def_fn_416 \w, \h, identity, adst, 2
+def_fn_416 \w, \h, identity, flipadst, 2
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+function inv_txfm_add_16x8_neon // shared 16x8 body; reached by branch from the def_fn_816 wrappers, which pushed r4-r11,lr and q4-q7 and set r4, r5 and r10
+ sub_sp_align 256 // scratch for the first-pass output (16x8 16-bit values)
+ ldrh r11, [r10], #2 // first threshold from the table in r10
+ // First pass, two scratch rows per iteration, using the scaling variant of the 16x2 helper.
+.irp i, 0, 2, 4, 6
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(8 - \i) // rows still to be zero-filled if we leave the loop here
+ cmp r3, r11
+ blt 1f // below the threshold: skip the remaining slices
+.if \i < 6
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #8*4 // byte step between loads inside the helper
+ bl inv_txfm_horz_scale_16x2_neon
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2: // zero the rest of the scratch buffer, 64 bytes (two rows) per iteration
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+ // Second pass: two strips of 8 columns, 8 rows each.
+.irp i, 0, 8
+ add r7, sp, #(\i*2)
+ mov r8, #32 // scratch row size in bytes
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8
+.endr
+ blx r5
+
+ add r6, r0, #(\i*2)
+ load_add_store_8x8 r6, r7
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7} // undo the pushes done by the wrapper
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_8x16_neon // shared 8x16 body; reached by branch from the def_fn_816 wrappers, which pushed r4-r11,lr and q4-q7 and set r4, r5 and r10
+ add r10, r10, #2 // start at the second table entry; entries are then read with a 4-byte step (every other one)
+ sub_sp_align 256 // scratch for the first-pass output (8x16 16-bit values)
+ ldrh r11, [r10], #4
+ // First pass, four scratch rows per iteration.
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*8*2)
+.if \i > 0
+ mov r8, #(16 - \i) // rows still to be zero-filled if we leave the loop here
+ cmp r3, r11
+ blt 1f // below the threshold: skip the remaining slices
+.if \i < 12
+ ldrh r11, [r10], #4
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4 // byte step between coefficient loads
+
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q2, #0
+ vdup.32 d0, r12 // scale factor handed to scale_input below
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\j}, [r7, :128]
+ vst1.32 {q2}, [r7, :128], r8 // zero the coefficients as they are consumed
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+.irp j, d16, d20, d17, d21, d18, d22, d19, d23 // interleave the two 4x4 halves so that each stored row has 8 values
+ vst1.16 {\j}, [r6, :64]!
+.endr
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2: // zero the rest of the scratch buffer, 64 bytes (four rows) per iteration
+ subs r8, r8, #4
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+ // Second pass: two 4-column strips.
+.irp i, 0, 4
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #16 // scratch row size in bytes
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7} // undo the pushes done by the wrapper
+ pop {r4-r11,pc}
+endfunc
+
+const eob_8x16 // threshold table shared by the 8x16 and 16x8 bodies: 16x8 reads entries 0-2, 8x16 reads entries 1, 3 and 5
+ .short 3, 10, 21, 43, 59, 75, 91, 128
+endconst
+
+const eob_8x16_identity1 // selected by def_fn_816 when only the first transform is identity
+ .short 2, 4, 6, 64, 80, 96, 112, 128
+endconst
+
+const eob_8x16_identity2 // selected by def_fn_816 when only the second transform is identity
+ .short 2, 4, 6, 8, 10, 12, 14, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2 // emits the exported 8x16 or 16x8 entry point for one transform pair; it sets up r4, r5 and r10 and branches to the shared body
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1 // helper macro defined elsewhere; only used for dct_dct
+.endif
+ push {r4-r11,lr} // popped again at the end of the shared body
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_4s_x8_neon // first-pass transform
+ movrel r5, X(inv_\txfm2\()_4h_x16_neon) // second-pass transform
+.else
+ movrel_local r4, inv_\txfm1\()_2s_x16_neon
+ movrel r5, X(inv_\txfm2\()_8h_x8_neon)
+.endif
+.ifc \txfm1, identity // threshold table chosen by which transform is identity (same choice for both sizes)
+.ifc \txfm2, identity
+ movrel_local r10, eob_8x16
+.else
+ movrel_local r10, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_8x16_identity2
+.else
+ movrel_local r10, eob_8x16
+.endif
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon // tail call; the shared body returns to our caller
+endfunc
+.endm
+
+.macro def_fns_816 w, h // instantiates def_fn_816 for all 16 transform pairs of one block size
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+function inv_dct32_odd_2s_x16_neon // odd half of a 32-point inverse DCT on d16-d31 (two 32-bit lanes each); results (out16-out19, t20-t27, out28-out31) are left in d16-d31. Clobbers r12, q0-q3, d8, d10, d11.
+ movrel_local r12, idct_coeffs, 4*16 // start 4*16 into idct_coeffs (presumably a byte offset)
+ vld1.32 {q0, q1}, [r12, :128]!
+ // Stage 1: rotate mirrored input pairs into t16a-t31a, rounding by 12 bits. vmul_vmla/vmul_vmls are helper macros defined elsewhere.
+ vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a
+ vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a
+ vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a
+ vrshr.s32 d16, d4, #12 // t16a
+ vrshr.s32 d31, d6, #12 // t31a
+ vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a
+ vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a
+ vrshr.s32 d24, d8, #12 // t17a
+ vrshr.s32 d23, d4, #12 // t30a
+ vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a
+ vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a
+ vrshr.s32 d20, d6, #12 // t18a
+ vrshr.s32 d27, d8, #12 // t29a
+ vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a
+ vld1.32 {q0, q1}, [r12, :128] // next 8 coefficients
+ sub r12, r12, #4*24 // undo the 4*16 start offset plus the 32-byte post-increment
+ vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a
+ vrshr.s32 d28, d4, #12 // t19a
+ vrshr.s32 d19, d6, #12 // t28a
+ vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a
+ vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a
+ vrshr.s32 d18, d8, #12 // t20a
+ vrshr.s32 d29, d4, #12 // t27a
+ vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a
+ vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a
+ vrshr.s32 d26, d6, #12 // t21a
+ vrshr.s32 d21, d8, #12 // t26a
+ vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a
+ vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a
+ vrshr.s32 d22, d4, #12 // t22a
+ vrshr.s32 d25, d6, #12 // t25a
+ vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a
+ vrshr.s32 d30, d8, #12 // t23a
+ vrshr.s32 d17, d4, #12 // t24a
+
+ vld1.32 {q0, q1}, [r12, :128] // coefficients for the remaining stages
+ // Clamp limits live in d11 (max) and d10 (min) for the rest of the function.
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ // Stage 2: saturating butterflies on neighbouring pairs.
+ vqsub.s32 d5, d16, d24 // t17
+ vqadd.s32 d16, d16, d24 // t16
+ vqsub.s32 d7, d31, d23 // t30
+ vqadd.s32 d31, d31, d23 // t31
+ vqsub.s32 d24, d28, d20 // t18
+ vqadd.s32 d28, d28, d20 // t19
+ vqadd.s32 d23, d18, d26 // t20
+ vqsub.s32 d18, d18, d26 // t21
+ vqsub.s32 d20, d30, d22 // t22
+ vqadd.s32 d30, d30, d22 // t23
+ vqadd.s32 d26, d17, d25 // t24
+ vqsub.s32 d17, d17, d25 // t25
+ vqsub.s32 d22, d29, d21 // t26
+ vqadd.s32 d29, d29, d21 // t27
+ vqadd.s32 d25, d19, d27 // t28
+ vqsub.s32 d19, d19, d27 // t29
+ // Clamp all 16 intermediates to [row_clip_min, row_clip_max].
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmax.s32 \r, \r, d10
+.endr
+ // Stage 3: rotations with the coefficients in q1; two of them are negated with vneg before rounding.
+ vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
+ vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
+ vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
+ vrshr.s32 d21, d4, #12 // t17a
+ vrshr.s32 d27, d6, #12 // t30a
+ vneg.s32 d8, d8 // -> t18a
+ vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a
+ vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a
+ vrshr.s32 d19, d8, #12 // t18a
+ vrshr.s32 d24, d5, #12 // t29a
+ vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a
+ vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a
+ vrshr.s32 d22, d4, #12 // t21a
+ vrshr.s32 d18, d6, #12 // t26a
+ vneg.s32 d8, d8 // -> t22a
+ vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a
+ vrshr.s32 d17, d8, #12 // t22a
+ vrshr.s32 d20, d5, #12 // t25a
+ // Stage 4 butterflies. d2/d3 are reused as temporaries from here on; only d0/d1 still hold coefficients.
+ vqsub.s32 d2, d27, d24 // t29
+ vqadd.s32 d27, d27, d24 // t30
+ vqsub.s32 d3, d21, d19 // t18
+ vqadd.s32 d21, d21, d19 // t17
+ vqsub.s32 d24, d16, d28 // t19a
+ vqadd.s32 d16, d16, d28 // t16a
+ vqsub.s32 d19, d30, d23 // t20a
+ vqadd.s32 d30, d30, d23 // t23a
+ vqsub.s32 d28, d17, d22 // t21
+ vqadd.s32 d17, d17, d22 // t22
+ vqadd.s32 d23, d26, d29 // t24a
+ vqsub.s32 d26, d26, d29 // t27a
+ vqadd.s32 d22, d20, d18 // t25
+ vqsub.s32 d20, d20, d18 // t26
+ vqsub.s32 d29, d31, d25 // t28a
+ vqadd.s32 d31, d31, d25 // t31a
+ // Clamp again before the next multiplies.
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmax.s32 \r, \r, d10
+.endr
+ // Stage 5: rotations that all use the coefficient pair in d1.
+ vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
+ vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
+ vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
+ vrshr.s32 d18, d4, #12 // t18a
+ vrshr.s32 d25, d6, #12 // t29a
+ vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28
+ vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20
+ vrshr.s32 d29, d8, #12 // t19
+ vrshr.s32 d24, d5, #12 // t28
+ vneg.s32 d4, d4 // -> t20
+ vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27
+ vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a
+ vrshr.s32 d26, d4, #12 // t20
+ vrshr.s32 d19, d6, #12 // t27
+ vneg.s32 d8, d8 // -> t21a
+ vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a
+ vrshr.s32 d20, d8, #12 // t21a
+ vrshr.s32 d28, d5, #12 // t26a
+ // Stage 6 butterflies: the sums are final outputs 16-19 and 28-31.
+ vqsub.s32 d2, d16, d30 // t23
+ vqadd.s32 d16, d16, d30 // t16 = out16
+ vqsub.s32 d3, d31, d23 // t24
+ vqadd.s32 d31, d31, d23 // t31 = out31
+ vqsub.s32 d23, d21, d17 // t22a
+ vqadd.s32 d17, d21, d17 // t17a = out17
+ vqadd.s32 d30, d27, d22 // t30a = out30
+ vqsub.s32 d21, d27, d22 // t25a
+ vqsub.s32 d27, d18, d20 // t21
+ vqadd.s32 d18, d18, d20 // t18 = out18
+ vqadd.s32 d4, d29, d26 // t19a = out19
+ vqsub.s32 d26, d29, d26 // t20a
+ vqadd.s32 d29, d25, d28 // t29 = out29
+ vqsub.s32 d25, d25, d28 // t26
+ vqadd.s32 d28, d24, d19 // t28a = out28
+ vqsub.s32 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+ // Unlike the final step of iadst_16, the finished outputs are clamped here together with the temporaries.
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmax.s32 \r, \r, d10
+.endr
+ // Stage 7: sum/difference pairs scaled by d0[0] produce t20-t27, which end up in d20-d27.
+ vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
+ vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
+ vrshr.s32 d20, d4, #12 // t20
+ vrshr.s32 d22, d6, #12 // t27
+
+ vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a
+ vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vrshr.s32 d26, d4, #12 // t26a
+
+ vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22
+ vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25
+ vrshr.s32 d21, d6, #12 // t21a
+ vrshr.s32 d22, d24, #12 // t22
+ vrshr.s32 d25, d4, #12 // t25
+
+ vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a
+ vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a
+ vrshr.s32 d23, d4, #12 // t23a
+ vrshr.s32 d24, d6, #12 // t24a
+
+ bx lr
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix // emits inv_txfm_horz<suffix>_dct_32x2_neon: first-pass 32-point DCT over two rows. In: r7 = source, r8 = byte step between lines, r6 = output.
+function inv_txfm_horz\suffix\()_dct_32x2_neon
+ push {lr}
+ vmov.i32 d7, #0
+ lsl r8, r8, #1 // double the step: each half of the transform reads every other line
+.if \scale
+ mov_const r12, 2896*8*(1<<16)
+ vdup.32 d0, r12
+.endif
+ // Load the 16 lines of the first half, zeroing the source behind us.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4 // rewind the 16 steps just taken ...
+ add r7, r7, r8, lsr #1 // ... and advance by one original step to the other set of lines
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+ // Transpose 2x2 blocks of 32-bit lanes so that each d register holds two outputs of one row.
+ vtrn.32 d16, d17
+ vtrn.32 d18, d19
+ vtrn.32 d20, d21
+ vtrn.32 d22, d23
+ vtrn.32 d24, d25
+ vtrn.32 d26, d27
+ vtrn.32 d28, d29
+ vtrn.32 d30, d31
+ // Park the first-half results (still 32 bit) in the output buffer; store2 reads them back below.
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+.endm
+ store1 d16, d18, d20, d22
+ store1 d24, d26, d28, d30
+ store1 d17, d19, d21, d23
+ store1 d25, d27, d29, d31
+.purgem store1
+ sub r6, r6, #64*2 // back to the start of the 128 bytes just written
+ // Load the 16 remaining lines for the odd half.
+ vmov.i32 d7, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_2s_x16_neon
+ vtrn.32 d31, d30 // same 2x2 transpose, with the register pairs taken in reverse order
+ vtrn.32 d29, d28
+ vtrn.32 d27, d26
+ vtrn.32 d25, d24
+ vtrn.32 d23, d22
+ vtrn.32 d21, d20
+ vtrn.32 d19, d18
+ vtrn.32 d17, d16
+.macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift // combines one parked row with the odd half; a bare r6 below is the register, arguments are written with a backslash
+ vld1.32 {q0, q1}, [r6, :128]! // read back the 64 parked bytes of this row
+ vld1.32 {q2, q3}, [r6, :128]
+ sub r6, r6, #32 // back to the row start for the stores below
+ vqsub.s32 d15, d0, \r0 // differences are collected in d15 down to d8, i.e. in reverse order
+ vqadd.s32 d0, d0, \r0 // sums stay in d0-d7
+ vqsub.s32 d14, d1, \r1
+ vqadd.s32 d1, d1, \r1
+ vqsub.s32 d13, d2, \r2
+ vqadd.s32 d2, d2, \r2
+ vqsub.s32 d12, d3, \r3
+ vqadd.s32 d3, d3, \r3
+ vqsub.s32 d11, d4, \r4
+ vqadd.s32 d4, d4, \r4
+ vqsub.s32 d10, d5, \r5
+ vqadd.s32 d5, d5, \r5
+ vqsub.s32 d9, d6, \r6
+ vqadd.s32 d6, d6, \r6
+ vqsub.s32 d8, d7, \r7
+ vqadd.s32 d7, d7, \r7
+ vqrshrn.s32 d0, q0, #\shift // round, shift and narrow everything to 16 bit
+ vqrshrn.s32 d1, q1, #\shift
+ vqrshrn.s32 d2, q2, #\shift
+ vqrshrn.s32 d3, q3, #\shift
+ vqrshrn.s32 d4, q4, #\shift
+ vqrshrn.s32 d5, q5, #\shift
+ vqrshrn.s32 d6, q6, #\shift
+ vqrshrn.s32 d7, q7, #\shift
+ vrev32.16 q2, q2 // swap the 16-bit halves of each pair in the difference half
+ vrev32.16 q3, q3
+ vst1.16 {q0, q1}, [r6, :128]! // 64 bytes of 16-bit results; r6 ends at the next parked row
+ vst1.16 {q2, q3}, [r6, :128]!
+.endm
+
+ store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift
+ store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_4x32_neon // second-pass 32-point DCT for a 4-column strip, added to the destination. In: r7 = source, r8 = byte step per row, r6 = destination, r1 = destination step.
+ push {r10-r11,lr}
+ lsl r8, r8, #1 // double the step: each half of the transform reads every other row
+ // First set of 16 rows -> 16-point DCT.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4 // rewind to the first row
+
+ bl X(inv_dct_4h_x16_neon)
+ // Write the results back over the rows they came from; the registers are needed for the odd half.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1 // move on by one original row step
+ // Second set of 16 rows -> odd half of the 32-point DCT.
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1 // back to the rows holding the stored first-half results
+ bl X(inv_dct32_odd_4h_x16_neon)
+
+ neg r9, r8 // negative step for the backward walk in the second half
+ mov r10, r6 // r10 reads destination rows, r6 writes them
+ vmov.i16 q6, #0 // lower clamp limit
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride // four output rows per use; a bare r1 below is the register (destination step), arguments are written with a backslash
+ vld1.16 {d4}, [r7, :64], \stride // stored first-half result
+ vld1.16 {d0}, [r10, :64], r1 // current destination row
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.16 {d1}, [r10, :64], r1
+ \op\().s16 d4, d4, \r0 // add or subtract the odd-half result
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.16 {d2}, [r10, :64], r1
+ \op\().s16 d5, d5, \r1
+ vld1.16 {d3}, [r10, :64], r1
+ vrshr.s16 q2, q2, #4 // final rounding shift
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vqadd.s16 q0, q0, q2 // add to the destination values
+ \op\().s16 d7, d7, \r3
+ vmax.s16 q0, q0, q6 // clamp to [0, 0x3ff]
+ vrshr.s16 q3, q3, #4
+ vmin.s16 q0, q0, q7
+ vqadd.s16 q1, q1, q3
+ vst1.16 {d0}, [r6, :64], r1
+ vmax.s16 q1, q1, q6
+ vst1.16 {d1}, [r6, :64], r1
+ vmin.s16 q1, q1, q7
+ vst1.16 {d2}, [r6, :64], r1
+ vst1.16 {d3}, [r6, :64], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8 // first 16 output rows: first-half results walked forward, odd-half registers taken from d31 down to d16
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8 // step back onto the last stored row
+ combine d16, d17, d18, d19, vqsub, r9 // last 16 output rows: stored rows walked backward, odd half subtracted
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
+
+const eob_32x32 // threshold table; inv_txfm_add_identity_identity_32x32_16bpc_neon walks it from byte offset 6 in steps of 8 and from byte offset 2 in steps of 4
+ .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024
+endconst
+
+const eob_16x32 // threshold table referenced by def_identity_1632 through the name eob_16x32 plus an optional suffix
+ .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512
+endconst
+
+const eob_16x32_shortside // 8-entry variant selected by def_identity_1632 with the _shortside suffix
+ .short 3, 10, 21, 36, 55, 78, 105, 512
+endconst
+
+const eob_8x32 // threshold table loaded by def_identity_832 starting at byte offset 2
+ .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 // identity/identity 32x32: no transform call, coefficients are narrowed, transposed and added in 8x4 tiles. r0 = destination, r1 = its step, r2 = coefficients, r3 = value compared with the thresholds (presumably eob).
+ push {r4-r7,lr}
+ vpush {q6-q7}
+ movrel_local r5, eob_32x32, 2 // outer-loop thresholds: byte offset 2, advanced by 4 per use
+
+ mov r6, #4*32 // byte step between coefficient lines
+1: // outer loop: one band of four destination rows
+ mov r12, #0 // counts the coefficient lines consumed in this band
+ movrel_local r4, eob_32x32, 6 // inner-loop thresholds: byte offset 6, advanced by 8 per use
+2: // inner loop: one 8x4 tile
+ vmov.i32 q0, #0
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r6 // zero the coefficients as they are consumed
+.endr
+ vqmovn.s32 d16, q8 // saturating narrow; lines k and k+4 are paired in one q register
+ vqmovn.s32 d17, q12
+ vqmovn.s32 d18, q9
+ vqmovn.s32 d19, q13
+ vqmovn.s32 d20, q10
+ vqmovn.s32 d21, q14
+ vqmovn.s32 d22, q11
+ vqmovn.s32 d23, q15
+ transpose_4x8h q8, q9, q10, q11
+
+ load_add_store_8x4 r0, r7, shiftbits=2
+ ldrh lr, [r4], #8
+ sub r0, r0, r1, lsl #2 // back up four destination rows ...
+ cmp r3, lr
+ add r0, r0, #2*8 // ... and move 8 pixels (16 bytes) to the right
+ bge 2b // continue along the band while r3 >= threshold
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f // no further bands needed
+ // Set up the next band.
+ sub r0, r0, r12, lsl #1 // undo the horizontal advance (2 bytes per consumed line)
+ add r0, r0, r1, lsl #2 // down four destination rows
+ mls r2, r6, r12, r2 // rewind the coefficient pointer by the lines consumed
+ add r2, r2, #4*4 // next four coefficients within each line
+ b 1b
+9:
+ vpop {q6-q7}
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift // applies the shift instruction op with immediate shift, in place, to each of q8-q15
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort // emits the identity/identity entry point for 16x32 or 32x16; wshort/hshort are suffixes appended to eob_16x32 to pick the threshold table for the inner/outer loop
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q6-q7}
+ mov r9, #0
+ mov_const r8, 2896*8*(1<<16) // factor handed to scale_input
+ movt r9, #2*(5793-4096)*8 // r9 = (2*(5793-4096)*8) << 16, the identity scaling constant
+ movrel_local r5, eob_16x32\hshort, 2 // outer-loop thresholds: byte offset 2, advanced by 4 per use
+
+ mov r6, #4*\h // byte step between coefficient lines
+1: // outer loop: one band of four destination rows
+ mov r12, #0 // counts the coefficient lines consumed in this band
+ movrel_local r4, eob_16x32\wshort, 6 // inner-loop thresholds: byte offset 6, advanced by 8 per use
+2: // inner loop: one 8x4 tile
+ vdup.i32 d0, r8 // d0[0] = scale_input factor
+ vmov.i32 q1, #0
+ vmov.32 d0[1], r9 // d0[1] = identity constant
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q1}, [r2, :128], r6 // zero the coefficients as they are consumed
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x4_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s32, 1
+ identity_8x4 d0[1]
+.endif
+ vqmovn.s32 d16, q8 // saturating narrow; lines k and k+4 are paired in one q register
+ vqmovn.s32 d17, q12
+ vqmovn.s32 d18, q9
+ vqmovn.s32 d19, q13
+ vqmovn.s32 d20, q10
+ vqmovn.s32 d21, q14
+ vqmovn.s32 d22, q11
+ vqmovn.s32 d23, q15
+ transpose_4x8h q8, q9, q10, q11
+
+.if \w == 16
+ load_add_store_8x4 r0, r7, shiftbits=2
+.else
+ load_add_store_8x4 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #8
+ sub r0, r0, r1, lsl #2 // back up four destination rows ...
+ cmp r3, lr
+ add r0, r0, #2*8 // ... and move 8 pixels (16 bytes) to the right
+ bge 2b // continue along the band while r3 >= threshold
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f // no further bands needed
+ // Set up the next band.
+ sub r0, r0, r12, lsl #1 // undo the horizontal advance (2 bytes per consumed line)
+ add r0, r0, r1, lsl #2 // down four destination rows
+ mls r2, r6, r12, r2 // rewind the coefficient pointer by the lines consumed
+ add r2, r2, #4*4 // next four coefficients within each line
+ b 1b
+9:
+ vpop {q6-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h // defines the identity/identity function for w x h (8x32 or 32x8)
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q6-q7}
+ movrel_local r4, eob_8x32, 2 // threshold pointer (third argument is presumably a byte offset into the table)
+
+ mov r12, #4*\h // byte stride used when stepping r2
+1:
+ ldrh lr, [r4], #4 // next threshold, stepping 2 table entries
+.if \w == 8
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r12 // zero the coefficients just loaded, then step r2 by r12
+.endr
+
+ vqrshrn.s32 d16, q8, #1 // rounding shift right by 1 with saturating narrow to 16 bit
+ vqrshrn.s32 d17, q12, #1
+ vqrshrn.s32 d18, q9, #1
+ vqrshrn.s32 d19, q13, #1
+ vqrshrn.s32 d20, q10, #1
+ vqrshrn.s32 d21, q14, #1
+ vqrshrn.s32 d22, q11, #1
+ vqrshrn.s32 d23, q15, #1
+
+ transpose_4x8h q8, q9, q10, q11
+
+ cmp r3, lr // flags are consumed by the blt after the macro below
+ load_add_store_8x4 r0, r5, shiftbits=2
+ blt 9f
+ sub r2, r2, r12, lsl #3 // rewind r2 by the 8 strides consumed above
+ add r2, r2, #4*4 // r2 += 16 bytes
+.else
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vld1.32 {q8, q9}, [r2, :128] // four loads of 32 bytes each, zeroing the source after each load
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q10, q11}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q12, q13}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q14, q15}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vqmovn.s32 d16, q8 // saturating narrow 32 -> 16 bit into q8-q11
+ vqmovn.s32 d17, q10
+ vqmovn.s32 d20, q9
+ vqmovn.s32 d21, q11
+ vqmovn.s32 d18, q12
+ vqmovn.s32 d19, q14
+ vqmovn.s32 d22, q13
+ vqmovn.s32 d23, q15
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+
+ cmp r3, lr // flags are consumed by the blt after the macro below
+ load_add_store_4x8 r0, r5, shiftbits=3
+ blt 9f
+ sub r0, r0, r1, lsl #3 // r0 -= 8*r1
+ add r0, r0, #2*4 // r0 += 8 bytes
+.endif
+ b 1b
+
+9:
+ vpop {q6-q7}
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 // dct/dct 32x32: calls the horz helper into a 2048 byte stack buffer, then the add_vert helper once per 4-wide strip
+ idct_dc 32, 32, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 2048
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2 // r11 = first threshold
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, sp, #(\i*32*2) // r6 = buffer position for this pair
+.if \i > 0
+ mov r8, #(32 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.if \i < 30
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endif
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+ mov r8, #32*4
+ bl inv_txfm_horz_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]! // 4 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ add r7, sp, #(\i*2) // r7 = stack buffer plus strip offset
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 // dct/dct 16x32: calls the scaling horz helper into a 1024 byte stack buffer, then the add_vert helper once per 4-wide strip
+ idct_dc 16, 32, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2 // r11 = first threshold
+ movrel_local r4, inv_dct_2s_x16_neon // address of the 16-point dct, presumably called through r4 by the horz helper
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, sp, #(\i*16*2) // r6 = buffer position for this pair
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+.if \i > 0
+ mov r8, #(32 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.if \i < 30
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endif
+ mov r8, #4*32
+ bl inv_txfm_horz_scale_16x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]! // 2 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ add r7, sp, #(\i*2) // r7 = stack buffer plus strip offset
+ mov r8, #16*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 // dct/dct 32x16: calls the scaling horz helper into a 1024 byte stack buffer, then inv_txfm_add_vert_4x16_neon once per 4-wide strip
+ idct_dc 32, 16, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2 // r11 = first threshold
+ movrel r5, X(inv_dct_4h_x16_neon) // address of the 16-point dct, presumably called through r5 by the vert helper
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, sp, #(\i*32*2) // r6 = buffer position for this pair
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+.if \i > 0
+ mov r8, #(16 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.if \i < 14
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endif
+ mov r8, #4*16
+ bl inv_txfm_horz_scale_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]! // 4 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ add r7, sp, #(\i*2) // r7 = stack buffer plus strip offset
+ mov r8, #32*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 // dct/dct 8x32: inline first pass with inv_dct_4s_x8_neon into a 512 byte stack buffer, then the add_vert helper once per 4-wide strip
+ idct_dc 8, 32, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+ movrel_local r10, eob_8x32, 2 // threshold pointer (third argument is presumably a byte offset into the table)
+
+ mov r8, #4*32 // byte stride used when stepping r2
+ mov r9, #32 // r9 counts down by 4 per iteration
+ mov r6, sp // r6 = write pointer into the stack buffer
+1:
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r8 // zero the coefficients just loaded, then step r2 by r8
+.endr
+ ldrh r11, [r10], #4 // next threshold, stepping 2 table entries
+ sub r2, r2, r8, lsl #3 // rewind r2 by the 8 strides consumed above
+ sub r9, r9, #4
+ add r2, r2, #4*4 // r2 += 16 bytes
+
+ bl inv_dct_4s_x8_neon
+
+ vqrshrn.s32 d16, q8, #2 // rounding shift right by 2 with saturating narrow to 16 bit
+ vqrshrn.s32 d18, q9, #2
+ vqrshrn.s32 d20, q10, #2
+ vqrshrn.s32 d22, q11, #2
+ vqrshrn.s32 d17, q12, #2
+ vqrshrn.s32 d19, q13, #2
+ vqrshrn.s32 d21, q14, #2
+ vqrshrn.s32 d23, q15, #2
+
+ transpose_4x8h q8, q9, q10, q11
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11
+ vst1.16 {q10, q11}, [r6, :128]!
+
+ bge 1b // continue while r3 >= threshold
+ cmp r9, #0
+ beq 3f // nothing left to zero-fill
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #4
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]! // 2 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ add r7, sp, #(\i*2) // r7 = stack buffer plus strip offset
+ mov r8, #8*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 // dct/dct 32x8: calls the horz helper into a 512 byte stack buffer, then runs inv_dct_8h_x8_neon and load_add_store_8x8 on four 8-wide strips
+ idct_dc 32, 8, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ movrel_local r10, eob_8x32
+ sub_sp_align 512
+ ldrh r11, [r10], #2 // r11 = first threshold
+
+.irp i, 0, 2, 4, 6
+ add r6, sp, #(\i*32*2) // r6 = buffer position for this pair
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+.if \i > 0
+ cmp r3, r11
+ mov r8, #(8 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.if \i < 6
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endif
+ mov r8, #8*4
+ bl inv_txfm_horz_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]! // 4 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+ mov r8, #2*32 // byte stride used by the loads below
+ mov r9, #0 // r9 runs 0, 8, 16, 24
+1:
+ add r6, r0, r9, lsl #1 // r6 = r0 + 2*r9
+ add r7, sp, r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp r9, #32 // flags are consumed by the blt after the macro below
+
+ load_add_store_8x8 r6, r7
+
+ blt 1b
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon // takes four inputs in d16-d19, reads 12 coefficient words from r12 (post-incremented), clamps with q5 (max) and q4 (min), and stores eight d registers at r6 (post-incremented)
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ vld1.32 {q0, q1}, [r12, :128]! // first 8 coefficient words
+
+ vqrdmulh.s32 d23, d16, d0[1] // t63a
+ vqrdmulh.s32 d16, d16, d0[0] // t32a
+ vqrdmulh.s32 d22, d17, d1[0] // t62a
+ vqrdmulh.s32 d17, d17, d1[1] // t33a
+ vqrdmulh.s32 d21, d18, d2[1] // t61a
+ vqrdmulh.s32 d18, d18, d2[0] // t34a
+ vqrdmulh.s32 d20, d19, d3[0] // t60a
+ vqrdmulh.s32 d19, d19, d3[1] // t35a
+
+ vld1.32 {q0}, [r12, :128]! // next 4 coefficient words, used by the multiplies below
+
+ vqadd.s32 d24, d16, d17 // t32
+ vqsub.s32 d25, d16, d17 // t33
+ vqsub.s32 d26, d19, d18 // t34
+ vqadd.s32 d27, d19, d18 // t35
+ vqadd.s32 d28, d20, d21 // t60
+ vqsub.s32 d29, d20, d21 // t61
+ vqsub.s32 d30, d23, d22 // t62
+ vqadd.s32 d31, d23, d22 // t63
+
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5 // clamp to the maximum held in q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4 // clamp to the minimum held in q4
+.endr
+
+ vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
+ vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
+ vneg.s32 d4, d4 // t34a
+ vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
+ vrshr.s32 d26, d4, #12 // t34a
+ vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
+ vrshr.s32 d29, d6, #12 // t61a
+ vrshr.s32 d25, d7, #12 // t33a
+ vrshr.s32 d30, d4, #12 // t62a
+
+ vqadd.s32 d16, d24, d27 // t32a
+ vqsub.s32 d19, d24, d27 // t35a
+ vqadd.s32 d17, d25, d26 // t33
+ vqsub.s32 d18, d25, d26 // t34
+ vqsub.s32 d20, d31, d28 // t60a
+ vqadd.s32 d23, d31, d28 // t63a
+ vqsub.s32 d21, d30, d29 // t61
+ vqadd.s32 d22, d30, d29 // t62
+
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5 // clamp to the maximum held in q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4 // clamp to the minimum held in q4
+.endr
+
+ vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
+ vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
+ vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
+ vrshr.s32 d21, d4, #12 // t61a
+ vrshr.s32 d18, d6, #12 // t34a
+ vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
+ vrshr.s32 d20, d7, #12 // t60
+ vrshr.s32 d19, d4, #12 // t35
+
+ vst1.32 {d16, d17, d18, d19}, [r6, :128]! // 64 bytes written per call in total
+ vst1.32 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
+function inv_dct64_step2_neon // in-place pass over the values at r6 (moving up) and r9 (moving down) until the pointers meet; clamps with q5 (max) and q4 (min)
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0}, [r12, :128] // first 4 words of idct_coeffs, kept for the whole loop
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #4*2*0] // t32a
+ vldr d17, [r9, #4*2*8] // t39a
+ vldr d18, [r9, #4*2*0] // t63a
+ vldr d19, [r6, #4*2*8] // t56a
+ vldr d20, [r6, #4*2*16] // t40a
+ vldr d21, [r9, #4*2*24] // t47a
+ vldr d22, [r9, #4*2*16] // t55a
+ vldr d23, [r6, #4*2*24] // t48a
+
+ vqadd.s32 d24, d16, d17 // t32
+ vqsub.s32 d25, d16, d17 // t39
+ vqadd.s32 d26, d18, d19 // t63
+ vqsub.s32 d27, d18, d19 // t56
+ vqsub.s32 d28, d21, d20 // t40
+ vqadd.s32 d29, d21, d20 // t47
+ vqadd.s32 d30, d23, d22 // t48
+ vqsub.s32 d31, d23, d22 // t55
+
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5 // clamp to the maximum held in q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4 // clamp to the minimum held in q4
+.endr
+
+ vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
+ vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
+ vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
+ vrshr.s32 d25, d4, #12 // t56a
+ vrshr.s32 d27, d6, #12 // t39a
+ vneg.s32 d7, d7 // t40a
+ vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
+ vrshr.s32 d31, d7, #12 // t40a
+ vrshr.s32 d28, d4, #12 // t55a
+
+ vqadd.s32 d16, d24, d29 // t32a
+ vqsub.s32 d19, d24, d29 // t47a
+ vqadd.s32 d17, d27, d31 // t39
+ vqsub.s32 d18, d27, d31 // t40
+ vqsub.s32 d20, d26, d30 // t48a
+ vqadd.s32 d23, d26, d30 // t63a
+ vqsub.s32 d21, d25, d28 // t55
+ vqadd.s32 d22, d25, d28 // t56
+
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5 // clamp to the maximum held in q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4 // clamp to the minimum held in q4
+.endr
+
+ vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
+ vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
+ vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
+ vrshr.s32 d18, d4, #12 // t40a
+ vrshr.s32 d21, d6, #12 // t55a
+ vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
+ vrshr.s32 d19, d7, #12 // t47
+ vrshr.s32 d20, d4, #12 // t48
+
+ vstr d16, [r6, #4*2*0] // t32a
+ vstr d17, [r9, #4*2*0] // t39
+ vstr d18, [r6, #4*2*8] // t40a
+ vstr d19, [r9, #4*2*8] // t47
+ vstr d20, [r6, #4*2*16] // t48
+ vstr d21, [r9, #4*2*16] // t55a
+ vstr d22, [r6, #4*2*24] // t56
+ vstr d23, [r9, #4*2*24] // t63a
+
+ add r6, r6, #4*2 // r6 moves up by 8 bytes
+ sub r9, r9, #4*2 // r9 moves down by 8 bytes
+ cmp r6, r9
+ blt 1b // loop until the two pointers meet or cross
+ bx lr
+endfunc
+
+.macro load8 src, strd, zero, clear // loads d16-d23 from src, advancing by strd after each; when clear is nonzero each source location is overwritten with the zero register
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.32 {\i}, [\src, :64]
+ vst1.32 {\zero}, [\src, :64], \strd
+.else
+ vld1.32 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst // stores q8-q15 (128 bytes) contiguously at dst, post-incrementing dst
+ vst1.32 {q8, q9}, [\dst, :128]!
+ vst1.32 {q10, q11}, [\dst, :128]!
+ vst1.32 {q12, q13}, [\dst, :128]!
+ vst1.32 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8 // zeroes q12-q15
+.irp i, q12, q13, q14, q15
+ vmov.i32 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond // emits vmov.i32 reg, val only when cond is nonzero at assembly time
+.if \cond
+ vmov.i32 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond // when cond is nonzero at assembly time: loads val into gpr and broadcasts it into reg
+.if \cond
+ mov_const \gpr, \val
+ vdup.32 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond // emits the vst1.32 store only when cond is nonzero at assembly time; dst and dstalign are the two halves of the bracketed address operand
+.if \cond
+ vst1.32 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 // invokes scale_input (defined outside this chunk) on the given registers only when cond is nonzero at assembly time
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0 // defines inv_txfm_dct<suffix>_2s_x64_neon; clear=1 zeroes the input after reading it, scale=1 runs scale_input on it first
+function inv_txfm_dct\suffix\()_2s_x64_neon
+ mov r6, sp // taken before the push below, so r6 points at the scratch area at the bottom of the caller stack frame
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2 // stride times 4 for the first group of loads
+
+ movdup_if d0, r12, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3 // rewind r7 by the 8 strides consumed by load8
+ add r7, r7, r8, lsr #1 // then advance by half of the current stride
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
+ store16 r6 // 128 bytes to the scratch area; r6 ends up just past them
+
+ movdup_if d0, r12, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3 // rewind r7 by the 8 strides consumed by load8
+ lsr r8, r8, #1 // halve the stride
+ sub r7, r7, r8, lsr #1 // then step back by half of the new stride
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_2s_x16_neon
+
+ add r10, r6, #8*15 // r10 = write pointer that moves downwards (step r9 = -8)
+ sub r6, r6, #8*16 // r6 = back at the start of the 128 bytes stored above
+
+ mov r9, #-8
+
+ vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.macro store_addsub r0, r1, r2, r3 // for four stored values at r6: writes clamped sum back at r6 (moving up) and clamped difference at r10 (moving down)
+ vld1.32 {d2}, [r6, :64]!
+ vld1.32 {d3}, [r6, :64]!
+ vqadd.s32 d6, d2, \r0
+ vqsub.s32 \r0, d2, \r0
+ vld1.32 {d4}, [r6, :64]!
+ vqadd.s32 d7, d3, \r1
+ vqsub.s32 \r1, d3, \r1
+ vmin.s32 d6, d6, d1
+ vmin.s32 \r0, \r0, d1
+ vld1.32 {d5}, [r6, :64]!
+ vqadd.s32 d2, d4, \r2
+ sub r6, r6, #8*4 // rewind r6 over the four values just loaded
+ vmax.s32 d6, d6, d0
+ vmax.s32 \r0, \r0, d0
+ vqsub.s32 \r2, d4, \r2
+ vmin.s32 d7, d7, d1
+ vmin.s32 \r1, \r1, d1
+ vst1.32 {d6}, [r6, :64]!
+ vst1.32 {\r0}, [r10, :64], r9
+ vmin.s32 d2, d2, d1
+ vmin.s32 \r2, \r2, d1
+ vmax.s32 d7, d7, d0
+ vmax.s32 \r1, \r1, d0
+ vqadd.s32 d3, d5, \r3
+ vqsub.s32 \r3, d5, \r3
+ vmax.s32 d2, d2, d0
+ vmax.s32 \r2, \r2, d0
+ vmin.s32 d3, d3, d1
+ vmin.s32 \r3, \r3, d1
+ vst1.32 {d7}, [r6, :64]!
+ vst1.32 {\r1}, [r10, :64], r9
+ vmax.s32 d3, d3, d0
+ vmax.s32 \r3, \r3, d0
+ vst1.32 {d2}, [r6, :64]!
+ vst1.32 {\r2}, [r10, :64], r9
+ vst1.32 {d3}, [r6, :64]!
+ vst1.32 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16 // skip the 128 bytes written downwards through r10
+
+ movrel_local r12, idct64_coeffs // coefficient pointer consumed by inv_dct64_step1_neon
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ movdup_if d0, lr, 2896*8*(1<<16), \scale // lr is free as a temporary here because it was pushed on entry
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.32 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.32 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.32 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.32 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale // reloaded because the call above overwrites q0 and lr
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.32 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.32 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.32 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.32 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.32 d16, [r10, :64] // in5 (offset 2)
+ vld1.32 d17, [r11, :64] // in27 (offset 13)
+ vld1.32 d18, [r9, :64] // in21 (offset 10)
+ vld1.32 d19, [r7, :64] // in11 (offset 5)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.32 d16, [r10, :64] // in3 (offset 1)
+ vld1.32 d17, [r11, :64] // in29 (offset 14)
+ vld1.32 d18, [r9, :64] // in19 (offset 9)
+ vld1.32 d19, [r7, :64] // in13 (offset 6)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32 // rewind r6 over the 4 x 64 bytes written by the step1 calls
+ add r9, r6, #2*4*7 // r9 = upper pointer for step2, 56 bytes above r6
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+function inv_txfm_horz_dct_64x2_neon // combines the 32 bit values in the scratch area at sp by add/sub, shifts by r9, narrows to 16 bit and stores two 64-entry rows at r6
+ vdup.32 q4, r9 // shift amount for vrshl; callers in this file pass a negative value, which shifts right with rounding
+
+ mov r7, sp // taken before the push below: r7 reads the scratch area upwards
+ add r8, sp, #2*4*(64 - 4) // r8 reads the scratch area downwards from its last 32 bytes
+ add r9, r6, #2*56 // r9 = output pointer for the mirrored half, 112 bytes into the row
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64 // output row stride in bytes
+ mov r11, #-2*4*4 // step for the downward reads
+
+1:
+ vld1.32 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.32 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.32 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.32 {d24, d25, d26, d27}, [r8, :128], r11
+ vtrn.32 d16, d17 // transpose each pair of d registers
+ vtrn.32 d18, d19
+ vtrn.32 d20, d21
+ vtrn.32 d22, d23
+ vtrn.32 d31, d30
+ vtrn.32 d29, d28
+ vtrn.32 d27, d26
+ vtrn.32 d25, d24
+
+.macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7 // writes one output row: sums at r6, reordered differences at r9
+ vqsub.s32 d7, \src0, \src1
+ vqsub.s32 d6, \src2, \src3
+ vqsub.s32 d5, \src4, \src5
+ vqsub.s32 d4, \src6, \src7
+ vqadd.s32 d0, \src0, \src1
+ vqadd.s32 d1, \src2, \src3
+ vqadd.s32 d2, \src4, \src5
+ vqadd.s32 d3, \src6, \src7
+ vrshl.s32 q3, q3, q4
+ vrshl.s32 q2, q2, q4
+ vrshl.s32 q0, q0, q4
+ vrshl.s32 q1, q1, q4
+ vqmovn.s32 d7, q3 // saturating narrow 32 -> 16 bit
+ vqmovn.s32 d6, q2
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d1, q1
+ vrev32.16 q3, q3 // swap the two halfwords inside each word of the differences
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q3}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d18, d29, d20, d27, d22, d25
+ store_addsub d17, d30, d19, d28, d21, d26, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #1 // go back up the two rows just written
+ sub r9, r9, r10, lsl #1
+ add r6, r6, #16 // sums advance 16 bytes to the right
+ sub r9, r9, #16 // differences advance 16 bytes to the left
+
+ cmp r7, r8
+ blt 1b // loop until the two read pointers meet or cross
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_vert_dct_4x64_neon // combines the 16 bit values in the scratch area at sp by add/sub, rounds by 4 bits, adds them to 4-wide pixel rows from both ends of a 64 row column at r6 (stride r1) and clamps to 0..0x3ff
+ lsl r8, r8, #1 // NOTE(review): this result is overwritten two instructions below before being read
+
+ mov r7, sp // taken before the push below: r7 reads the scratch area upwards
+ add r8, sp, #2*4*(64 - 4) // r8 reads the scratch area downwards from its last 32 bytes
+ add r9, r6, r1, lsl #6 // r9 = r6 + 64*r1
+ sub r9, r9, r1 // r9 = r6 + 63*r1
+
+ push {r10-r11,lr}
+
+ neg r10, r1 // negative stride for the rows addressed through r9
+ mov r11, #-2*4*4 // step for the downward reads
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+
+ vmov.i16 q6, #0 // lower clamp bound
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3 // updates two rows at r6 (moving down) with the sums and two rows at r9 (moving up) with the differences
+ vld1.16 {d0}, [r6, :64], r1
+ vld1.16 {d1}, [r9, :64], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.16 {d2}, [r6, :64]
+ vqsub.s16 d5, \src0, \src1
+ vld1.16 {d3}, [r9, :64]
+ vqadd.s16 d6, \src2, \src3
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1 // return to the first of the two rows before storing
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+ vqadd.s16 q2, q2, q0 // add the pixels loaded above
+ vqadd.s16 q3, q3, q1
+ vmax.s16 q2, q2, q6
+ vmax.s16 q3, q3, q6
+ vmin.s16 q2, q2, q7
+ vmin.s16 q3, q3, q7
+ vst1.16 {d4}, [r6, :64], r1
+ vst1.16 {d5}, [r9, :64], r10
+ vst1.16 {d6}, [r6, :64], r1
+ vst1.16 {d7}, [r9, :64], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blt 1b // loop until the two read pointers meet or cross
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 // dct/dct 64x64: 64-point dct plus horz store per pair into a stack buffer, then 64-point dct plus add_vert per 4-wide strip
+ idct_dc 64, 64, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, above the scratch area used by the helpers
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*64*2) // r6 = buffer position for this pair
+.if \i > 0
+ mov r8, #(32 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11 // r11 was loaded at the end of the previous step
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.endif
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+ mov r8, #32*4
+ bl inv_txfm_dct_clear_2s_x64_neon
+ add r6, r5, #(\i*64*2) // recomputed because the call above changes r6
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 30
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]! // 8 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2) // r7 = intermediate buffer plus strip offset
+ mov r8, #64*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 // dct/dct 64x32: scaling 64-point dct plus horz store per pair into a stack buffer, then the 4x32 add_vert helper per 4-wide strip
+ idct_dc 64, 32, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, above the scratch area used by the helpers
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*64*2) // r6 = buffer position for this pair
+.if \i > 0
+ mov r8, #(32 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11 // r11 was loaded at the end of the previous step
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.endif
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+ mov r8, #32*4
+ bl inv_txfm_dct_clear_scale_2s_x64_neon
+ add r6, r5, #(\i*64*2) // recomputed because the call above changes r6
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 30
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]! // 8 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ add r7, r5, #(\i*2) // r7 = intermediate buffer plus strip offset
+ mov r8, #64*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 // dct/dct 32x64: scaling 32-wide horz helper per pair into a stack buffer, then 64-point dct plus add_vert per 4-wide strip
+ idct_dc 32, 64, 1 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 32*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, above the scratch area used by the helpers
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2 // r11 = first threshold
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*32*2) // r6 = buffer position for this pair
+.if \i > 0
+ mov r8, #(32 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.if \i < 30
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endif
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+ mov r8, #32*4
+ bl inv_txfm_horz_scale_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]! // 4 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2) // r7 = intermediate buffer plus strip offset
+ mov r8, #32*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 // dct/dct 64x16: 64-point dct plus horz store per pair into a stack buffer, then inv_txfm_add_vert_4x16_neon per 4-wide strip
+ idct_dc 64, 16, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*16*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+ add r4, sp, #64*4*2 // r4 = intermediate buffer, above the scratch area used by the helpers
+
+ movrel_local r10, eob_16x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, r4, #(\i*64*2) // r6 = buffer position for this pair
+.if \i > 0
+ mov r8, #(16 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11 // r11 was loaded at the end of an earlier step
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.endif
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+ mov r8, #16*4
+ bl inv_txfm_dct_clear_2s_x64_neon
+ add r6, r4, #(\i*64*2) // recomputed because the call above changes r6
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 8 // NOTE(review): the threshold is no longer reloaded after i == 8, so i = 12 and 14 reuse the value loaded at i == 8; confirm whether this is intended
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]! // 8 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+ movrel r5, X(inv_dct_4h_x16_neon) // address of the 16-point dct, presumably called through r5 by the vert helper
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ add r7, r4, #(\i*2) // r7 = intermediate buffer plus strip offset
+ mov r8, #64*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 // dct/dct 16x64: 16-wide horz helper per pair into a stack buffer, then 64-point dct plus add_vert per 4-wide strip
+ idct_dc 16, 64, 2 // macro defined outside this chunk; presumably handles the DC-only case
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 16*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+ add r5, sp, #64*4*2 // r5 = intermediate buffer, above the scratch area used by the helpers
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2 // r11 = first threshold
+
+ movrel_local r4, inv_dct_2s_x16_neon // address of the 16-point dct, presumably called through r4 by the horz helper
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*16*2) // r6 = buffer position for this pair
+.if \i > 0
+ mov r8, #(32 - \i) // count still to fill; only used by the zero-fill loop at 1:
+ cmp r3, r11
+ blt 1f // r3 below threshold: zero-fill the rest instead
+.if \i < 30
+ ldrh r11, [r10], #2 // threshold for the next step
+.endif
+.endif
+ add r7, r2, #(\i*4) // r7 = coefficient input for this pair
+ mov r8, #32*4
+ bl inv_txfm_horz_16x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]! // 2 x 32 bytes of zeros per iteration
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2) // r7 = intermediate buffer plus strip offset
+ mov r8, #16*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2) // r6 = r0 plus strip offset
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/loopfilter.S b/third_party/dav1d/src/arm/32/loopfilter.S
new file mode 100644
index 0000000000..97b960534f
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/loopfilter.S
@@ -0,0 +1,868 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro loop_filter wd // wd: filter width to instantiate (4, 6, 8 or 16)
+function lpf_8_wd\wd\()_neon // 8 bpc: filters 8 pixels per call, one 8 bit lane per pixel
+ vabd.u8 d0, d22, d23 // abs(p1 - p0)
+ vabd.u8 d1, d25, d24 // abs(q1 - q0)
+ vabd.u8 d2, d23, d24 // abs(p0 - q0)
+ vabd.u8 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u8 d4, d21, d22 // abs(p2 - p1)
+ vabd.u8 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u8 d6, d20, d21 // abs(p3 - p2)
+ vabd.u8 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u8 d4, d4, d5
+.endif
+ vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u8 d6, d6, d7
+.endif
+ vshr.u8 d3, d3, #1 // abs(p1 - q1) >> 1
+.if \wd >= 8
+ vmax.u8 d4, d4, d6
+.endif
+.if \wd >= 6
+ vand d4, d4, d14 // ignore the outer differences in lanes where wd == 4
+.endif
+ vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ vmax.u8 d4, d0, d4
+ vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+ vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ vand d1, d1, d2 // fm
+ vand d1, d1, d13 // fm && wd >= 4
+.if \wd >= 6
+ vand d14, d14, d1 // fm && wd > 4
+.endif
+.if \wd >= 16
+ vand d15, d15, d1 // fm && wd == 16
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i8 d10, #1 // E is no longer needed; d10 becomes the flatness threshold (1)
+ vabd.u8 d2, d21, d23 // abs(p2 - p0)
+ vabd.u8 d3, d22, d23 // abs(p1 - p0)
+ vabd.u8 d4, d25, d24 // abs(q1 - q0)
+ vabd.u8 d5, d26, d24 // abs(q2 - q0)
+.if \wd >= 8
+ vabd.u8 d6, d20, d23 // abs(p3 - p0)
+ vabd.u8 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u8 d2, d2, d3
+ vmax.u8 d4, d4, d5
+.if \wd >= 8
+ vmax.u8 d6, d6, d7
+.endif
+ vmax.u8 d2, d2, d4
+.if \wd >= 8
+ vmax.u8 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u8 d3, d17, d23 // abs(p6 - p0)
+ vabd.u8 d4, d18, d23 // abs(p5 - p0)
+ vabd.u8 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u8 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u8 d6, d28, d24 // abs(q4 - q0)
+ vabd.u8 d7, d29, d24 // abs(q5 - q0)
+ vabd.u8 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u8 d3, d3, d4
+ vmax.u8 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u8 d7, d7, d8
+ vmax.u8 d3, d3, d5
+ vmax.u8 d3, d3, d7
+ vcge.u8 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vsubl.u8 q1, d22, d25 // p1 - q1
+ vcgt.u8 d0, d0, d12 // hev
+ vqmovn.s16 d2, q1
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vsubl.u8 q1, d24, d23 // q0 - p0
+ vmov.i16 q3, #3
+ vmul.i16 q1, q1, q3 // 3 * (q0 - p0)
+ vmov.i8 d6, #4
+ vaddw.s8 q1, q1, d4 // 3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0)
+ vmov.i8 d7, #3
+ vqmovn.s16 d2, q1 // f
+ vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
+ vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
+ vshr.s8 d4, d4, #3 // f1
+ vshr.s8 d5, d5, #3 // f2
+ vmovl.u8 q1, d23 // p0
+ vmovl.u8 q3, d24 // q0
+ vaddw.s8 q1, q1, d5 // p0 + f2
+ vsubw.s8 q3, q3, d4 // q0 - f1
+ vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1
+ vqmovun.s16 d2, q1 // out p0
+ vqmovun.s16 d6, q3 // out q0
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vmovl.u8 q1, d22 // p1
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vmovl.u8 q3, d25 // q1
+ vaddw.s8 q1, q1, d4 // p1 + ((f1 + 1) >> 1)
+ vsubw.s8 q3, q3, d4 // q1 - ((f1 + 1) >> 1)
+ vqmovun.s16 d2, q1 // out p1
+ vqmovun.s16 d6, q3 // out q1
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vaddl.u8 q0, d21, d21 // p2 * 2
+ vaddl.u8 q1, d21, d22 // p2 + p1
+ vaddl.u8 q2, d22, d23 // p1 + p0
+ vaddl.u8 q3, d23, d24 // p0 + q0
+ vadd.i16 q4, q0, q1
+ vadd.i16 q5, q2, q3
+ vaddl.u8 q6, d24, d25 // q0 + q1
+ vadd.i16 q4, q4, q5
+ vsub.i16 q6, q6, q0
+ vaddl.u8 q5, d25, d26 // q1 + q2
+ vrshrn.i16 d0, q4, #3 // out p1
+
+ vadd.i16 q4, q4, q6
+ vsub.i16 q5, q5, q1
+ vaddl.u8 q6, d26, d26 // q2 + q2
+ vrshrn.i16 d1, q4, #3 // out p0
+
+ vadd.i16 q4, q4, q5
+ vsub.i16 q6, q6, q2
+ vrshrn.i16 d2, q4, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 q4, q4, q6
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshrn.i16 d3, q4, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vaddl.u8 q0, d20, d21 // p3 + p2
+ vaddl.u8 q1, d22, d25 // p1 + q1
+ vaddl.u8 q2, d20, d22 // p3 + p1
+ vaddl.u8 q3, d23, d26 // p0 + q2
+ vadd.i16 q4, q0, q0 // 2 * (p3 + p2)
+ vaddw.u8 q4, q4, d23 // + p0
+ vaddw.u8 q4, q4, d24 // + q0
+ vadd.i16 q4, q4, q2 // + p3 + p1
+ vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2
+ vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1
+ vrshrn.i16 d10, q4, #3 // out p2
+
+ vadd.i16 q4, q4, q1
+ vaddl.u8 q0, d20, d23 // p3 + p0
+ vaddl.u8 q1, d24, d27 // q0 + q3
+ vrshrn.i16 d11, q4, #3 // out p1
+
+ vadd.i16 q4, q4, q3
+ vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0
+ vaddl.u8 q2, d21, d24 // p2 + q0
+ vaddl.u8 q3, d25, d27 // q1 + q3
+ vrshrn.i16 d12, q4, #3 // out p0
+
+ vadd.i16 q4, q4, q1
+ vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0
+ vaddl.u8 q0, d22, d25 // p1 + q1
+ vaddl.u8 q1, d26, d27 // q2 + q3
+ vrshrn.i16 d13, q4, #3 // out q0
+
+ vadd.i16 q4, q4, q3
+ vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1
+ vrshrn.i16 d0, q4, #3 // out q1
+
+ vadd.i16 q4, q4, q1
+
+ vbit d21, d10, d14 // p2 if (flat8in)
+ vbit d22, d11, d14 // p1 if (flat8in)
+ vbit d23, d12, d14 // p0 if (flat8in)
+ vrshrn.i16 d1, q4, #3 // out q2
+ vbit d24, d13, d14 // q0 if (flat8in)
+ vbit d25, d0, d14 // q1 if (flat8in)
+ vbit d26, d1, d14 // q2 if (flat8in)
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vaddl.u8 q1, d17, d17 // p6 + p6
+ vaddl.u8 q2, d17, d18 // p6 + p5
+ vaddl.u8 q3, d17, d19 // p6 + p4
+ vaddl.u8 q4, d17, d20 // p6 + p3
+ vadd.i16 q6, q1, q2
+ vadd.i16 q5, q3, q4
+ vaddl.u8 q3, d17, d21 // p6 + p2
+ vadd.i16 q6, q6, q5
+ vaddl.u8 q4, d17, d22 // p6 + p1
+ vaddl.u8 q5, d18, d23 // p5 + p0
+ vadd.i16 q3, q3, q4
+ vaddl.u8 q4, d19, d24 // p4 + q0
+ vadd.i16 q6, q6, q3
+ vadd.i16 q5, q5, q4
+ vaddl.u8 q3, d20, d25 // p3 + q1
+ vadd.i16 q6, q6, q5
+ vsub.i16 q3, q3, q1
+ vaddl.u8 q1, d21, d26 // p2 + q2
+ vrshrn.i16 d0, q6, #4 // out p5
+ vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 q1, q1, q2
+ vaddl.u8 q2, d22, d27 // p1 + q3
+ vaddl.u8 q3, d17, d19 // p6 + p4
+ vrshrn.i16 d1, q6, #4 // out p4
+ vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 q2, q2, q3
+ vaddl.u8 q3, d23, d28 // p0 + q4
+ vaddl.u8 q4, d17, d20 // p6 + p3
+ vrshrn.i16 d2, q6, #4 // out p3
+ vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 q3, q3, q4
+ vaddl.u8 q4, d24, d29 // q0 + q5
+ vaddl.u8 q2, d17, d21 // p6 + p2
+ vrshrn.i16 d3, q6, #4 // out p2
+ vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 q4, q4, q2
+ vaddl.u8 q3, d25, d30 // q1 + q6
+ vaddl.u8 q5, d17, d22 // p6 + p1
+ vrshrn.i16 d4, q6, #4 // out p1
+ vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 q3, q3, q5
+ vaddl.u8 q4, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vaddl.u8 q5, d18, d23 // p5 + p0
+ vrshrn.i16 d5, q6, #4 // out p0
+ vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 q4, q4, q5
+ vaddl.u8 q5, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vaddl.u8 q9, d19, d24 // p4 + q0
+ vrshrn.i16 d6, q6, #4 // out q0
+ vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 q5, q5, q9
+ vaddl.u8 q4, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vaddl.u8 q9, d20, d25 // p3 + q1
+ vrshrn.i16 d7, q6, #4 // out q1
+ vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 q9, q4, q9
+ vaddl.u8 q5, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vaddl.u8 q10, d21, d26 // p2 + q2
+ vrshrn.i16 d8, q6, #4 // out q2
+ vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 q5, q5, q10
+ vaddl.u8 q9, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vaddl.u8 q10, d22, d27 // p1 + q3
+ vrshrn.i16 d9, q6, #4 // out q3
+ vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 q9, q9, q10
+ vbif d5, d23, d15 // out p0
+ vrshrn.i16 d10, q6, #4 // out q4
+ vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6)
+ vrshrn.i16 d11, q6, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr // normal return: the caller's full write-back follows the bl
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r8
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r9
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16 // defines lpf_8_wd16_neon
+loop_filter 8 // defines lpf_8_wd8_neon
+loop_filter 6 // defines lpf_8_wd6_neon
+loop_filter 4 // defines lpf_8_wd4_neon
+
+.macro lpf_8_wd16
+ adr r8, 7f + CONFIG_THUMB // r8 = caller's label 7 (inner 6 pixel epilogue); CONFIG_THUMB presumably sets the Thumb bit for bx
+ adr r9, 8f + CONFIG_THUMB // r9 = caller's label 8 (inner 4 pixel epilogue)
+ bl lpf_8_wd16_neon // returns via lr, r8, r9 or r12 depending on what was filtered
+.endm
+
+.macro lpf_8_wd8
+ adr r9, 8f + CONFIG_THUMB // r9 = caller's label 8 (inner 4 pixel epilogue); CONFIG_THUMB presumably sets the Thumb bit for bx
+ bl lpf_8_wd8_neon // returns via lr, r9 or r12 depending on what was filtered
+.endm
+
+.macro lpf_8_wd6
+ bl lpf_8_wd6_neon // no short epilogues: returns via lr, or via r12 when nothing is filtered
+.endm
+
+.macro lpf_8_wd4
+ bl lpf_8_wd4_neon // no short epilogues: returns via lr, or via r12 when nothing is filtered
+.endm
+
+function lpf_v_4_8_neon // wd4 filter across a horizontal edge: 8 pixels wide, rows p1..q1
+ mov r12, lr // keep our return address: the bl in lpf_8_wd4 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, r1, lsl #1 // r10 = dst - 2 * stride (p1 row)
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // r0 back to the q0 row
+
+ lpf_8_wd4
+
+ sub r10, r0, r1, lsl #1 // r10 = p1 row again
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 unchanged for the caller
+ bx r12
+endfunc
+
+function lpf_h_4_8_neon // wd4 filter across a vertical edge: 8 rows, columns p1..q1
+ mov r12, lr // keep our return address: the bl in lpf_8_wd4 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, #2 // r10 = rows 0-3, starting 2 pixels left of the edge
+ add r0, r10, r1, lsl #2 // r0 = rows 4-7
+ vld1.32 {d22[0]}, [r10], r1
+ vld1.32 {d22[1]}, [r0], r1
+ vld1.32 {d23[0]}, [r10], r1
+ vld1.32 {d23[1]}, [r0], r1
+ vld1.32 {d24[0]}, [r10], r1
+ vld1.32 {d24[1]}, [r0], r1
+ vld1.32 {d25[0]}, [r10], r1
+ vld1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2 // r0 = dst + 8 * stride
+
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+
+ lpf_8_wd4
+
+ sub r10, r0, r1, lsl #3 // back up 8 rows to the original dst
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2 // return with r0 = dst + 8 * stride
+ bx r12
+endfunc
+
+function lpf_v_6_8_neon // wd6 filter across a horizontal edge: reads p2..q2, writes p1..q1
+ mov r12, lr // keep our return address: the bl in lpf_8_wd6 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1 // r10 = dst - 3 * stride (p2 row)
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1 // r0 back to the q0 row
+
+ lpf_8_wd6
+
+ sub r10, r0, r1, lsl #1 // only p1..q1 are modified by the wd6 filter
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 unchanged for the caller
+ bx r12
+endfunc
+
+function lpf_h_6_8_neon // wd6 filter across a vertical edge: loads 8 columns, stores the inner 4
+ mov r12, lr // keep our return address: the bl in lpf_8_wd6 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, #4 // r10 = rows 0-3, starting 4 pixels left of the edge
+ add r0, r10, r1, lsl #2 // r0 = rows 4-7
+ vld1.8 {d20}, [r10], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r10], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r10], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r10], r1
+ vld1.8 {d27}, [r0], r1
+ add r0, r0, #4 // r0 = dst + 8 * stride
+
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ lpf_8_wd6
+
+ sub r10, r0, r1, lsl #3 // back up 8 rows to the original dst
+ sub r10, r10, #2 // only p1..q1 are written back
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2 // return with r0 = dst + 8 * stride
+ bx r12
+endfunc
+
+function lpf_v_8_8_neon // wd8 filter across a horizontal edge: reads p3..q3, writes p2..q2 (or p1..q1 at label 8)
+ mov r12, lr // keep our return address: the bl in lpf_8_wd8 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, r1, lsl #2 // r10 = dst - 4 * stride (p3 row)
+ vld1.8 {d20}, [r10, :64], r1 // p3
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2 // r0 back to the q0 row
+
+ lpf_8_wd8
+
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1 // r10 = p2 row
+ vst1.8 {d21}, [r10, :64], r1 // p2
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1 // leave r0 unchanged for the caller
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1 // short epilogue (entered via bx r9): only p1..q1 changed
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 unchanged for the caller
+ bx r12
+endfunc
+
+function lpf_h_8_8_neon // wd8 filter across a vertical edge: 8 rows, columns p3..q3
+ mov r12, lr // keep our return address: the bl in lpf_8_wd8 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, #4 // r10 = rows 0-3, starting 4 pixels left of the edge
+ add r0, r10, r1, lsl #2 // r0 = rows 4-7
+ vld1.8 {d20}, [r10], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r10], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r10], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r10], r1
+ vld1.8 {d27}, [r0], r1
+ add r0, r0, #4 // r0 = dst + 8 * stride
+
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ lpf_8_wd8
+
+ sub r10, r0, r1, lsl #3 // back up 8 rows to the original dst
+ sub r10, r10, #4
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+ add r0, r10, r1, lsl #2
+
+ vst1.8 {d20}, [r10], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r10], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r10], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r10], r1
+ vst1.8 {d27}, [r0], r1
+ add r0, r0, #4 // return with r0 = dst + 8 * stride
+ bx r12
+8:
+ sub r10, r0, r1, lsl #3 // short epilogue (entered via bx r9): only p1..q1 changed
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2 // return with r0 = dst + 8 * stride
+ bx r12
+endfunc
+
+function lpf_v_16_8_neon // wd16 filter across a horizontal edge: reads p6..q6, writes p5..q5 (or less at labels 7/8)
+ mov r12, lr // keep our return address: the bl in lpf_8_wd16 clobbers lr, and the filter exits via r12 when idle
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1 // r10 = dst - 7 * stride (p6 row)
+ vld1.8 {d17}, [r10, :64], r1 // p6
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d18}, [r10, :64], r1 // p5
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d19}, [r10, :64], r1 // p4
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ vld1.8 {d20}, [r10, :64], r1 // p3
+ vld1.8 {d27}, [r0, :64], r1 // q3
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d28}, [r0, :64], r1 // q4
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d29}, [r0, :64], r1 // q5
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1 // r0 back to the q0 row
+
+ lpf_8_wd16
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, r1, lsl #1 // r10 = dst - 6 * stride (p5 row)
+ vst1.8 {d0}, [r10, :64], r1 // p5
+ vst1.8 {d6}, [r0, :64], r1 // q0
+ vst1.8 {d1}, [r10, :64], r1 // p4
+ vst1.8 {d7}, [r0, :64], r1 // q1
+ vst1.8 {d2}, [r10, :64], r1 // p3
+ vst1.8 {d8}, [r0, :64], r1 // q2
+ vst1.8 {d3}, [r10, :64], r1 // p2
+ vst1.8 {d9}, [r0, :64], r1 // q3
+ vst1.8 {d4}, [r10, :64], r1 // p1
+ vst1.8 {d10}, [r0, :64], r1 // q4
+ vst1.8 {d5}, [r10, :64], r1 // p0
+ vst1.8 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1 // leave r0 unchanged for the caller
+ bx r12
+7:
+ sub r10, r0, r1 // short epilogue (entered via bx r8): only p2..q2 changed
+ sub r10, r10, r1, lsl #1 // r10 = dst - 3 * stride (p2 row)
+ vst1.8 {d21}, [r10, :64], r1 // p2
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1 // leave r0 unchanged for the caller
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1 // short epilogue (entered via bx r9): only p1..q1 changed
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 unchanged for the caller
+ bx r12
+endfunc
+
+function lpf_h_16_8_neon // wd16 filter across a vertical edge: 8 rows of 16 pixels around the edge
+ mov r12, lr // keep our return address: the bl in lpf_8_wd16 clobbers lr, and the filter exits via r12 when idle
+ sub r10, r0, #8 // r10 = the 8 pixels left of the edge, r0 = the 8 pixels right of it
+ vld1.8 {d16}, [r10, :64], r1
+ vld1.8 {d24}, [r0, :64], r1
+ vld1.8 {d17}, [r10, :64], r1
+ vld1.8 {d25}, [r0, :64], r1
+ vld1.8 {d18}, [r10, :64], r1
+ vld1.8 {d26}, [r0, :64], r1
+ vld1.8 {d19}, [r10, :64], r1
+ vld1.8 {d27}, [r0, :64], r1
+ vld1.8 {d20}, [r10, :64], r1
+ vld1.8 {d28}, [r0, :64], r1
+ vld1.8 {d21}, [r10, :64], r1
+ vld1.8 {d29}, [r0, :64], r1
+ vld1.8 {d22}, [r10, :64], r1
+ vld1.8 {d30}, [r0, :64], r1
+ vld1.8 {d23}, [r10, :64], r1
+ vld1.8 {d31}, [r0, :64], r1
+
+ transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
+
+ lpf_8_wd16
+
+ sub r0, r0, r1, lsl #3 // back up 8 rows to the original dst
+ sub r10, r0, #8
+
+ transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5
+ transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31
+
+ vst1.8 {d16}, [r10, :64], r1
+ vst1.8 {d6}, [r0, :64], r1
+ vst1.8 {d17}, [r10, :64], r1
+ vst1.8 {d7}, [r0, :64], r1
+ vst1.8 {d0}, [r10, :64], r1
+ vst1.8 {d8}, [r0, :64], r1
+ vst1.8 {d1}, [r10, :64], r1
+ vst1.8 {d9}, [r0, :64], r1
+ vst1.8 {d2}, [r10, :64], r1
+ vst1.8 {d10}, [r0, :64], r1
+ vst1.8 {d3}, [r10, :64], r1
+ vst1.8 {d11}, [r0, :64], r1
+ vst1.8 {d4}, [r10, :64], r1
+ vst1.8 {d30}, [r0, :64], r1
+ vst1.8 {d5}, [r10, :64], r1
+ vst1.8 {d31}, [r0, :64], r1 // r0 ends at dst + 8 * stride
+ bx r12
+
+7:
+ sub r10, r0, r1, lsl #3 // short epilogue (entered via bx r8): write back p3..q3 from d20-d27
+ sub r10, r10, #4
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+ add r0, r10, r1, lsl #2
+
+ vst1.8 {d20}, [r10], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r10], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r10], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r10], r1
+ vst1.8 {d27}, [r0], r1
+ add r0, r0, #4 // return with r0 = dst + 8 * stride
+ bx r12
+8:
+ sub r10, r0, r1, lsl #3 // short epilogue (entered via bx r9): only p1..q1 changed
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2 // return with r0 = dst + 8 * stride
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w)
+
+.macro lpf_func dir, type // dir: v or h; type: y or uv
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // b4_stride, lut: stack arguments, past 36 bytes of pushed GPRs + 64 bytes of q4-q7
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2 // r4 = l - b4_stride entries (4 bytes each)
+.else
+ sub r3, r3, #4 // r3 = &l[-1]
+ lsl r4, r4, #2 // r4 = b4_stride in bytes
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1:
+ tst r6, #0x03 // the two mask bits handled by this iteration
+.ifc \dir, v
+ vld1.8 {d0}, [r4]! // l[offset][] for two units
+ vld1.8 {d1}, [r3]! // l[0][] for two units
+.else
+ vld2.32 {d0[0], d1[0]}, [r3], r4 // de-interleave: d0 = l[offset][], d1 = l[0][]
+ vld2.32 {d0[1], d1[1]}, [r3], r4
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ vld1.8 {d5[]}, [r5] // sharp[0]
+ add r5, r5, #8
+ vmov.i32 d2, #0xff // mask selecting the low byte of each 32 bit word
+ vdup.32 d13, r6 // vmask[0]
+
+ vand d0, d0, d2 // Keep only lowest byte in each 32 bit word
+ vand d1, d1, d2
+ vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0]
+ vmov.i8 d4, #1
+ vld1.8 {d6[]}, [r5] // sharp[1]
+ sub r5, r5, #8
+ vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
+ vtst.32 d2, d1, d2 // L != 0
+ vmul.i32 d1, d1, d4 // L, replicated into all 4 bytes of each word (L * 0x01010101)
+.ifc \type, y
+ vdup.32 d15, r2 // vmask[2]
+.endif
+ vdup.32 d14, r7 // vmask[1]
+ vmov r10, r11, d2
+ orrs r10, r10, r11
+ beq 7f // if (!L) continue;
+ vneg.s8 d5, d5 // -sharp[0]
+ movrel_local r10, word_12
+ vshr.u8 d12, d1, #4 // H
+ vld1.32 {d16}, [r10, :64] // d16 = {1, 2}: the mask bit that belongs to each 32 bit lane
+ vshl.s8 d3, d1, d5 // L >> sharp[0]
+.ifc \type, y
+ vtst.32 d15, d15, d16 // if (vmask[2] & bits)
+.endif
+ vmov.i8 d7, #2
+ vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1])
+ vadd.i8 d0, d1, d7 // L + 2
+ vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I
+ vadd.u8 d0, d0, d0 // 2*(L + 2)
+ vtst.32 d14, d14, d16 // if (vmask[1] & bits)
+ vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E
+ vtst.32 d13, d13, d16 // if (vmask[0] & bits)
+ vand d13, d13, d2 // vmask[0] &= L != 0
+
+.ifc \type, y
+ tst r2, #0x03 // any wd16 edge among these two units?
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_8_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x03 // any edge wider than 4 among these two units?
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_8_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_8_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_8_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #3
+.else
+7:
+.endif
+8:
+ lsrs r6, r6, #2 // vmask[0] >>= 2
+ lsr r7, r7, #2 // vmask[1] >>= 2
+.ifc \type, y
+ lsr r2, r2, #2 // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+ add r0, r0, #8 // advance dst by the 8 pixels just handled
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ bne 1b // loop while any combined vmask bits remain (flags from the lsrs above)
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y // defines lpf_v_sb_y_8bpc_neon
+lpf_func h, y // defines lpf_h_sb_y_8bpc_neon
+lpf_func v, uv // defines lpf_v_sb_uv_8bpc_neon
+lpf_func h, uv // defines lpf_h_sb_uv_8bpc_neon
+
+const word_12, align=4
+ .word 1, 2 // per-lane bit masks, used by lpf_func to test vmask bit 0 in lane 0 and bit 1 in lane 1
+endconst
diff --git a/third_party/dav1d/src/arm/32/loopfilter16.S b/third_party/dav1d/src/arm/32/loopfilter16.S
new file mode 100644
index 0000000000..d7daf21f1a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/loopfilter16.S
@@ -0,0 +1,859 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro loop_filter wd // wd: filter width to instantiate (4, 6, 8 or 16)
+function lpf_4_wd\wd\()_neon // 16 bpc: 4 pixels per call in 16 bit lanes; r8 = bitdepth_max, r9 = bitdepth_min_8
+ vabd.u16 d0, d22, d23 // abs(p1 - p0)
+ vabd.u16 d1, d25, d24 // abs(q1 - q0)
+ vabd.u16 d2, d23, d24 // abs(p0 - q0)
+ vabd.u16 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u16 d4, d21, d22 // abs(p2 - p1)
+ vabd.u16 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u16 d6, d20, d21 // abs(p3 - p2)
+ vabd.u16 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u16 d4, d4, d5
+.endif
+ vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vshr.u16 d3, d3, #1 // abs(p1 - q1) >> 1
+.if \wd >= 8
+ vmax.u16 d4, d4, d6
+.endif
+ vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ vmax.u16 d4, d0, d4
+ vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+ vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ vand d1, d1, d2 // fm && wd >= 4 (implicit)
+.if \wd >= 6
+ vmov d14, d1 // fm && wd > 4 (implicit)
+.endif
+.if \wd >= 16
+ vmov d15, d1 // fm && wd == 16 (implicit)
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i16 d10, #1 // E is no longer needed; d10 becomes the flatness threshold
+ vabd.u16 d2, d21, d23 // abs(p2 - p0)
+ vabd.u16 d3, d22, d23 // abs(p1 - p0)
+ vabd.u16 d4, d25, d24 // abs(q1 - q0)
+ vabd.u16 d5, d26, d24 // abs(q2 - q0)
+ vdup.16 d9, r9 // bitdepth_min_8
+.if \wd >= 8
+ vabd.u16 d6, d20, d23 // abs(p3 - p0)
+ vabd.u16 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u16 d2, d2, d3
+ vmax.u16 d4, d4, d5
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vmax.u16 d2, d2, d4
+ vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ vmax.u16 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u16 d3, d17, d23 // abs(p6 - p0)
+ vabd.u16 d4, d18, d23 // abs(p5 - p0)
+ vabd.u16 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u16 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u16 d6, d28, d24 // abs(q4 - q0)
+ vabd.u16 d7, d29, d24 // abs(q5 - q0)
+ vabd.u16 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u16 d3, d3, d4
+ vmax.u16 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u16 d7, d7, d8
+ vmax.u16 d3, d3, d5
+ vmax.u16 d3, d3, d7
+ vcge.u16 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vdup.16 d3, r8 // bitdepth_max
+ vsub.u16 d2, d22, d25 // p1 - q1
+ vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1
+ vcgt.u16 d0, d0, d12 // hev
+ vmvn d9, d3 // - 128 * (1 << bitdepth_min_8)
+ vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1)
+ vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1)
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vsub.u16 d2, d24, d23 // q0 - p0
+ vmov.i16 d6, #3
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vmul.i16 d2, d2, d6 // 3 * (q0 - p0)
+ vmov.i16 d7, #4
+ vadd.i16 d2, d2, d4 // 3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0)
+ vmin.s16 d2, d2, d3 // f = iclip_diff()
+ vmax.s16 d2, d2, d9 // f = iclip_diff()
+ vqadd.s16 d4, d7, d2 // f + 4
+ vqadd.s16 d5, d6, d2 // f + 3
+ vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
+ vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
+ vshr.s16 d4, d4, #3 // f1
+ vshr.s16 d5, d5, #3 // f2
+ vmov.i16 d9, #0
+ vdup.16 d3, r8 // bitdepth_max
+ vqadd.s16 d2, d23, d5 // p0 + f2
+ vqsub.s16 d6, d24, d4 // q0 - f1
+ vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1
+ vmin.s16 d2, d2, d3 // out p0 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q0 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p0 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q0 = iclip_pixel()
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vqadd.s16 d2, d22, d4 // p1 + f
+ vqsub.s16 d6, d25, d4 // q1 - f
+ vmin.s16 d2, d2, d3 // out p1 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q1 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p1 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q1 = iclip_pixel()
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vadd.i16 d0, d21, d21 // p2 * 2
+ vadd.i16 d2, d21, d22 // p2 + p1
+ vadd.i16 d4, d22, d23 // p1 + p0
+ vadd.i16 d6, d23, d24 // p0 + q0
+ vadd.i16 d8, d0, d2
+ vadd.i16 d10, d4, d6
+ vadd.i16 d12, d24, d25 // q0 + q1
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d0
+ vadd.i16 d10, d25, d26 // q1 + q2
+ vrshr.u16 d0, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d12
+ vsub.i16 d10, d10, d2
+ vadd.i16 d12, d26, d26 // q2 + q2
+ vrshr.u16 d1, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d4
+ vrshr.u16 d2, d8, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 d8, d8, d12
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshr.u16 d3, d8, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vadd.i16 d0, d20, d21 // p3 + p2
+ vadd.i16 d2, d22, d25 // p1 + q1
+ vadd.i16 d4, d20, d22 // p3 + p1
+ vadd.i16 d6, d23, d26 // p0 + q2
+ vadd.i16 d8, d0, d0 // 2 * (p3 + p2)
+ vadd.i16 d9, d23, d24 // p0 + q0
+ vadd.i16 d8, d8, d4 // + p3 + p1
+ vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2
+ vadd.i16 d8, d8, d9 // + p0 + q0
+ vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1
+ vrshr.u16 d10, d8, #3 // out p2
+
+ vadd.i16 d8, d8, d2
+ vadd.i16 d0, d20, d23 // p3 + p0
+ vadd.i16 d2, d24, d27 // q0 + q3
+ vrshr.u16 d11, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0
+ vadd.i16 d4, d21, d24 // p2 + q0
+ vadd.i16 d6, d25, d27 // q1 + q3
+ vrshr.u16 d12, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d2
+ vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0
+ vadd.i16 d0, d22, d25 // p1 + q1
+ vadd.i16 d2, d26, d27 // q2 + q3
+ vrshr.u16 d13, d8, #3 // out q0
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1
+ vrshr.u16 d0, d8, #3 // out q1
+
+ vadd.i16 d8, d8, d2
+
+ vbit d21, d10, d14 // p2 if (flat8in)
+ vbit d22, d11, d14 // p1 if (flat8in)
+ vbit d23, d12, d14 // p0 if (flat8in)
+ vrshr.u16 d1, d8, #3 // out q2
+ vbit d24, d13, d14 // q0 if (flat8in)
+ vbit d25, d0, d14 // q1 if (flat8in)
+ vbit d26, d1, d14 // q2 if (flat8in)
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vadd.i16 d2, d17, d17 // p6 + p6
+ vadd.i16 d4, d17, d18 // p6 + p5
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vadd.i16 d12, d2, d4
+ vadd.i16 d10, d6, d8
+ vadd.i16 d6, d17, d21 // p6 + p2
+ vadd.i16 d12, d12, d10
+ vadd.i16 d8, d17, d22 // p6 + p1
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vadd.i16 d6, d6, d8
+ vadd.i16 d8, d19, d24 // p4 + q0
+ vadd.i16 d12, d12, d6
+ vadd.i16 d10, d10, d8
+ vadd.i16 d6, d20, d25 // p3 + q1
+ vadd.i16 d12, d12, d10
+ vsub.i16 d6, d6, d2
+ vadd.i16 d2, d21, d26 // p2 + q2
+ vrshr.u16 d0, d12, #4 // out p5
+ vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 d2, d2, d4
+ vadd.i16 d4, d22, d27 // p1 + q3
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vrshr.u16 d1, d12, #4 // out p4
+ vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 d4, d4, d6
+ vadd.i16 d6, d23, d28 // p0 + q4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vrshr.u16 d2, d12, #4 // out p3
+ vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 d6, d6, d8
+ vadd.i16 d8, d24, d29 // q0 + q5
+ vadd.i16 d4, d17, d21 // p6 + p2
+ vrshr.u16 d3, d12, #4 // out p2
+ vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 d8, d8, d4
+ vadd.i16 d6, d25, d30 // q1 + q6
+ vadd.i16 d10, d17, d22 // p6 + p1
+ vrshr.u16 d4, d12, #4 // out p1
+ vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 d6, d6, d10
+ vadd.i16 d8, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vrshr.u16 d5, d12, #4 // out p0
+ vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 d8, d8, d10
+ vadd.i16 d10, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vadd.i16 d18, d19, d24 // p4 + q0
+ vrshr.u16 d6, d12, #4 // out q0
+ vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 d10, d10, d18
+ vadd.i16 d8, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vadd.i16 d18, d20, d25 // p3 + q1
+ vrshr.u16 d7, d12, #4 // out q1
+ vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 d18, d8, d18
+ vadd.i16 d10, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vadd.i16 d20, d21, d26 // p2 + q2
+ vrshr.u16 d8, d12, #4 // out q2
+ vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 d10, d10, d20
+ vadd.i16 d18, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vadd.i16 d20, d22, d27 // p1 + q3
+ vrshr.u16 d9, d12, #4 // out q3
+ vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 d18, d18, d20
+ vbif d5, d23, d15 // out p0
+ vrshr.u16 d10, d12, #4 // out q4
+ vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6)
+ vrshr.u16 d11, d12, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr // normal return: the caller's full write-back follows the bl
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r6
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r7
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16 // wd=16 instance of the filter core; presumably defines lpf_4_wd16_neon, the "bl" target used by lpf_4_wd16
+loop_filter 8 // wd=8 instance (presumably lpf_4_wd8_neon)
+loop_filter 6 // wd=6 instance (presumably lpf_4_wd6_neon)
+loop_filter 4 // wd=4 instance (presumably lpf_4_wd4_neon)
+
+.macro lpf_4_wd16
+ adr r6, 7f + CONFIG_THUMB // r6 = the expanding function's next local label 7 (inner-6-pixel writeback); the core leaves through "bx r6". CONFIG_THUMB presumably sets the Thumb bit - confirm in asm.S
+ adr r7, 8f + CONFIG_THUMB // r7 = the expanding function's next local label 8 (inner-4-pixel writeback); the core leaves through "bx r7"
+ bl lpf_4_wd16_neon // overwrites lr, which is why every user saves its own return address in r12 first
+.endm
+
+.macro lpf_4_wd8
+ adr r7, 8f + CONFIG_THUMB // r7 = the expanding function's next local label 8 (inner-4-pixel writeback); the core leaves through "bx r7"
+ bl lpf_4_wd8_neon // overwrites lr, which is why every user saves its own return address in r12 first
+.endm
+
+.macro lpf_4_wd6
+ bl lpf_4_wd6_neon // no alternate epilogue registers: the core's "bx r7" exit is only assembled for wd >= 8; it returns via lr, or via r12 when nothing is to be written
+.endm
+
+.macro lpf_4_wd4
+ bl lpf_4_wd4_neon // no alternate epilogue registers: the core returns via lr, or via r12 when nothing is to be written
+.endm
+
+function lpf_v_4_4_neon // wd=4 filter across the edge between rows p0 and q0, 4 pixels wide; r0 = q0 row, r1 = step between rows
+ mov r12, lr // lpf_4_wd4 uses bl (clobbers lr); the filter core can also return to our caller directly through r12
+ sub r10, r0, r1, lsl #1 // r10 = two rows above r0
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // undo the two post-increments: r0 is back at the q0 row
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #1 // r10 = p1 row again for the writeback
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 at its entry value
+ bx r12
+endfunc
+
+function lpf_h_4_4_neon // wd=4 filter across the edge between columns p0 and q0, 4 rows tall; r0 = q0 column of the first row, r1 = step between rows
+ mov r12, lr // lpf_4_wd4 uses bl (clobbers lr); the filter core can also return to our caller directly through r12
+ sub r10, r0, #4 // r10 = 2 pixels (4 bytes) left of r0; used for rows 0 and 1
+ add r0, r10, r1, lsl #1 // r0 = same column two rows further down; used for rows 2 and 3
+ vld1.16 {d22}, [r10], r1 // row 0
+ vld1.16 {d24}, [r0], r1 // row 2
+ vld1.16 {d23}, [r10], r1 // row 1
+ vld1.16 {d25}, [r0], r1 // row 3
+ add r0, r0, #4 // r0 = entry pointer + 4 rows
+
+ transpose_4x4h q11, q12, d22, d23, d24, d25 // macro defined elsewhere; expected to leave d22..d25 = columns p1, p0, q0, q1
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #2 // back up 4 rows ...
+ sub r10, r10, #4 // ... and 2 pixels: r10 = first row, column p1
+ transpose_4x4h q11, q12, d22, d23, d24, d25 // back from columns to rows
+ add r0, r10, r1, lsl #1 // r0 = two rows below r10
+
+ vst1.16 {d22}, [r10], r1 // row 0
+ vst1.16 {d24}, [r0], r1 // row 2
+ vst1.16 {d23}, [r10], r1 // row 1
+ vst1.16 {d25}, [r0], r1 // row 3
+ add r0, r0, #4 // return with r0 = entry pointer + 4 rows
+ bx r12
+endfunc
+
+function lpf_v_6_4_neon // wd=6 filter across the edge between rows p0 and q0, 4 pixels wide; reads p2..q2, writes back p1..q1
+ mov r12, lr // lpf_4_wd6 uses bl (clobbers lr); the filter core can also return to our caller directly through r12
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1 // r10 = three rows above r0
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1 // undo the three post-increments: r0 is back at the q0 row
+
+ lpf_4_wd6
+
+ sub r10, r0, r1, lsl #1 // r10 = p1 row; p2 and q2 are not stored
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 at its entry value
+ bx r12
+endfunc
+
+function lpf_h_6_4_neon // wd=6 filter across the edge between columns p0 and q0, 4 rows tall; loads 8 columns, writes back the inner 4
+ mov r12, lr // lpf_4_wd6 uses bl (clobbers lr); the filter core can also return to our caller directly through r12
+ sub r10, r0, #8 // r10 = 4 pixels (8 bytes) left of r0
+ vld1.16 {d20}, [r10, :64], r1 // row 0, left 4 pixels
+ vld1.16 {d24}, [r0, :64], r1 // row 0, right 4 pixels
+ vld1.16 {d21}, [r10, :64], r1 // row 1, left
+ vld1.16 {d25}, [r0, :64], r1 // row 1, right
+ vld1.16 {d22}, [r10, :64], r1 // row 2, left
+ vld1.16 {d26}, [r0, :64], r1 // row 2, right
+ vld1.16 {d23}, [r10, :64], r1 // row 3, left
+ vld1.16 {d27}, [r0, :64], r1 // row 3, right; r0 is now entry pointer + 4 rows
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23 // macro defined elsewhere; expected to leave d20..d23 = columns p3, p2, p1, p0
+ transpose_4x4h q12, q13, d24, d25, d26, d27 // expected to leave d24..d27 = columns q0, q1, q2, q3
+
+ lpf_4_wd6
+
+ sub r0, r0, #4 // 2 pixels left: column p1
+ transpose_4x4h q11, q12, d22, d23, d24, d25 // p1, p0, q0, q1 back from columns to rows
+ sub r10, r0, r1, lsl #2 // r10 = row 0
+ sub r0, r0, r1, lsl #1 // r0 = row 2
+
+ vst1.16 {d22}, [r10], r1 // row 0
+ vst1.16 {d24}, [r0], r1 // row 2
+ vst1.16 {d23}, [r10], r1 // row 1
+ vst1.16 {d25}, [r0], r1 // row 3
+ add r0, r0, #4 // return with r0 = entry pointer + 4 rows
+ bx r12
+endfunc
+
+function lpf_v_8_4_neon // wd=8 filter across the edge between rows p0 and q0, 4 pixels wide; reads p3..q3, writes p2..q2 (or p1..q1 via label 8)
+ mov r12, lr // lpf_4_wd8 uses bl (clobbers lr); every exit below returns through r12
+ sub r10, r0, r1, lsl #2 // r10 = four rows above r0
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2 // undo the four post-increments: r0 is back at the q0 row
+
+ lpf_4_wd8
+
+ sub r10, r0, r1, lsl #1 // normal return from the core: write the inner 6 rows
+ sub r10, r10, r1 // r10 = p2 row
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1 // leave r0 at its entry value
+ bx r12
+
+8: // entered from the core through "bx r7": only the inner 4 rows are written
+ sub r10, r0, r1, lsl #1 // r10 = p1 row
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 at its entry value
+ bx r12
+endfunc
+
+function lpf_h_8_4_neon // wd=8 filter across the edge between columns p0 and q0, 4 rows tall; writes all 8 columns back (or the inner 4 via label 8)
+ mov r12, lr // lpf_4_wd8 uses bl (clobbers lr); every exit below returns through r12
+ sub r10, r0, #8 // r10 = 4 pixels (8 bytes) left of r0
+ vld1.16 {d20}, [r10, :64], r1 // row 0, left 4 pixels
+ vld1.16 {d24}, [r0, :64], r1 // row 0, right 4 pixels
+ vld1.16 {d21}, [r10, :64], r1 // row 1, left
+ vld1.16 {d25}, [r0, :64], r1 // row 1, right
+ vld1.16 {d22}, [r10, :64], r1 // row 2, left
+ vld1.16 {d26}, [r0, :64], r1 // row 2, right
+ vld1.16 {d23}, [r10, :64], r1 // row 3, left
+ vld1.16 {d27}, [r0, :64], r1 // row 3, right; r0 is now entry pointer + 4 rows
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23 // macro defined elsewhere; expected to leave d20..d23 = columns p3, p2, p1, p0
+ transpose_4x4h q12, q13, d24, d25, d26, d27 // expected to leave d24..d27 = columns q0, q1, q2, q3
+
+ lpf_4_wd8
+
+ sub r0, r0, r1, lsl #2 // normal return from the core: r0 back to row 0
+ transpose_4x4h q10, q11, d20, d21, d22, d23 // left half back from columns to rows
+ transpose_4x4h q12, q13, d24, d25, d26, d27 // right half back from columns to rows
+ sub r10, r0, #8 // r10 = row 0, 4 pixels left of r0
+
+ vst1.16 {d20}, [r10, :64], r1 // row 0, left
+ vst1.16 {d24}, [r0, :64], r1 // row 0, right
+ vst1.16 {d21}, [r10, :64], r1 // row 1, left
+ vst1.16 {d25}, [r0, :64], r1 // row 1, right
+ vst1.16 {d22}, [r10, :64], r1 // row 2, left
+ vst1.16 {d26}, [r0, :64], r1 // row 2, right
+ vst1.16 {d23}, [r10, :64], r1 // row 3, left
+ vst1.16 {d27}, [r0, :64], r1 // row 3, right; r0 ends at entry pointer + 4 rows
+ bx r12
+8: // entered from the core through "bx r7": only columns p1..q1 are written
+ sub r0, r0, #4 // 2 pixels left: column p1
+ transpose_4x4h q11, q12, d22, d23, d24, d25 // p1, p0, q0, q1 back from columns to rows
+ sub r10, r0, r1, lsl #2 // r10 = row 0
+ sub r0, r0, r1, lsl #1 // r0 = row 2
+
+ vst1.16 {d22}, [r10], r1 // row 0
+ vst1.16 {d24}, [r0], r1 // row 2
+ vst1.16 {d23}, [r10], r1 // row 1
+ vst1.16 {d25}, [r0], r1 // row 3
+ add r0, r0, #4 // return with r0 = entry pointer + 4 rows
+ bx r12
+endfunc
+
+function lpf_v_16_4_neon // wd=16 filter across the edge between rows p0 and q0, 4 pixels wide; reads p6..q6, writes p5..q5, p2..q2 (label 7) or p1..q1 (label 8)
+ mov r12, lr // lpf_4_wd16 uses bl (clobbers lr); every exit below returns through r12
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1 // r10 = seven rows above r0
+ vld1.16 {d17}, [r10, :64], r1 // p6
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d18}, [r10, :64], r1 // p5
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d19}, [r10, :64], r1 // p4
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d28}, [r0, :64], r1 // q4
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d29}, [r0, :64], r1 // q5
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1 // undo the seven post-increments: r0 is back at the q0 row
+
+ lpf_4_wd16
+
+ sub r10, r0, r1, lsl #2 // normal return from the core: the 12 filtered rows are in d0-d11
+ sub r10, r10, r1, lsl #1 // r10 = p5 row (six rows above r0)
+ vst1.16 {d0}, [r10, :64], r1 // p5
+ vst1.16 {d6}, [r0, :64], r1 // q0
+ vst1.16 {d1}, [r10, :64], r1 // p4
+ vst1.16 {d7}, [r0, :64], r1 // q1
+ vst1.16 {d2}, [r10, :64], r1 // p3
+ vst1.16 {d8}, [r0, :64], r1 // q2
+ vst1.16 {d3}, [r10, :64], r1 // p2
+ vst1.16 {d9}, [r0, :64], r1 // q3
+ vst1.16 {d4}, [r10, :64], r1 // p1
+ vst1.16 {d10}, [r0, :64], r1 // q4
+ vst1.16 {d5}, [r10, :64], r1 // p0
+ vst1.16 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1 // leave r0 at its entry value
+ bx r12
+7: // entered from the core through "bx r6": write the inner 6 rows from d21-d26
+ sub r10, r0, r1
+ sub r10, r10, r1, lsl #1 // r10 = p2 row
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1 // leave r0 at its entry value
+ bx r12
+
+8: // entered from the core through "bx r7": write the inner 4 rows from d22-d25
+ sub r10, r0, r1, lsl #1 // r10 = p1 row
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1 // leave r0 at its entry value
+ bx r12
+endfunc
+
+function lpf_h_16_4_neon // wd=16 filter across the edge between columns p0 and q0, 4 rows tall; loads 16 columns around the edge and writes 16, 8 (label 7) or 4 (label 8) back
+ mov r12, lr // lpf_4_wd16 uses bl (clobbers lr); every exit below returns through r12
+ sub r10, r0, #16 // r10 = 8 pixels left of the edge (columns -8..-5)
+ sub r0, r0, #8 // r0 = 4 pixels left of the edge (columns -4..-1)
+ vld1.16 {d16}, [r10, :64], r1 // row 0, columns -8..-5
+ vld1.16 {d20}, [r0, :64], r1 // row 0, columns -4..-1
+ vld1.16 {d17}, [r10, :64], r1 // row 1
+ vld1.16 {d21}, [r0, :64], r1
+ vld1.16 {d18}, [r10, :64], r1 // row 2
+ vld1.16 {d22}, [r0, :64], r1
+ vld1.16 {d19}, [r10, :64], r1 // row 3
+ vld1.16 {d23}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2 // rewind both pointers to row 0 ...
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16 // ... and move 8 pixels right: r10 = columns 0..3
+ add r0, r0, #16 // r0 = columns 4..7
+ vld1.16 {d24}, [r10, :64], r1 // row 0, columns 0..3
+ vld1.16 {d28}, [r0, :64], r1 // row 0, columns 4..7
+ vld1.16 {d25}, [r10, :64], r1 // row 1
+ vld1.16 {d29}, [r0, :64], r1
+ vld1.16 {d26}, [r10, :64], r1 // row 2
+ vld1.16 {d30}, [r0, :64], r1
+ vld1.16 {d27}, [r10, :64], r1 // row 3
+ vld1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8 // r0 = entry pointer + 4 rows
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19 // macro defined elsewhere; expected to leave d16 = column -8, d17..d19 = p6, p5, p4
+ transpose_4x4h q10, q11, d20, d21, d22, d23 // expected: d20..d23 = p3, p2, p1, p0
+ transpose_4x4h q12, q13, d24, d25, d26, d27 // expected: d24..d27 = q0, q1, q2, q3
+ transpose_4x4h q14, q15, d28, d29, d30, d31 // expected: d28..d30 = q4, q5, q6, d31 = column 7
+
+ lpf_4_wd16
+
+ sub r0, r0, r1, lsl #2 // normal return from the core: r0 back to row 0
+ transpose_4x4h q8, q0, d16, d17, d0, d1 // column -8 and p6 as loaded, plus filtered p5/p4 (d0/d1), back to rows
+ transpose_4x4h q1, q2, d2, d3, d4, d5 // filtered p3..p0 back to rows
+ transpose_4x4h q3, q4, d6, d7, d8, d9 // filtered q0..q3 back to rows
+ transpose_4x4h q5, q15, d10, d11, d30, d31 // filtered q4/q5 plus q6 and column 7 (d30/d31, not written by the visible part of the core) back to rows
+ sub r10, r0, #16 // r10 = columns -8..-5
+ sub r0, r0, #8 // r0 = columns -4..-1
+
+ vst1.16 {d16}, [r10, :64], r1 // row 0, columns -8..-5
+ vst1.16 {d2}, [r0, :64], r1 // row 0, columns -4..-1
+ vst1.16 {d17}, [r10, :64], r1 // row 1
+ vst1.16 {d3}, [r0, :64], r1
+ vst1.16 {d0}, [r10, :64], r1 // row 2
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d1}, [r10, :64], r1 // row 3
+ vst1.16 {d5}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2 // rewind both pointers to row 0 ...
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16 // ... and move 8 pixels right: r10 = columns 0..3
+ add r0, r0, #16 // r0 = columns 4..7
+ vst1.16 {d6}, [r10, :64], r1 // row 0, columns 0..3
+ vst1.16 {d10}, [r0, :64], r1 // row 0, columns 4..7
+ vst1.16 {d7}, [r10, :64], r1 // row 1
+ vst1.16 {d11}, [r0, :64], r1
+ vst1.16 {d8}, [r10, :64], r1 // row 2
+ vst1.16 {d30}, [r0, :64], r1
+ vst1.16 {d9}, [r10, :64], r1 // row 3
+ vst1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8 // return with r0 = entry pointer + 4 rows
+
+ bx r12
+
+7: // entered from the core through "bx r6": write columns p3..q3 from d20-d27
+ sub r0, r0, r1, lsl #2 // r0 back to row 0
+ transpose_4x4h q10, q11, d20, d21, d22, d23 // left half back from columns to rows
+ transpose_4x4h q12, q13, d24, d25, d26, d27 // right half back from columns to rows
+ sub r10, r0, #8 // r10 = row 0, 4 pixels left of r0
+
+ vst1.16 {d20}, [r10, :64], r1 // row 0, left
+ vst1.16 {d24}, [r0, :64], r1 // row 0, right
+ vst1.16 {d21}, [r10, :64], r1 // row 1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1 // row 2
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1 // row 3
+ vst1.16 {d27}, [r0, :64], r1 // r0 ends at entry pointer + 4 rows
+ bx r12
+8: // entered from the core through "bx r7": write columns p1..q1 from d22-d25
+ sub r0, r0, #4 // 2 pixels left: column p1
+ transpose_4x4h q11, q12, d22, d23, d24, d25 // p1, p0, q0, q1 back from columns to rows
+ sub r10, r0, r1, lsl #2 // r10 = row 0
+ sub r0, r0, r1, lsl #1 // r0 = row 2
+
+ vst1.16 {d22}, [r10], r1 // row 0
+ vst1.16 {d24}, [r0], r1 // row 2
+ vst1.16 {d23}, [r10], r1 // row 1
+ vst1.16 {d25}, [r0], r1 // row 3
+ add r0, r0, #4 // return with r0 = entry pointer + 4 rows
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 // Walk the vmask bits; for each set bit derive E/I/H from the filter level and run the widest enabled filter on one 4-pixel unit
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 = 100 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // r4 = b4_stride, r5 = lut
+ ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded
+ sub sp, sp, #8 // scratch slot: r6/r7 are spilled here inside the loop (strd/ldrd below)
+ clz r9, r8
+ rsb r9, r9, #24 // bitdepth_min_8
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2 // r4 = l minus one row of 4-byte entries (the "offset" neighbour), r3 stays at l
+.else
+ sub r3, r3, #4 // r3 = entry to the left of l (the "offset" neighbour)
+ lsl r4, r4, #2 // r4 = b4_stride in bytes, the per-iteration step of r3
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1: // One iteration per vmask bit
+ tst r6, #0x01
+ strd r6, r7, [sp] // r6 is reused as scratch below; both mask words are reloaded at label 8
+.ifc \dir, v
+ ldrb r10, [r4], #4 // l[offset][0], then step to the next entry
+ ldrb r11, [r3], #4 // l[0][0], then step to the next entry
+.else
+ ldrb r10, [r3] // l[offset][0]
+ ldrb r11, [r3, #4] // l[0][0]
+ add r3, r3, r4 // next row of l
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ orrs r12, r10, r11
+ vdup.16 d31, r9 // bitdepth_min_8
+ beq 7f // if (!(l[0][0] | l[offset][0])) continue;
+ cmp r11, #0 // Check for nonzero values in l[0][0]
+ ldrb r6, [r5], #8 // sharp[0]
+ it eq
+ moveq r11, r10 // if (!l[0][0]) L = l[offset][0]
+ ldrb r12, [r5] // sharp[1]
+ lsr r6, r11, r6 // L >> sharp[0]
+ sub r5, r5, #8 // undo the post-increment so r5 points at sharp[0] again
+ cmp r12, r6
+ lsr r10, r11, #4 // H
+ add r11, r11, #2 // L + 2
+ it lt
+ movlt r6, r12 // imin(L >> sharp[0], sharp[1])
+ add r11, r11, r11 // 2*(L + 2)
+ cmp r6, #1
+ lsl r10, r10, r9 // H << bitdepth_min_8
+ it lt
+ movlt r6, #1 // imax(imin(), 1) = limit = I
+ vdup.16 d12, r10 // H << bitdepth_min_8
+ add r11, r11, r6 // 2*(L + 2) + limit = E
+ lsl r6, r6, r9 // I << bitdepth_min_8
+ lsl r11, r11, r9 // E << bitdepth_min_8
+ vdup.16 d11, r6 // I << bitdepth_min_8
+ vdup.16 d10, r11 // E << bitdepth_min_8
+
+.ifc \type, y
+ tst r2, #0x01 // vmask[2] bit set for this unit?
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_4_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x01 // vmask[1] bit set for this unit?
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_4_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_4_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_4_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #2
+.else
+7:
+.endif
+8:
+ ldrd r6, r7, [sp] // restore the mask words spilled at the top of the loop
+.ifc \type, y
+ lsr r2, r2, #1 // vmask[2] >>= 1
+.endif
+.ifc \dir, v
+ add r0, r0, #8 // next unit: 4 pixels of 2 bytes each
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ lsrs r6, r6, #1 // vmask[0] >>= 1
+ lsr r7, r7, #1 // vmask[1] >>= 1
+ bne 1b // flags come from the lsrs above: loop while any bit of the combined mask is left
+
+ add sp, sp, #8 // drop the scratch slot
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y // lpf_v_sb_y_16bpc_neon: luma, uses vmask[0..2] and the wd 4/8/16 filters
+lpf_func h, y // lpf_h_sb_y_16bpc_neon
+lpf_func v, uv // lpf_v_sb_uv_16bpc_neon: chroma, uses vmask[0..1] and the wd 4/6 filters
+lpf_func h, uv // lpf_h_sb_uv_16bpc_neon
diff --git a/third_party/dav1d/src/arm/32/looprestoration.S b/third_party/dav1d/src/arm/32/looprestoration.S
new file mode 100644
index 0000000000..be5c658d6d
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration.S
@@ -0,0 +1,791 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // 32 zero bytes sit in front of right_ext_mask, so that a load
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // starting k bytes before the label yields k bytes of 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // (lanes to keep) followed by 0xff (lanes to replace);
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // the functions below feed the result to vbit for right-edge padding
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[8], intptr_t w,
+// int h, enum LrEdgeFlags edges);
+function wiener_filter_h_8bpc_neon, export=1 // Horizontal 7-tap pass: filters two source rows per iteration into two rows of the int16_t dst buffer
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 = 100 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // r4 = fh, r5 = w
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ mov r8, r5 // keep w; r5 is counted down along a row and reloaded from r8 for the next rows
+ vld1.16 {q0}, [r4, :128] // q0 = fh[0..7]; only fh[3..6] (d0[3], d1[0..2]) are used, each outer tap is applied to a symmetric pixel pair
+ movw r9, #(1 << 14) - (1 << 2)
+ vdup.16 q14, r9 // q14 = constant subtracted from (centre pixel << 7) before the saturating add
+ vmov.s16 q15, #2048 // q15 = constant added after the final >> 3
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7 // w rounded up to a multiple of 8
+ lsl r10, r10, #1 // ... times 2 bytes per int16_t
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10 // r12 = second output row
+ lsl r10, r10, #1 // output step for two rows
+ add lr, r2, r3 // lr = second source row
+ lsl r3, r3, #1 // source step for two rows
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1 // r10 = what is left to add after a row has been written
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #3 // the 3 left pixels are read straight from the source row
+ sub lr, lr, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #3
+
+
+1: // Loop vertically
+ vld1.8 {q2}, [r2]! // first 16 bytes of row 0
+ vld1.8 {q9}, [lr]! // first 16 bytes of row 1
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f // left == NULL: the left pixels are already part of q2/q9
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[1]}, [r1]! // 4 left pixels of row 0 into the top lane of q1
+ // Move r2/lr back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #3
+ sub lr, lr, #3
+ vld1.32 {d17[1]}, [r1]! // 4 left pixels of row 1 into the top lane of q8
+ vext.8 q2, q1, q2, #13 // last 3 left pixels followed by the first 13 source pixels
+ vext.8 q9, q8, q9, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q2 to have 3x the first byte at the front.
+ vdup.8 q1, d4[0]
+ vdup.8 q8, d18[0]
+ // Move r2 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub r2, r2, #3
+ sub lr, lr, #3
+ vext.8 q2, q1, q2, #13
+ vext.8 q9, q8, q9, #13
+
+2:
+ vmovl.u8 q1, d4 // widen to 16 bit: q1/q2 = row 0, q8/q9 = row 1
+ vmovl.u8 q2, d5
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14 // r2/lr are 13 bytes past the row start on every path, so this indexes pixel w - 1
+ ldrb r11, [r2, r9]
+ ldrb r9, [lr, r9]
+ // Fill q12/q13 with the right padding pixel
+ vdup.16 q12, r11
+ vdup.16 q13, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6 // r4 (fh) is free for reuse: the coefficients already live in q0
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q10, q11}, [r4] // mask: zero for the valid lanes, 0xff for the lanes to pad
+
+ vbit q1, q12, q10 // take the padding pixel wherever the mask is set
+ vbit q2, q12, q11
+ vbit q8, q13, q10
+ vbit q9, q13, q11
+
+4: // Loop horizontally
+ vext.8 q11, q1, q2, #4 // row 0: pixel i+2
+ vext.8 q5, q1, q2, #8 // pixel i+4
+ vext.8 q10, q1, q2, #2 // pixel i+1
+ vext.8 q6, q1, q2, #10 // pixel i+5
+ vext.8 q7, q1, q2, #12 // pixel i+6
+ vext.8 q4, q1, q2, #6 // pixel i+3 (centre)
+ vadd.i16 q5, q5, q11 // i+2 + i+4
+ vadd.i16 q6, q6, q10 // i+1 + i+5
+ vadd.i16 q7, q7, q1 // i+0 + i+6
+ vmul.s16 q3, q4, d0[3] // centre * fh[3]
+ vmla.s16 q3, q5, d1[0] // + pair * fh[4]
+ vmla.s16 q3, q6, d1[1] // + pair * fh[5]
+ vmla.s16 q3, q7, d1[2] // + pair * fh[6]
+
+ vext.8 q4, q8, q9, #4 // row 1: same sums with a different register assignment
+ vext.8 q6, q8, q9, #8
+ vext.8 q11, q8, q9, #2
+ vext.8 q7, q8, q9, #10
+ vadd.i16 q6, q6, q4 // i+2 + i+4
+ vext.8 q4, q8, q9, #12
+ vext.8 q5, q8, q9, #6 // centre
+ vadd.i16 q7, q7, q11 // i+1 + i+5
+ vadd.i16 q4, q4, q8 // i+0 + i+6
+ vmul.s16 q10, q5, d0[3]
+ vmla.s16 q10, q6, d1[0]
+ vmla.s16 q10, q7, d1[1]
+ vmla.s16 q10, q4, d1[2]
+
+ vext.8 q1, q1, q2, #6 // centre pixels of row 0 (q1 is refilled from q2 before the next pass)
+ vext.8 q8, q8, q9, #6 // centre pixels of row 1
+ vshl.s16 q1, q1, #7
+ vshl.s16 q8, q8, #7
+ vsub.s16 q1, q1, q14
+ vsub.s16 q8, q8, q14
+ vqadd.s16 q3, q3, q1 // saturating add of the biased centre term
+ vqadd.s16 q10, q10, q8
+ vshr.s16 q3, q3, #3
+ vshr.s16 q10, q10, #3
+ vadd.s16 q3, q3, q15
+ vadd.s16 q10, q10, q15
+ subs r5, r5, #8 // 8 output pixels done; the flags are consumed by "ble 9f" below
+ vst1.16 {q3}, [r0, :128]!
+ vst1.16 {q10}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q1, q2 // slide the window: the upper 8 pixels become the lower 8
+ vmov q8, q9
+ vld1.8 {d4}, [r2]! // fetch 8 more pixels per row
+ vld1.8 {d18}, [lr]!
+ vmovl.u8 q2, d4
+ vmovl.u8 q9, d18
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows are handled per pass
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8 // restore the full width
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[8], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride);
+function wiener_filter_v_8bpc_neon, export=1 // Vertical 7-tap pass over the int16_t mid buffer, producing 8-bit pixels in 8-pixel-wide column slices
+ push {r4-r7,lr}
+ vpush {q4-q6} // 20 + 48 = 68 bytes pushed, so the stack arguments start at sp + 68
+ ldrd r4, r5, [sp, #68] // r4 = h, r5 = fv
+ ldrd r6, r7, [sp, #76] // r6 = edges, r7 = mid_stride
+ mov lr, r4 // keep h; r4 is counted down per slice and reloaded from lr
+ vld1.16 {q0}, [r5, :128] // q0 = fv[0..7]; only fv[3..6] (d0[3], d1[0..2]) are used, each outer tap is applied to a symmetric row pair
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1 // start reading two rows above mid
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8 // the first top row is used twice: only two rows exist above the content
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ vadd.i16 q4, q10, q12 // rows -1 and +1 around the centre row q11
+ vadd.i16 q5, q9, q13 // rows -2 and +2
+ vadd.i16 q6, q8, q14 // rows -3 and +3
+ vmull.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d8, d1[0]
+ vmlal.s16 q2, d10, d1[1]
+ vmlal.s16 q2, d12, d1[2]
+ vmull.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d9, d1[0]
+ vmlal.s16 q3, d11, d1[1]
+ vmlal.s16 q3, d13, d1[2]
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqmovun.s16 d4, q2
+ vst1.8 {d4}, [r0, :64], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9 // slide the 7-row window down by one row
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q14, q13 // q12/q13 hold the two edge rows; repeat the last one so that q14 is defined for the filter
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8 // 8 columns done
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0 // dst -= stride * h
+ mls r2, r7, r12, r2 // mid -= mid_stride * (h plus the edge rows counted above)
+ add r0, r0, #8 // next 8 output pixels
+ add r2, r2, #16 // next 8 int16_t inputs
+ mov r4, lr // restore the full height
+ b 1b
+
+0:
+ vpop {q4-q6}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_8bpc_neon, export=1 // Horizontal sums of 3 neighbouring pixels (into sum) and of their squares (into sumsq), two rows per iteration
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 = 100 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // r4 = stride, r5 = w
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1 // r9 is in bytes of the int16_t sum rows; it is doubled when applied to sumsq
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #2 // the 2 left pixels are read straight from the source row
+ sub r12, r12, #2
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #2
+
+
+1: // Loop vertically
+ vld1.8 {q0}, [r3]! // first 16 bytes of row 0
+ vld1.8 {q4}, [r12]! // first 16 bytes of row 1
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f // left == NULL: the left pixels are already part of q0/q4
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[]}, [r2]! // 4 left pixels of row 0, replicated across d3
+ // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #2
+ sub r12, r12, #2
+ vld1.32 {d11[]}, [r2]! // 4 left pixels of row 1, replicated across d11
+ vext.8 q0, q1, q0, #14 // last 2 left pixels followed by the first 14 source pixels
+ vext.8 q4, q5, q4, #14
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q0 to have 2x the first byte at the front.
+ vdup.8 q1, d0[0]
+ vdup.8 q5, d8[0]
+ // Move r3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub r3, r3, #2
+ sub r12, r12, #2
+ vext.8 q0, q1, q0, #14
+ vext.8 q4, q5, q4, #14
+
+2:
+ vmull.u8 q1, d0, d0 // squares: q1/q2 for row 0, q5/q6 for row 1
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1) // r3/r12 are 14 bytes past the row start and r5 is w + 2, so this indexes pixel w - 1
+ ldrb r11, [r3, lr]
+ ldrb lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.8 q14, r11
+ vdup.8 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in q0/4.b[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5
+ vld1.8 {q13}, [lr] // mask: zero for the valid lanes, 0xff for the lanes to pad
+
+ vbit q0, q14, q13 // take the padding pixel wherever the mask is set
+ vbit q4, q15, q13
+
+ // Update the precalculated squares
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+4: // Loop horizontally
+ vext.8 d16, d0, d1, #1 // row 0: pixel i+1
+ vext.8 d17, d0, d1, #2 // row 0: pixel i+2
+ vext.8 d18, d8, d9, #1 // row 1: pixel i+1
+ vext.8 d19, d8, d9, #2 // row 1: pixel i+2
+ vaddl.u8 q3, d0, d16
+ vaddw.u8 q3, q3, d17 // q3 = 3-pixel sums of row 0
+ vaddl.u8 q7, d8, d18
+ vaddw.u8 q7, q7, d19 // q7 = 3-pixel sums of row 1
+
+ vext.8 q8, q1, q2, #2 // row 0 squares at i+1
+ vext.8 q9, q1, q2, #4 // row 0 squares at i+2
+ vext.8 q10, q5, q6, #2 // row 1 squares at i+1
+ vext.8 q11, q5, q6, #4 // row 1 squares at i+2
+
+ vaddl.u16 q12, d2, d16
+ vaddl.u16 q13, d3, d17
+ vaddw.u16 q12, q12, d18
+ vaddw.u16 q13, q13, d19 // q12/q13 = 32-bit sums of squares for row 0
+
+ vaddl.u16 q8, d10, d20
+ vaddl.u16 q9, d11, d21
+ vaddw.u16 q8, q8, d22
+ vaddw.u16 q9, q9, d23 // q8/q9 = 32-bit sums of squares for row 1
+
+ subs r5, r5, #8 // 8 outputs done; the flags are consumed by "ble 9f" below
+ vst1.16 {q3}, [r1, :128]!
+ vst1.16 {q7}, [r11, :128]!
+ vst1.32 {q12, q13}, [r0, :128]!
+ vst1.32 {q8, q9}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vld1.8 {d6}, [r3]! // fetch 8 more pixels per row
+ vld1.8 {d14}, [r12]!
+ vmov q1, q2 // slide the squares: the upper half becomes the lower half
+ vmov q5, q6
+ vext.8 q0, q0, q3, #8 // slide the pixels: old upper 8 followed by the 8 new ones
+ vext.8 q4, q4, q7, #8
+ vmull.u8 q2, d6, d6 // squares of the new pixels
+ vmull.u8 q6, d14, d14
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows are handled per pass
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8 // restore the full width
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_8bpc_neon, export=1 // Horizontal sums of 5 neighbouring pixels (into sum) and of their squares (into sumsq), two rows per iteration
+ push {r4-r11,lr}
+ vpush {q4-q7} // 36 + 64 = 100 bytes pushed, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // r4 = stride, r5 = w
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1 // r9 is in bytes of the int16_t sum rows; it is doubled when applied to sumsq
+ add lr, lr, #8
+ sub r4, r4, lr // likewise take the number of pixels read off the source step
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #3 // the 3 left pixels are read straight from the source row
+ sub r12, r12, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #3
+
+1: // Loop vertically
+ vld1.8 {q0}, [r3]! // first 16 bytes of row 0
+ vld1.8 {q4}, [r12]! // first 16 bytes of row 1
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f // left == NULL: the left pixels are already part of q0/q4
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[]}, [r2]! // 4 left pixels of row 0, replicated across d3
+ // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #3
+ sub r12, r12, #3
+ vld1.32 {d11[]}, [r2]! // 4 left pixels of row 1, replicated across d11
+ vext.8 q0, q1, q0, #13 // last 3 left pixels followed by the first 13 source pixels
+ vext.8 q4, q5, q4, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q0 to have 3x the first byte at the front.
+ vdup.8 q1, d0[0]
+ vdup.8 q5, d8[0]
+ // Move r3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub r3, r3, #3
+ sub r12, r12, #3
+ vext.8 q0, q1, q0, #13
+ vext.8 q4, q5, q4, #13
+
+2:
+ vmull.u8 q1, d0, d0 // squares: q1/q2 for row 0, q5/q6 for row 1
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1) // r3/r12 are 13 bytes past the row start and r5 is w + 2, so this indexes pixel w - 1
+ ldrb r11, [r3, lr]
+ ldrb lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.8 q14, r11
+ vdup.8 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -1
+ sub lr, lr, r5
+ vld1.8 {q13}, [lr] // mask: zero for the valid lanes, 0xff for the lanes to pad
+
+ vbit q0, q14, q13 // take the padding pixel wherever the mask is set
+ vbit q4, q15, q13
+
+ // Update the precalculated squares
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+4: // Loop horizontally
+ vext.8 d16, d0, d1, #1 // row 0: pixels i+1 .. i+4
+ vext.8 d17, d0, d1, #2
+ vext.8 d18, d0, d1, #3
+ vext.8 d19, d0, d1, #4
+ vext.8 d20, d8, d9, #1 // row 1: pixels i+1 .. i+4
+ vext.8 d21, d8, d9, #2
+ vext.8 d22, d8, d9, #3
+ vext.8 d23, d8, d9, #4
+ vaddl.u8 q3, d0, d16
+ vaddl.u8 q12, d17, d18
+ vaddl.u8 q7, d8, d20
+ vaddl.u8 q13, d21, d22
+ vaddw.u8 q3, q3, d19
+ vaddw.u8 q7, q7, d23
+ vadd.u16 q3, q3, q12 // q3 = 5-pixel sums of row 0
+ vadd.u16 q7, q7, q13 // q7 = 5-pixel sums of row 1
+
+ vext.8 q8, q1, q2, #2 // row 0 squares at i+1 .. i+4
+ vext.8 q9, q1, q2, #4
+ vext.8 q10, q1, q2, #6
+ vext.8 q11, q1, q2, #8
+ vaddl.u16 q12, d2, d16
+ vaddl.u16 q13, d3, d17
+ vaddl.u16 q8, d18, d20
+ vaddl.u16 q9, d19, d21
+ vaddw.u16 q12, q12, d22
+ vaddw.u16 q13, q13, d23
+ vadd.i32 q12, q12, q8 // q12/q13 = 32-bit sums of squares for row 0
+ vadd.i32 q13, q13, q9
+ vext.8 q8, q5, q6, #2 // row 1 squares at i+1 .. i+4
+ vext.8 q9, q5, q6, #4
+ vext.8 q10, q5, q6, #6
+ vext.8 q11, q5, q6, #8
+ vaddl.u16 q1, d10, d16 // q1 and q5 are overwritten here; they are refilled from q2/q6 before the next pass
+ vaddl.u16 q5, d11, d17
+ vaddl.u16 q8, d18, d20
+ vaddl.u16 q9, d19, d21
+ vaddw.u16 q1, q1, d22
+ vaddw.u16 q5, q5, d23
+ vadd.i32 q10, q1, q8 // q10/q11 = 32-bit sums of squares for row 1
+ vadd.i32 q11, q5, q9
+
+ subs r5, r5, #8 // 8 outputs done; the flags are consumed by "ble 9f" below
+ vst1.16 {q3}, [r1, :128]!
+ vst1.16 {q7}, [r11, :128]!
+ vst1.32 {q12, q13}, [r0, :128]!
+ vst1.32 {q10, q11}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vld1.8 {d6}, [r3]! // fetch 8 more pixels per row
+ vld1.8 {d14}, [r12]!
+ vmov q1, q2 // slide the squares: the upper half becomes the lower half
+ vmov q5, q6
+ vext.8 q0, q0, q3, #8 // slide the pixels: old upper 8 followed by the 8 new ones
+ vext.8 q4, q4, q7, #8
+ vmull.u8 q2, d6, d6 // squares of the new pixels
+ vmull.u8 q6, d14, d14
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows are handled per pass
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8 // restore the full width
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 8
diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S
new file mode 100644
index 0000000000..d699617a87
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration16.S
@@ -0,0 +1,801 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // 32 zero bytes: lanes loaded from here are left untouched by vbit
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask: // Users load 32 bytes starting at (right_ext_mask - 2*valid_pixels), so the mask turns to 0xff right after the valid pixels
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff // 32 0xff bytes: lanes loaded from here are replaced by the padding pixel
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+ push {r4-r11,lr} // Horizontal Wiener pass; two rows (r2 and lr) are filtered per vertical iteration
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // r4 = fh, r5 = w (36 bytes of GPRs + 64 bytes of q4-q7 were pushed)
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ ldr r8, [sp, #116] // bitdepth_max
+ vld1.16 {q0}, [r4, :128] // q0 = 8 x int16 loaded from fh
+ clz r8, r8
+ vmov.i32 q14, #1
+ sub r9, r8, #38 // -(bitdepth + 6)
+ sub r8, r8, #25 // -round_bits_h
+ neg r9, r9 // bitdepth + 6
+ vdup.32 q1, r9
+ vdup.32 q13, r8 // -round_bits_h
+ vmov.i16 q15, #8192 // offset subtracted from every result right before it is stored
+ vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
+ mov r8, r5 // backup of w, restored when moving on to the next pair of rows
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10 // r12 = dst for the second row
+ lsl r10, r10, #1
+ add lr, r2, r3 // lr = src for the second row
+ lsl r3, r3, #1
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #6
+ sub lr, lr, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #6
+
+
+1: // Loop vertically
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q4, q5}, [lr]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d3}, [r1]! // left pixels for the first row, into the high half of q1
+ // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vld1.16 {d13}, [r1]! // left pixels for the second row, into the high half of q6
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+ // and shift q2/q3 to have 3x the first pixel at the front.
+ vdup.16 q1, d4[0]
+ vdup.16 q6, d8[0]
+ // Move r2 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ lsl r9, r9, #1
+ ldrh r11, [r2, r9]
+ ldrh r9, [lr, r9]
+ // Fill q11/q12 with the right padding pixel
+ vdup.16 q11, r11
+ vdup.16 q12, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q9, q10}, [r4]
+
+ vbit q2, q11, q9
+ vbit q3, q11, q10
+ vbit q4, q12, q9
+ vbit q5, q12, q10
+
+4: // Loop horizontally
+ vext.8 q7, q2, q3, #4
+ vext.8 q8, q2, q3, #8
+ vext.8 q6, q2, q3, #2
+ vext.8 q9, q2, q3, #10
+ vadd.i16 q8, q8, q7 // pixels at distance 1 from the centre
+ vadd.i16 q9, q9, q6 // pixels at distance 2 from the centre
+ vext.8 q6, q2, q3, #12
+ vext.8 q7, q2, q3, #6 // centre pixels
+ vadd.i16 q2, q2, q6 // pixels at distance 3 from the centre
+ vmull.s16 q6, d14, d0[3] // only fh[3..6] are used; each outer coefficient is applied to a summed pair
+ vmlal.s16 q6, d16, d1[0]
+ vmlal.s16 q6, d18, d1[1]
+ vmlal.s16 q6, d4, d1[2]
+ vmull.s16 q7, d15, d0[3]
+ vmlal.s16 q7, d17, d1[0]
+ vmlal.s16 q7, d19, d1[1]
+ vmlal.s16 q7, d5, d1[2]
+
+ vext.8 q8, q4, q5, #4 // same computation for the second row (q4/q5)
+ vext.8 q10, q4, q5, #8
+ vext.8 q9, q4, q5, #2
+ vext.8 q2, q4, q5, #10
+ vadd.i16 q10, q10, q8
+ vadd.i16 q2, q2, q9
+ vext.8 q8, q4, q5, #12
+ vext.8 q9, q4, q5, #6
+ vadd.i16 q4, q4, q8
+ vmull.s16 q8, d18, d0[3]
+ vmlal.s16 q8, d20, d1[0]
+ vmlal.s16 q8, d4, d1[1]
+ vmlal.s16 q8, d8, d1[2]
+ vmull.s16 q9, d19, d0[3]
+ vmlal.s16 q9, d21, d1[0]
+ vmlal.s16 q9, d5, d1[1]
+ vmlal.s16 q9, d9, d1[2]
+
+ vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q14
+ vadd.i32 q8, q8, q14
+ vadd.i32 q9, q9, q14
+ vrshl.s32 q6, q6, q13 // rounding shift by the negative amount in q13, i.e. a right shift
+ vrshl.s32 q7, q7, q13
+ vrshl.s32 q8, q8, q13
+ vrshl.s32 q9, q9, q13
+ vqmovun.s32 d12, q6
+ vqmovun.s32 d13, q7
+ vqmovun.s32 d14, q8
+ vqmovun.s32 d15, q9
+ vmin.u16 q6, q6, q10
+ vmin.u16 q7, q7, q10
+ vsub.i16 q6, q6, q15
+ vsub.i16 q7, q7, q15
+ subs r5, r5, #8 // 8 output pixels per row were produced
+ vst1.16 {q6}, [r0, :128]!
+ vst1.16 {q7}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q2, q3
+ vmov q4, q5
+ vld1.16 {q3}, [r2]!
+ vld1.16 {q5}, [lr]!
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows were handled in this pass
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+ push {r4-r7,lr} // Vertical 7-tap Wiener pass; the image is processed in 8 pixel wide column slices
+ vpush {q4-q5}
+ ldrd r4, r5, [sp, #52] // r4 = h, r5 = fv (20 bytes of GPRs + 32 bytes of q4-q5 were pushed)
+ ldrd r6, r7, [sp, #60] // r6 = edges, r7 = mid_stride
+ ldr lr, [sp, #68] // bitdepth_max
+ vld1.16 {q0}, [r5, :128] // q0 = 8 x int16 loaded from fv
+ vdup.16 q5, lr // q5 = bitdepth_max, used to clamp the output
+ clz lr, lr
+ sub lr, lr, #11 // round_bits_v
+ vdup.32 q4, lr
+ mov lr, r4 // backup of h, restored for every column slice
+ vneg.s32 q4, q4 // -round_bits_v
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d20, d0[2]
+ vmlal.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d24, d1[0]
+ vmlal.s16 q2, d26, d1[1]
+ vmlal.s16 q2, d28, d1[2]
+ vmull.s16 q3, d17, d0[0]
+ vmlal.s16 q3, d19, d0[1]
+ vmlal.s16 q3, d21, d0[2]
+ vmlal.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d25, d1[0]
+ vmlal.s16 q3, d27, d1[1]
+ vmlal.s16 q3, d29, d1[2]
+ vrshl.s32 q2, q2, q4 // round_bits_v
+ vrshl.s32 q3, q3, q4
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q5 // bitdepth_max
+ vst1.16 {q2}, [r0, :128], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9 // slide the 7-row window down by one row
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q14, q13 // q12/q13 hold the two edge rows just loaded; pad q14 (read by the filter) from the last one
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8 // 8 columns were finished
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0
+ mls r2, r7, r12, r2
+ add r0, r0, #16
+ add r2, r2, #16
+ mov r4, lr
+ b 1b
+
+0:
+ vpop {q4-q5}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+ push {r4-r11,lr} // Horizontal 3-pixel sums and sums of squares; two rows are handled per vertical iteration
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // r4 = stride, r5 = w (36 bytes of GPRs + 64 bytes of q4-q7 were pushed)
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #4
+ sub r12, r12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #4
+
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]! // left pixels for the first row, into the high half of q2
+ // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vld1.16 {d13}, [r2]! // left pixels for the second row, into the high half of q6
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 2x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in q0/1.h[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr]
+
+ vbit q0, q14, q12
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2 // first row shifted by one pixel
+ vext.8 q10, q4, q5, #2 // second row shifted by one pixel
+ vext.8 q9, q0, q1, #4 // first row shifted by two pixels
+ vext.8 q11, q4, q5, #4 // second row shifted by two pixels
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9 // q2 = 3-pixel sums, first row
+ vadd.i16 q3, q3, q11 // q3 = 3-pixel sums, second row
+
+ vmull.u16 q6, d0, d0 // q6/q7 = 3-pixel sums of squares, first row
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8 // q12/q13 = 3-pixel sums of squares, second row
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+ subs r5, r5, #8 // 8 output elements per row were produced
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows were handled in this pass
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+ push {r4-r11,lr} // Horizontal 5-pixel sums and sums of squares; two rows are handled per vertical iteration
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // r4 = stride, r5 = w (36 bytes of GPRs + 64 bytes of q4-q7 were pushed)
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+ add lr, lr, #8 // the input stride is reduced by the aligned width plus 8 pixels
+ sub r4, r4, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #6
+ sub r12, r12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #6
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]! // left pixels for the first row, into the high half of q2
+ // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vld1.16 {d13}, [r2]! // left pixels for the second row, into the high half of q6
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 3x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -2
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr]
+
+ vbit q0, q14, q12
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2 // rows shifted by one and by two pixels
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9 // partial sums over the first three pixels
+ vadd.i16 q3, q3, q11
+
+ vmull.u16 q6, d0, d0 // partial sums of squares over the first three pixels
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ vext.8 q8, q0, q1, #6 // rows shifted by three and by four pixels
+ vext.8 q10, q4, q5, #6
+ vext.8 q9, q0, q1, #8
+ vext.8 q11, q4, q5, #8
+ vadd.i16 q2, q2, q8
+ vadd.i16 q3, q3, q10
+ vadd.i16 q2, q2, q9 // q2/q3 = complete 5-pixel sums
+ vadd.i16 q3, q3, q11
+
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d1, d1 // d1 holds the same data as d18 here (shift by 8 bytes), likewise d9 and d22 below
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d9, d9
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ subs r5, r5, #8 // 8 output elements per row were produced
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows were handled in this pass
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 16
diff --git a/third_party/dav1d/src/arm/32/looprestoration_common.S b/third_party/dav1d/src/arm/32/looprestoration_common.S
new file mode 100644
index 0000000000..b080bb5115
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration_common.S
@@ -0,0 +1,453 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+ push {r4-r9,lr} // Vertical 3-row sums over sumsq/sum, written back in place, in 8 element wide column slices
+ ldr r4, [sp, #28] // r4 = edges (28 bytes of GPRs were pushed)
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #2 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Sum all h+2 lines with the main loop
+ add lr, lr, #2
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q8-q13 and q0-q2 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q8, q9}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q1}, [r6, :128], r8
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q10, q8
+ vmov q11, q9
+ vmov q1, q0
+ vmov q12, q8
+ vmov q13, q9
+ vmov q2, q0
+
+3:
+ subs r3, r3, #1 // the flags set here are consumed by the ble further down; nothing in between alters them
+.macro add3
+ vadd.i32 q8, q8, q10
+ vadd.i32 q9, q9, q11
+ vadd.i16 q0, q0, q1
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i16 q0, q0, q2
+ vst1.32 {q8, q9}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add3
+ vmov q8, q10 // slide the 3-row window down by one row
+ vmov q9, q11
+ vmov q0, q1
+ vmov q10, q12
+ vmov q11, q13
+ vmov q1, q2
+ ble 4f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3b
+
+4:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ add3
+
+5: // End of one vertical slice.
+ subs r2, r2, #8 // 8 columns were finished
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ pop {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+ push {r4-r9,lr} // Vertical 5-row sums over sumsq/sum, written back in place to every second row
+ vpush {q5-q7}
+ ldr r4, [sp, #76] // r4 = edges (28 bytes of GPRs + 48 bytes of q5-q7 were pushed)
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #8 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 0f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Handle h+2 lines with the main loop
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub r3, r3, #1 // Handle h-1 lines with the main loop
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q6-q15 and q0-q3,q5 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q6, q7}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vmov q10, q6
+ vmov q11, q7
+ vmov q2, q0
+ vmov q12, q6
+ vmov q13, q7
+ vmov q3, q0
+
+3:
+ cmp r3, #0
+ beq 4f
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+
+3:
+ // Start of vertical loop
+ subs r3, r3, #2 // two input rows are consumed per iteration; the flags are consumed by the ble further down
+.macro add5
+ vadd.i32 q6, q6, q8
+ vadd.i32 q7, q7, q9
+ vadd.i16 q0, q0, q1
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i16 q0, q0, q2
+ vadd.i32 q6, q6, q12
+ vadd.i32 q7, q7, q13
+ vadd.i16 q0, q0, q3
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q15
+ vadd.i16 q0, q0, q5
+ vst1.32 {q6, q7}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add5
+.macro shift2
+ vmov q6, q10
+ vmov q7, q11
+ vmov q0, q2
+ vmov q8, q12
+ vmov q9, q13
+ vmov q1, q3
+ vmov q10, q14
+ vmov q11, q15
+ vmov q2, q5
+.endm
+ shift2
+ add r0, r0, r7 // skip one output row on top of the post-increment done by the stores in add5
+ add r1, r1, r8
+ ble 5f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ add5
+ b 6f
+
+5:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 6f
+ // !LR_HAVE_BOTTOM
+ cmp r3, #0
+ bne 5f
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ // Pad the past-edge row from the last content row.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // r3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ vmov q12, q10
+ vmov q13, q11
+ vmov q3, q2
+ vmov q14, q10
+ vmov q15, q11
+ vmov q5, q2
+ add5
+ add r0, r0, r7
+ add r1, r1, r8
+ b 6f
+
+6: // End of one vertical slice.
+ subs r2, r2, #8 // 8 columns were finished
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ vpop {q5-q7}
+ pop {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
+function sgr_calc_ab1_neon, export=1
+ push {r4-r7,lr} // Entry point that sets n = 9, then continues in sgr_calc_ab_neon
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84] // r4 = strength, r5 = bitdepth_max (20 bytes of GPRs + 64 bytes of q4-q7 were pushed)
+ add r3, r3, #2 // h += 2
+ clz r6, r5 // r6 = clz(bitdepth_max), turned into -bitdepth_min_8 by the shared code
+ vmov.i32 q15, #9 // n
+ movw r5, #455 // value the shared code uses as one_by_x
+ mov lr, #SUM_STRIDE // row stride in elements, used by the shared code to step between rows
+ b sgr_calc_ab_neon // the shared code also restores q4-q7/r4-r7 and returns
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ push {r4-r7,lr} // Entry point that sets n = 25 and handles (h + 3) / 2 rows
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84] // r4 = strength, r5 = bitdepth_max (20 bytes of GPRs + 64 bytes of q4-q7 were pushed)
+ add r3, r3, #3 // h += 3
+ clz r6, r5 // r6 = clz(bitdepth_max), turned into -bitdepth_min_8 by the shared code
+ asr r3, r3, #1 // h /= 2
+ vmov.i32 q15, #25 // n
+ mov r5, #164 // value the shared code uses as one_by_x
+ mov lr, #(2*SUM_STRIDE) // double row stride; no branch follows - NOTE(review): presumably execution falls through into sgr_calc_ab_neon, which comes next in the file
+endfunc
+
+function sgr_calc_ab_neon
+ movrel r12, X(sgr_x_by_x) // Shared body; expects r4 = strength, r5 = one_by_x, r6 = clz(bitdepth_max), q15 = n, lr = row stride
+ sub r6, r6, #24 // -bitdepth_min_8
+ vld1.8 {q8, q9}, [r12, :128]! // first 32 table entries
+ add r7, r6, r6 // -2*bitdepth_min_8
+ vmov.i8 q11, #5
+ vmov.i8 d10, #55 // idx of last 5
+ vld1.8 {q10}, [r12, :128] // next 16 table entries
+ vmov.i8 d11, #72 // idx of last 4
+ vmov.i8 d12, #101 // idx of last 3
+ vmov.i8 d13, #169 // idx of last 2
+ vmov.i8 d14, #254 // idx of last 1
+ vmov.i8 d15, #32 // elements consumed in first vtbl
+ add r2, r2, #2 // w += 2
+ add r12, r2, #7
+ bic r12, r12, #7 // aligned w
+ sub r12, lr, r12 // increment between rows
+ vdup.32 q12, r4 // q12 = strength (s)
+ sub r0, r0, #(4*(SUM_STRIDE))
+ sub r1, r1, #(2*(SUM_STRIDE))
+ mov r4, r2 // backup of w
+ vsub.i8 q8, q8, q11 // store the 48 loaded entries minus 5; the comparisons below add the 5 back stepwise
+ vsub.i8 q9, q9, q11
+ vsub.i8 q10, q10, q11
+1:
+ vld1.32 {q0, q1}, [r0, :128] // a
+ vld1.16 {q2}, [r1, :128] // b
+ vdup.32 q13, r7 // -2*bitdepth_min_8
+ vdup.16 q14, r6 // -bitdepth_min_8
+ subs r2, r2, #8 // 8 elements per iteration; the flags are consumed by the bgt after the stores
+ vrshl.s32 q0, q0, q13
+ vrshl.s32 q1, q1, q13
+ vrshl.s16 q4, q2, q14
+ vmul.i32 q0, q0, q15 // a * n
+ vmul.i32 q1, q1, q15 // a * n
+ vmull.u16 q3, d8, d8 // b * b
+ vmull.u16 q4, d9, d9 // b * b
+ vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
+ vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
+ vmul.i32 q0, q0, q12 // p * s
+ vmul.i32 q1, q1, q12 // p * s
+ vqshrn.u32 d0, q0, #16
+ vqshrn.u32 d1, q1, #16
+ vqrshrn.u16 d0, q0, #4 // imin(z, 255)
+
+ vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
+ vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
+ vtbl.8 d1, {q8, q9}, d0 // lookup for indices 0-31; out of range indices give 0
+ vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
+ vsub.i8 d9, d0, d15 // indices for vtbx
+ vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
+ vadd.i8 d2, d2, d3
+ vtbx.8 d1, {q10}, d9 // lookup for indices 32-47; other lanes keep their value
+ vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
+ vadd.i8 d6, d6, d7
+ vadd.i8 d8, d8, d22 // d22 is the low half of q11, i.e. the constant 5
+ vadd.i8 d2, d2, d6
+ vadd.i8 d1, d1, d8
+ vadd.i8 d1, d1, d2
+ vmovl.u8 q0, d1 // x
+
+ vmov.i16 q13, #256
+ vdup.32 q14, r5 // one_by_x
+
+ vmull.u16 q1, d0, d4 // x * BB[i]
+ vmull.u16 q2, d1, d5 // x * BB[i]
+ vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
+ vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
+ vrshr.s32 q1, q1, #12 // AA[i]
+ vrshr.s32 q2, q2, #12 // AA[i]
+ vsub.i16 q0, q13, q0 // 256 - x
+
+ vst1.32 {q1, q2}, [r0, :128]!
+ vst1.16 {q0}, [r1, :128]!
+ bgt 1b
+
+ subs r3, r3, #1 // one row finished
+ ble 0f
+ add r0, r0, r12, lsl #2
+ add r1, r1, r12, lsl #1
+ mov r2, r4
+ b 1b
+0:
+ vpop {q4-q7} // undoes the vpush/push done by the sgr_calc_ab1/ab2 entry points
+ pop {r4-r7,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/looprestoration_tmpl.S b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S
new file mode 100644
index 0000000000..8a9940bb3a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S
@@ -0,0 +1,600 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+//
+// For each output element, a weighted sum over a 3x3 neighbourhood is formed
+// in both coefficient planes: the centre and its four edge neighbours are
+// weighted by 4, the four diagonal neighbours by 3. The value stored to tmp is
+// (sum32 + sum16 * src + (1 << 8)) >> 9
+// where sum32 comes from the int32_t plane (argument a) and sum16 from the
+// int16_t plane (argument b). Note that the inline "-> a" / "-> b" comments
+// below name the sums the other way round from the argument names.
+// The first element loaded from each plane row is treated as the left
+// neighbour (column -1) of the first output element.
+//
+// Register use:
+// r0 tmp (output), r1 src, r2 src row skip
+// r3/r7/r8 int32_t plane, rows 0 / -1 / +1
+// r4/r9/r10 int16_t plane, rows 0 / -1 / +1
+// r5 remaining width, lr saved width, r6 remaining rows
+// r11 plane row skip (elements), r12 tmp row skip (elements)
+// Four output elements are produced per inner iteration.
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // r4 = b, r5 = w (stack args sit above the 36 + 64 bytes pushed)
+ ldr r6, [sp, #108] // r6 = h
+ sub r7, r3, #(4*SUM_STRIDE) // int32_t plane, row above
+ add r8, r3, #(4*SUM_STRIDE) // int32_t plane, row below
+ sub r9, r4, #(2*SUM_STRIDE) // int16_t plane, row above
+ add r10, r4, #(2*SUM_STRIDE) // int16_t plane, row below
+ mov r11, #SUM_STRIDE
+ mov r12, #FILTER_OUT_STRIDE
+ add lr, r5, #3
+ bic lr, lr, #3 // Aligned width
+.if \bpc == 8
+ sub r2, r2, lr // src stride minus the bytes consumed per row
+.else
+ sub r2, r2, lr, lsl #1 // src stride minus the bytes consumed per row
+.endif
+ sub r12, r12, lr
+ sub r11, r11, lr
+ sub r11, r11, #4 // We read 4 extra elements from both a and b
+ mov lr, r5 // keep w so r5 can be reloaded for every row
+ vmov.i16 q14, #3
+ vmov.i32 q15, #3
+1:
+ // Start of a row: preload 8 elements of each of the three rows of both planes.
+ vld1.16 {q0}, [r9, :128]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r10, :128]!
+ vld1.32 {q8, q9}, [r7, :128]!
+ vld1.32 {q10, q11}, [r3, :128]!
+ vld1.32 {q12, q13}, [r8, :128]!
+
+2:
+ subs r5, r5, #4 // flags are consumed by the "ble 3f" after the store
+ vext.8 d6, d0, d1, #2 // -stride
+ vext.8 d7, d2, d3, #2 // 0
+ vext.8 d8, d4, d5, #2 // +stride
+ vext.8 d9, d0, d1, #4 // +1-stride
+ vext.8 d10, d2, d3, #4 // +1
+ vext.8 d11, d4, d5, #4 // +1+stride
+ vadd.i16 d2, d2, d6 // -1, -stride
+ vadd.i16 d7, d7, d8 // 0, +stride
+ vadd.i16 d0, d0, d9 // -1-stride, +1-stride
+ vadd.i16 d2, d2, d7
+ vadd.i16 d4, d4, d11 // -1+stride, +1+stride
+ vadd.i16 d2, d2, d10 // +1
+ vadd.i16 d0, d0, d4 // d0 = sum of the four diagonals
+
+ vext.8 q3, q8, q9, #4 // -stride
+ vshl.i16 d2, d2, #2 // (centre + edges) * 4
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q10, q11, #4 // 0
+ vext.8 q6, q10, q11, #8 // +1
+ vmla.i16 d2, d0, d28 // * 3 -> a
+ vadd.i32 q3, q3, q10 // -stride, -1
+ vadd.i32 q8, q8, q4 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q6 // 0, +1
+ vadd.i32 q8, q8, q12 // -1+stride
+ vadd.i32 q3, q3, q5
+ vext.8 q7, q12, q13, #4 // +stride
+ vext.8 q10, q12, q13, #8 // +1+stride
+.if \bpc == 8
+ vld1.32 {d24[0]}, [r1, :32]! // src
+.else
+ vld1.16 {d24}, [r1, :64]! // src
+.endif
+ vadd.i32 q3, q3, q7 // +stride
+ vadd.i32 q8, q8, q10 // +1+stride
+ vshl.i32 q3, q3, #2 // (centre + edges) * 4
+ vmla.i32 q3, q8, q15 // * 3 -> b
+.if \bpc == 8
+ vmovl.u8 q12, d24 // src
+.endif
+ vmov d0, d1 // slide the 16-bit windows along by 4 elements
+ vmlal.u16 q3, d2, d24 // b + a * src
+ vmov d2, d3
+ vrshrn.i32 d6, q3, #9 // rounding shift, narrow to 16 bit
+ vmov d4, d5
+ vst1.16 {d6}, [r0]!
+
+ ble 3f
+ // Slide the 32-bit windows along (this also restores q10 and q12, which
+ // were used as scratch above) and fetch the next 4 elements of every row.
+ vmov q8, q9
+ vmov q10, q11
+ vmov q12, q13
+ vld1.16 {d1}, [r9, :64]!
+ vld1.16 {d3}, [r4, :64]!
+ vld1.16 {d5}, [r10, :64]!
+ vld1.32 {q9}, [r7, :128]!
+ vld1.32 {q11}, [r3, :128]!
+ vld1.32 {q13}, [r8, :128]!
+ b 2b
+
+3:
+ // End of a row: stop when h is exhausted, otherwise step every pointer
+ // to the start of the next row.
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r12, lsl #1
+ add r1, r1, r2
+ add r3, r3, r11, lsl #2
+ add r7, r7, r11, lsl #2
+ add r8, r8, r11, lsl #2
+ add r4, r4, r11, lsl #1
+ add r9, r9, r11, lsl #1
+ add r10, r10, r11, lsl #1
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+//
+// Output rows are produced in alternating flavours, eight elements at a time:
+// - loops 1/2: the row is built from the plane rows above and below it;
+// the two vertical neighbours are weighted by 6, the four diagonal
+// neighbours by 5, and the result is rounded and shifted right by 9.
+// - loop 4: the following row is built from a single plane row; the
+// centre is weighted by 6, the left/right neighbours by 5, and the
+// result is rounded and shifted right by 8.
+// In both cases the stored value is (sum32 + sum16 * src) rounded/shifted,
+// with sum32 from the int32_t plane (argument a) and sum16 from the int16_t
+// plane (argument b); the inline "-> a" / "-> b" comments name the sums the
+// other way round from the argument names.
+// h is decremented once per output row, so the function may stop after
+// either flavour.
+//
+// Register use:
+// r0 tmp (output), r1 src, r2 src row skip
+// r3/r7 int32_t plane, rows -1 / +1; r4/r8 int16_t plane, rows -1 / +1
+// r5 remaining width, lr saved width, r6 remaining rows
+// r9/r12 skips (elements) that move the int32_t/int16_t pointers on by two rows
+// r10 tmp row skip (elements), r11 aligned width
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // r4 = b, r5 = w (stack args sit above the 36 + 64 bytes pushed)
+ ldr r6, [sp, #108] // r6 = h
+ add r7, r3, #(4*(SUM_STRIDE)) // int32_t plane, row below
+ sub r3, r3, #(4*(SUM_STRIDE)) // int32_t plane, row above
+ add r8, r4, #(2*(SUM_STRIDE)) // int16_t plane, row below
+ sub r4, r4, #(2*(SUM_STRIDE)) // int16_t plane, row above
+ mov r9, #(2*SUM_STRIDE)
+ mov r10, #FILTER_OUT_STRIDE
+ add r11, r5, #7
+ bic r11, r11, #7 // Aligned width
+.if \bpc == 8
+ sub r2, r2, r11 // src stride minus the bytes consumed per row
+.else
+ sub r2, r2, r11, lsl #1 // src stride minus the bytes consumed per row
+.endif
+ sub r10, r10, r11
+ sub r9, r9, r11
+ sub r9, r9, #4 // We read 4 extra elements from a
+ sub r12, r9, #4 // We read 8 extra elements from b
+ mov lr, r5 // keep w so r5 can be reloaded for every row
+
+1:
+ // Start of a two-source-row output row: preload 16 int16_t and 12 int32_t
+ // elements from the rows above and below.
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.16 {q2, q3}, [r8, :128]!
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.32 {q11, q12}, [r7, :128]!
+ vld1.32 {q10}, [r3, :128]!
+ vld1.32 {q13}, [r7, :128]!
+
+2:
+ // q14/q15 hold the weights 5/6; they are set as 16-bit lanes here and
+ // re-set as 32-bit lanes further down, on every iteration.
+ vmov.i16 q14, #5
+ vmov.i16 q15, #6
+ subs r5, r5, #8 // flags are consumed by the "ble 3f" after the store
+ vext.8 q4, q0, q1, #4 // +1-stride
+ vext.8 q5, q2, q3, #4 // +1+stride
+ vext.8 q6, q0, q1, #2 // -stride
+ vext.8 q7, q2, q3, #2 // +stride
+ vadd.i16 q0, q0, q4 // -1-stride, +1-stride
+ vadd.i16 q5, q2, q5 // -1+stride, +1+stride
+ vadd.i16 q2, q6, q7 // -stride, +stride
+ vadd.i16 q0, q0, q5
+
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q9, q10, #8
+ vext.8 q6, q11, q12, #8 // +1+stride
+ vext.8 q7, q12, q13, #8
+ vmul.i16 q0, q0, q14 // * 5
+ vmla.i16 q0, q2, q15 // * 6
+ vadd.i32 q4, q4, q8 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q9
+ vadd.i32 q6, q6, q11 // -1+stride, +1+stride
+ vadd.i32 q7, q7, q12
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q7
+ vext.8 q6, q8, q9, #4 // -stride
+ vext.8 q7, q9, q10, #4
+ vext.8 q8, q11, q12, #4 // +stride
+ vext.8 q11, q12, q13, #4
+
+.if \bpc == 8
+ vld1.8 {d4}, [r1, :64]! // src
+.else
+ vld1.8 {q2}, [r1, :128]! // src
+.endif
+
+ vmov.i32 q14, #5
+ vmov.i32 q15, #6
+
+ vadd.i32 q6, q6, q8 // -stride, +stride
+ vadd.i32 q7, q7, q11
+ vmul.i32 q4, q4, q14 // * 5
+ vmla.i32 q4, q6, q15 // * 6
+ vmul.i32 q5, q5, q14 // * 5
+ vmla.i32 q5, q7, q15 // * 6
+
+.if \bpc == 8
+ vmovl.u8 q2, d4
+.endif
+ vmlal.u16 q4, d0, d4 // b + a * src
+ vmlal.u16 q5, d1, d5 // b + a * src
+ vmov q0, q1 // slide the 16-bit windows along by 8 elements
+ vrshrn.i32 d8, q4, #9
+ vrshrn.i32 d9, q5, #9
+ vmov q2, q3
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 3f
+ // Slide the 32-bit windows along (q8 and q11 were used as scratch above)
+ // and fetch the next 8 elements of every row.
+ vmov q8, q10
+ vmov q11, q13
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q3}, [r8, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ vld1.32 {q12, q13}, [r7, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ // Move all four plane pointers on by two rows; r3/r4 then address the
+ // single row used by loop 4.
+ add r3, r3, r9, lsl #2
+ add r7, r7, r9, lsl #2
+ add r4, r4, r12, lsl #1
+ add r8, r8, r12, lsl #1
+
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.32 {q10}, [r3, :128]!
+
+ // 16-bit weights for loop 4; the 32-bit weights 5/6 are still in q14/q15
+ // from the last iteration of loop 2.
+ vmov.i16 q12, #5
+ vmov.i16 q13, #6
+
+4:
+ subs r5, r5, #8 // flags are consumed by the "ble 5f" after the store
+ vext.8 q3, q0, q1, #4 // +1
+ vext.8 q2, q0, q1, #2 // 0
+ vadd.i16 q0, q0, q3 // -1, +1
+
+ vext.8 q4, q8, q9, #4 // 0
+ vext.8 q5, q9, q10, #4
+ vext.8 q6, q8, q9, #8 // +1
+ vext.8 q7, q9, q10, #8
+ vmul.i16 q2, q2, q13 // * 6
+ vmla.i16 q2, q0, q12 // * 5 -> a
+.if \bpc == 8
+ vld1.8 {d22}, [r1, :64]! // src
+.else
+ vld1.16 {q11}, [r1, :128]! // src
+.endif
+ vadd.i32 q8, q8, q6 // -1, +1
+ vadd.i32 q9, q9, q7
+.if \bpc == 8
+ vmovl.u8 q11, d22
+.endif
+ vmul.i32 q4, q4, q15 // * 6
+ vmla.i32 q4, q8, q14 // * 5 -> b
+ vmul.i32 q5, q5, q15 // * 6
+ vmla.i32 q5, q9, q14 // * 5 -> b
+
+ vmlal.u16 q4, d4, d22 // b + a * src
+ vmlal.u16 q5, d5, d23
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+ vmov q8, q10
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 5f
+ vld1.16 {q1}, [r4, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ b 4b
+
+5:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
+ sub r4, r4, r11, lsl #1
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ sub r3, r3, #16 // undo the extra 4 int32_t elements read
+ sub r4, r4, #16 // undo the extra 8 int16_t elements read
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt, const int bitdepth_max);
+//
+// Per pixel:
+// u = src << 4
+// v = (u << 7) + wt * (t1 - u)
+// dst = saturate((v + (1 << 10)) >> 11)
+// For 16 bpc the result is additionally limited to bitdepth_max; the 8 bpc
+// variant never loads that argument.
+// Loop 1 handles two rows at once, loop 2 handles a single (last) row.
+// Eight pixels are processed per inner iteration.
+//
+// Register use:
+// r0/r9 dst rows 0/1, r2/r12 src rows 0/1, r4/lr t1 rows 0/1
+// r1/r3 dst/src skip after a row pair, r7 t1 skip after a row pair
+// r5 remaining width, r8 saved width, r6 remaining rows
+// d31 wt, q14 bitdepth_max (16 bpc only)
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28] // r4 = t1, r5 = w (stack args sit above the 28 bytes pushed)
+ ldrd r6, r7, [sp, #36] // r6 = h, r7 = wt
+.if \bpc == 16
+ ldr r8, [sp, #44] // r8 = bitdepth_max
+.endif
+ vdup.16 d31, r7
+ cmp r6, #2 // flags are kept alive until the "blt 2f" below
+.if \bpc == 16
+ vdup.16 q14, r8
+.endif
+ add r9, r0, r1 // second dst row
+ add r12, r2, r3 // second src row
+ add lr, r4, #2*FILTER_OUT_STRIDE // second t1 row
+ mov r7, #(4*FILTER_OUT_STRIDE) // two t1 rows, in bytes
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r8, r5, #7
+ bic r8, r8, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r8
+ sub r3, r3, r8
+.else
+ sub r1, r1, r8, lsl #1
+ sub r3, r3, r8, lsl #1
+.endif
+ sub r7, r7, r8, lsl #1
+ mov r8, r5 // keep w so r5 can be reloaded for every row
+ blt 2f // h < 2: only the single-row loop is needed
+1:
+ // Two rows at a time.
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r12, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r12, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [lr, :128]!
+ subs r5, r5, #8 // flags are consumed by the "bgt 1b" after the stores
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vshll.u16 q10, d16, #7 // u << 7
+ vshll.u16 q11, d17, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+ vmlal.s16 q10, d18, d31 // v
+ vmlal.s16 q11, d19, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vrshrn.i32 d20, q10, #11
+ vrshrn.i32 d21, q11, #11
+ vqmovun.s16 d4, q2 // saturate to 0..255
+ vqmovun.s16 d20, q10
+ vst1.8 {d4}, [r0, :64]!
+ vst1.8 {d20}, [r9, :64]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqrshrun.s32 d20, q10, #11
+ vqrshrun.s32 d21, q11, #11
+ vmin.u16 q2, q2, q14 // limit to bitdepth_max
+ vmin.u16 q10, q10, q14
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q10}, [r9, :128]!
+.endif
+ bgt 1b
+
+ // A row pair is done: 0 rows left -> return, exactly 1 left -> loop 2,
+ // otherwise another pair. The flags from the cmp survive the mov/add
+ // instructions below.
+ sub r6, r6, #2
+ cmp r6, #1
+ blt 0f
+ mov r5, r8
+ add r0, r0, r1
+ add r9, r9, r1
+ add r2, r2, r3
+ add r12, r12, r3
+ add r4, r4, r7
+ add lr, lr, r7
+ beq 2f
+ b 1b
+
+2:
+ // Single row.
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vqmovun.s16 d2, q2 // saturate to 0..255
+ vst1.8 {d2}, [r0, :64]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vmin.u16 q2, q2, q14 // limit to bitdepth_max
+ vst1.16 {q2}, [r0, :128]!
+.endif
+ bgt 2b
+0:
+ pop {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
+//
+// Per pixel:
+// u = src << 4
+// v = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u)
+// dst = saturate((v + (1 << 10)) >> 11)
+// For 16 bpc the result is additionally limited to bitdepth_max; the 8 bpc
+// variant never loads that argument.
+// Loop 1 handles two rows at once, loop 2 handles a single (last) row.
+// Eight pixels are processed per inner iteration.
+//
+// Register use:
+// r0/r10 dst rows 0/1, r2/r11 src rows 0/1
+// r4/r12 t1 rows 0/1, r5/lr t2 rows 0/1
+// r1/r3 dst/src skip after a row pair, r8 t1/t2 skip after a row pair
+// r6 remaining width, r9 saved width, r7 remaining rows
+// d30/d31 wt[0]/wt[1], q14 bitdepth_max (16 bpc only)
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36] // r4 = t1, r5 = t2 (stack args sit above the 36 bytes pushed)
+ ldrd r6, r7, [sp, #44] // r6 = w, r7 = h
+.if \bpc == 8
+ ldr r8, [sp, #52] // r8 = wt
+.else
+ ldrd r8, r9, [sp, #52] // r8 = wt, r9 = bitdepth_max
+.endif
+ cmp r7, #2 // flags are kept alive until the "blt 2f" below
+ add r10, r0, r1 // second dst row
+ add r11, r2, r3 // second src row
+ add r12, r4, #2*FILTER_OUT_STRIDE // second t1 row
+ add lr, r5, #2*FILTER_OUT_STRIDE // second t2 row
+ vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
+.if \bpc == 16
+ vdup.16 q14, r9
+.endif
+ mov r8, #4*FILTER_OUT_STRIDE // two t1/t2 rows, in bytes
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r9, r6, #7
+ bic r9, r9, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r9
+ sub r3, r3, r9
+.else
+ sub r1, r1, r9, lsl #1
+ sub r3, r3, r9, lsl #1
+.endif
+ sub r8, r8, r9, lsl #1
+ mov r9, r6 // keep w so r6 can be reloaded for every row
+ blt 2f // h < 2: only the single-row loop is needed
+1:
+ // Two rows at a time.
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r11, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r11, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [r12, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ vld1.16 {q10}, [lr, :128]!
+ subs r6, r6, #8 // flags are consumed by the "bgt 1b" after the stores
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vsub.i16 q10, q10, q8 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vshll.u16 q11, d16, #7 // u << 7
+ vshll.u16 q8, d17, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vrshrn.i32 d22, q11, #11
+ vrshrn.i32 d23, q8, #11
+ vqmovun.s16 d6, q3 // saturate to 0..255
+ vqmovun.s16 d22, q11
+ vst1.8 {d6}, [r0, :64]!
+ vst1.8 {d22}, [r10, :64]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vqrshrun.s32 d22, q11, #11
+ vqrshrun.s32 d23, q8, #11
+ vmin.u16 q3, q3, q14 // limit to bitdepth_max
+ vmin.u16 q11, q11, q14
+ vst1.16 {q3}, [r0, :128]!
+ vst1.16 {q11}, [r10, :128]!
+.endif
+ bgt 1b
+
+ // A row pair is done: 0 rows left -> return, exactly 1 left -> loop 2,
+ // otherwise another pair. The flags from the cmp survive the mov/add
+ // instructions below, so the subtraction itself need not set flags.
+ sub r7, r7, #2
+ cmp r7, #1
+ blt 0f
+ mov r6, r9
+ add r0, r0, r1
+ add r10, r10, r1
+ add r2, r2, r3
+ add r11, r11, r3
+ add r4, r4, r8
+ add r12, r12, r8
+ add r5, r5, r8
+ add lr, lr, r8
+ beq 2f
+ b 1b
+
+2:
+ // Single row.
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vqmovun.s16 d6, q3 // saturate to 0..255
+ vst1.8 {d6}, [r0, :64]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vmin.u16 q3, q3, q14 // limit to bitdepth_max
+ vst1.16 {q3}, [r0, :128]!
+.endif
+ // Stay in the single-row loop for the rest of this row. Branching back
+ // to 1b here would run the two-row loop on a row that does not exist,
+ // reading src/t1/t2 and writing dst one row past the end.
+ bgt 2b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S
new file mode 100644
index 0000000000..1b60a7bdb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc.S
@@ -0,0 +1,3340 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// avg: plain average of two 16-bit intermediate buffers, 16 pixels per use.
+// Loads 16 int16 values from [r2] and from [r3] (both post-incremented),
+// adds them, and narrows with rounding and unsigned saturation:
+// dst = sat_u8((a + b + 16) >> 5)
+// \dst0/\dst1 are D registers receiving 8 pixels each; \t0-\t3 are Q scratch.
+.macro avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vadd.i16 \t0, \t0, \t2
+ vadd.i16 \t1, \t1, \t3
+ vqrshrun.s16 \dst0, \t0, #5
+ vqrshrun.s16 \dst1, \t1, #5
+.endm
+
+// w_avg: weighted average of two 16-bit intermediate buffers, 16 pixels per
+// use. Loads 16 int16 values from [r2] (a) and [r3] (b), both post-incremented.
+// q15 must hold the 16-bit multiplier set up by bidir_fn for this type
+// (-weight << 11). The result is
+// dst = sat_u8((b + vqdmulh(b - a, q15) + 8) >> 4)
+// \dst0/\dst1 are D registers receiving 8 pixels each; \t0-\t3 are Q scratch.
+.macro w_avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vsub.i16 \t0, \t2, \t0 // b - a
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q15 // high half of 2 * (b - a) * q15
+ vqdmulh.s16 \t1, \t1, q15
+ vadd.i16 \t0, \t2, \t0 // b + scaled difference
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
+// mask: per-pixel masked blend of two 16-bit intermediate buffers, 16 pixels
+// per use. Loads 16 mask bytes from [lr] and 16 int16 values from each of
+// [r2] (a) and [r3] (b); all three pointers are post-incremented.
+// q15 must hold the byte constant set up by bidir_fn for this type (#256-2,
+// i.e. -2 as a signed byte). Each mask byte is multiplied by it and moved to
+// the top byte of a 16-bit lane, giving the vqdmulh multiplier. The result is
+// dst = sat_u8((b + vqdmulh(b - a, multiplier) + 8) >> 4)
+// Clobbers q13 and q14. \dst0/\dst1 are D registers receiving 8 pixels each;
+// \t0-\t3 are Q scratch.
+.macro mask dst0, dst1, t0, t1, t2, t3
+ vld1.8 {q14}, [lr, :128]!
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vmul.i8 q14, q14, q15 // mask byte * q15 (modulo 256)
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vshll.i8 q13, d28, #8 // widen, product ends up in the top byte
+ vshll.i8 q14, d29, #8
+ vsub.i16 \t0, \t2, \t0 // b - a
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q13
+ vqdmulh.s16 \t1, \t1, q14
+ vadd.i16 \t0, \t2, \t0 // b + scaled difference
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
+// bidir_fn: emits the exported function \type\()_8bpc_neon, where \type is
+// one of the 16-pixel kernel macros avg, w_avg or mask.
+//
+// Register use:
+// r0 destination, r1 destination stride
+// r2, r3 the two int16 source buffers consumed by the kernel macro
+// r4 width from the stack, turned into a jump table index via clz
+// r5 height from the stack, used as the row counter
+// lr third stack argument (not loaded for avg): the weight for w_avg,
+// the mask pointer for mask
+// r6 second destination pointer
+// The width selects one of six store loops (128, 64, 32, 16, 8, 4 pixels
+// wide). The first 16 pixels are computed before the jump, and each loop
+// computes the next 16 pixels before branching back, so the loops start with
+// q8 (d16/d17) already filled.
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ push {r4-r6,lr}
+ ldrd r4, r5, [sp, #16] // r4 = width, r5 = height (stack args sit above the 16 bytes pushed)
+ clz r4, r4
+.ifnc \type, avg
+ ldr lr, [sp, #24]
+.endif
+.ifc \type, w_avg
+ vdup.s16 q15, lr
+ vneg.s16 q15, q15
+ vshl.i16 q15, q15, #11 // q15 = -weight << 11
+.endif
+.ifc \type, mask
+ vmov.i8 q15, #256-2 // -2 as a signed byte
+.endif
+ adr r12, L(\type\()_tbl)
+ sub r4, r4, #24 // clz(128) == 24 maps to table entry 0
+ ldr r4, [r12, r4, lsl #2]
+ \type d16, d17, q0, q1, q2, q3
+ add r12, r12, r4
+ bx r12
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(\type\()_tbl) + CONFIG_THUMB
+
+4:
+ // Width 4: 16 pixels are four rows. Handles heights 4, 8 and 16 without
+ // a loop; r0 and r6 address alternating rows.
+ add r6, r0, r1
+ lsl r1, r1, #1
+ cmp r5, #4
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ beq 0f
+ \type d18, d19, q0, q1, q2, q3
+ cmp r5, #8
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ beq 0f
+ \type d16, d17, q0, q1, q2, q3
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ pop {r4-r6,pc}
+80:
+ // Width 8: four rows per iteration, r0/r6 address alternating rows.
+ add r6, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.8 {d16}, [r0, :64], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {d17}, [r6, :64], r1
+ vst1.8 {d18}, [r0, :64], r1
+ subs r5, r5, #4
+ vst1.8 {d19}, [r6, :64], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 8b
+160:
+ // Width 16: four rows per iteration, r0/r6 address alternating rows.
+ add r6, r0, r1
+ lsl r1, r1, #1
+16:
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {q8}, [r0, :128], r1
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q9}, [r6, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q10}, [r0, :128], r1
+ subs r5, r5, #4
+ vst1.8 {q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 16b
+320:
+ // Width 32: two rows per iteration, r0/r6 address alternating rows.
+ add r6, r0, r1
+ lsl r1, r1, #1
+32:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 32b
+640:
+ // Width 64: two rows per iteration; r0 addresses the left 32 pixels of
+ // a row and r6 the right 32.
+ add r6, r0, #32
+64:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r6, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 64b
+1280:
+ // Width 128: one row per iteration; r0 addresses the left 64 pixels of
+ // a row and r6 the right 64. Each half is written as two 32-byte stores,
+ // the first post-incrementing by 32, hence the stride reduced by 32.
+ sub r1, r1, #32
+ add r6, r0, #64
+128:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128]!
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r0, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r6, :128]!
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #1
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 128b
+
+0:
+ pop {r4-r6,pc}
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
+// w_mask_fn: emits the exported function w_mask_\type\()_8bpc_neon for
+// \type = 444, 422 or 420.
+//
+// Blends two int16 intermediate buffers with a weight derived from their
+// absolute difference, and also writes the derived mask:
+// 64 - m = sat_u16(6903 - abs(tmp1 - tmp2)) >> 8
+// dst = sat_u8((tmp1 + (((tmp2 - tmp1) * (64 - m) << 9) >> 15) + 8) >> 4)
+// The mask output depends on \type:
+// 444 one byte m per pixel
+// 422 one byte per horizontal pixel pair, derived from (129 - sign)
+// 420 one byte per 2x2 pixel block, derived from (256 - sign)
+//
+// Register use:
+// r0 dst, r1 dst stride, r12 second dst row
+// r2 tmp1, r3 tmp2
+// r4 width from the stack, r5 height from the stack (row counter)
+// r6 mask output pointer from the stack, r7 sign from the stack
+// q14 the constant 6903, q15/d30 the type specific mask constant
+// The width selects a loop via a clz indexed jump table: widths 4 and 8
+// have dedicated loops, widths 16 and up share one.
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28] // r4 = width, r5 = height (stack args sit above the 28 bytes pushed)
+ ldrd r6, r7, [sp, #36] // r6 = mask output, r7 = sign
+ clz r8, r4
+ adr r9, L(w_mask_\type\()_tbl)
+ sub r8, r8, #24 // clz(128) == 24 maps to table entry 0
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ movw r12, #6903
+ vdup.16 q14, r12
+.if \type == 444
+ vmov.i8 q15, #64
+.elseif \type == 422
+ vdup.8 d0, r7 // d0[] <- sign
+ vmov.i8 d30, #129
+ vsub.i8 d30, d30, d0 // 129 - sign
+.elseif \type == 420
+ vdup.16 q0, r7 // d0[] <- sign
+ vmov.i16 q15, #256
+ vsub.i16 q15, q15, q0 // 256 - sign
+.endif
+ add r12, r0, r1 // second dst row
+ lsl r1, r1, #1
+ bx r9
+
+ .align 2
+L(w_mask_\type\()_tbl):
+ .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4:
+ // Width 4: four rows per iteration.
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once)
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once)
+ subs r5, r5, #4
+ vsub.i16 q8, q2, q0 // tmp2-tmp1
+ vsub.i16 q9, q3, q1
+ vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x]))
+ vabd.s16 q11, q1, q3
+ vqsub.u16 q10, q14, q10 // 6903 - abs ()
+ vqsub.u16 q11, q14, q11
+ vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8
+ vshr.s16 q11, q11, #8
+ vshl.s16 q12, q10, #9 // (64-m)<<9
+ vshl.s16 q13, q11, #9
+ vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15
+ vqdmulh.s16 q13, q13, q9
+ vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1
+ vadd.i16 q13, q13, q1
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - m
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // m
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vmovn.s16 d6, q10
+ vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d6}, [r6, :64]!
+.elseif \type == 420
+ vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.s16 d21, d22, d23
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d20[0]}, [r6, :32]!
+.endif
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r12, :32], r1
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r9,pc}
+8:
+ // Width 8: two rows per iteration.
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2
+ subs r5, r5, #2
+ vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
+ vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
+ vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1)
+ vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2)
+ vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
+ vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2)
+ vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
+ vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8
+ vshl.s16 q12, q10, #9 // (64 - my1) << 9
+ vshl.s16 q13, q11, #9 // (64 - my2) << 9
+ vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+ vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+ vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+ vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - m
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // m
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
+ vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
+ vmovn.s16 d20, q10
+ vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
+ vst1.8 {d20}, [r6, :64]!
+.elseif \type == 420
+ vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d20[0]}, [r6, :32]!
+.endif
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d25}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r9,pc}
+1280:
+640:
+320:
+160:
+ // Widths 16 and up: two rows per outer iteration, 16 pixels of both rows
+ // per inner iteration. r7/r9 address the second row of tmp1/tmp2, and for
+ // 444/422 lr addresses the second row of the mask output.
+ sub r1, r1, r4 // 2 * stride minus the bytes written per row
+.if \type == 444
+ add lr, r6, r4
+.elseif \type == 422
+ add lr, r6, r4, lsr #1
+.endif
+ add r9, r3, r4, lsl #1
+ add r7, r2, r4, lsl #1
+161:
+ mov r8, r4 // r8 = remaining width in this row pair
+16:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
+ subs r8, r8, #16
+ vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
+ vsub.i16 q3, q3, q1
+ vabs.s16 q10, q2 // abs(tmp2y1 - tmp1y1)
+ vabs.s16 q11, q3
+ vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
+ vqsub.u16 q11, q14, q11
+ vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
+ vshr.s16 q11, q11, #8
+ vshl.s16 q12, q10, #9 // (64 - my1) << 9
+ vshl.s16 q13, q11, #9
+ vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+ vqdmulh.s16 q13, q13, q3
+ vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+ vadd.i16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2y2
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - my1
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // my1
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vmovn.s16 d20, q10
+ vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
+ vst1.8 {d20}, [r6, :64]!
+.endif
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+ vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
+ vsub.i16 q1, q1, q9
+ vst1.16 {d24, d25}, [r0, :128]! // store dsty1
+ vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
+ vabs.s16 q3, q1
+ vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
+ vqsub.u16 q3, q14, q3
+ vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8
+ vshr.s16 q3, q3, #8
+ vshl.s16 q12, q2, #9 // (64 - my2) << 9
+ vshl.s16 q13, q3, #9
+.if \type == 444
+ vmovn.u16 d4, q2 // 64 - my2
+ vmovn.u16 d5, q3
+ vsub.i8 q2, q15, q2 // my2
+ vst1.8 {d4, d5}, [lr, :128]!
+.elseif \type == 422
+ vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
+ vpadd.s16 d5, d6, d7
+ vmovn.s16 d4, q2
+ vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
+ vst1.8 {d4}, [lr, :64]!
+.elseif \type == 420
+ vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.s16 q11, q11, q3
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.8 {d20}, [r6, :64]!
+.endif
+ vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+ vqdmulh.s16 q13, q13, q1
+ vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+ vadd.i16 q13, q13, q9
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+ vst1.16 {d24, d25}, [r12, :128]! // store dsty2
+ bgt 16b
+ // Row pair done. Every input pointer has walked one row, so skip one more
+ // row to reach the next pair; same for the 444/422 mask pointers.
+ subs r5, r5, #2 // flags are consumed by the "bgt 161b" below
+ add r2, r2, r4, lsl #1
+ add r3, r3, r4, lsl #1
+ add r7, r7, r4, lsl #1
+ add r9, r9, r4, lsl #1
+.if \type == 444
+ add r6, r6, r4
+ add lr, lr, r4
+.elseif \type == 422
+ add r6, r6, r4, lsr #1
+ add lr, lr, r4, lsr #1
+.endif
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 161b
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
+// blend_8bpc_neon: in-place blend of an 8-bit buffer into the destination
+// using a per-pixel mask:
+// dst = (a * m + dst * (64 - m) + 32) >> 6
+// where a is read from [r2] and m from the mask pointer.
+//
+// Register use:
+// r0 dst (read and written), r1 dst stride
+// r2 second source, read contiguously
+// r3 width, turned into a jump table index via clz (32, 16, 8, 4)
+// r4 height from the stack (row counter), r5 mask pointer from the stack
+// r12 second dst row (widths 4 to 16)
+// Widths 4, 8 and 16 process two rows per iteration, width 32 one row.
+function blend_8bpc_neon, export=1
+ push {r4-r5,lr}
+ ldrd r4, r5, [sp, #12] // r4 = height, r5 = mask (stack args sit above the 12 bytes pushed)
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26 // clz(32) == 26 maps to table entry 0
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
+
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld1.u8 {d2}, [r5, :64]! // mask, two rows of 4
+ vld1.u8 {d1}, [r2, :64]! // a, two rows of 4
+ vld1.32 {d0[]}, [r0, :32] // dst row 0
+ subs r4, r4, #2
+ vld1.32 {d0[1]}, [r12, :32] // dst row 1
+ vsub.i8 d3, d22, d2 // 64 - m
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d3
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ vmov.i8 d16, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.u8 {q1}, [r5, :128]! // mask, two rows of 8
+ vld1.u8 {q2}, [r2, :128]! // a, two rows of 8
+ vld1.u8 {d0}, [r0, :64] // dst row 0
+ vsub.i8 d17, d16, d2 // 64 - m
+ vld1.u8 {d1}, [r12, :64] // dst row 1
+ subs r4, r4, #2
+ vsub.i8 d18, d16, d3 // 64 - m
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d18
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0, :64], r1
+ vst1.u8 {d23}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.u8 {q1, q2}, [r5, :128]! // mask, two rows of 16
+ vld1.u8 {q8, q9}, [r2, :128]! // a, two rows of 16
+ vld1.u8 {q0}, [r0, :128] // dst row 0
+ subs r4, r4, #2
+ vsub.i8 q15, q12, q1 // 64 - m (row 0)
+ vld1.u8 {q13}, [r12, :128] // dst row 1
+ vmull.u8 q3, d16, d2
+ vmlal.u8 q3, d0, d30
+ vmull.u8 q14, d17, d3
+ vmlal.u8 q14, d1, d31
+ vsub.i8 q15, q12, q2 // 64 - m (row 1)
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q14, #6
+ vmull.u8 q3, d18, d4
+ vmlal.u8 q3, d26, d30
+ vmull.u8 q14, d19, d5
+ vmlal.u8 q14, d27, d31
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q14, #6
+ vst1.u8 {q10}, [r0, :128], r1
+ vst1.u8 {q11}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+ vmov.i8 q10, #64
+32:
+ vld1.u8 {q2, q3}, [r5, :128]! // mask, one row of 32
+ vld1.u8 {q8, q9}, [r2, :128]! // a, one row of 32
+ vld1.u8 {q0, q1}, [r0, :128] // dst
+ subs r4, r4, #1
+ vsub.i8 q11, q10, q2 // 64 - m (left half)
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vsub.i8 q11, q10, q3 // 64 - m (right half)
+ vrshrn.i16 d24, q15, #6
+ vrshrn.i16 d25, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d22
+ vmull.u8 q14, d19, d7
+ vmlal.u8 q14, d3, d23
+ vrshrn.i16 d26, q15, #6
+ vrshrn.i16 d27, q14, #6
+ vst1.u8 {q12, q13}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_h_8bpc_neon, export=1 // dst = (tmp*m + dst*(64-m) + 32) >> 6, with one obmc_masks byte m per row
+ push {r4-r5,lr} // in: r0 = dst, r1 = dst stride, r2 = tmp (read contiguously), r3 = w (picks the code path)
+ ldr r4, [sp, #12] // first stack argument (12 bytes were just pushed); presumably h
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4 // r5 = obmc_masks + r4: first per-row mask byte
+ sub r4, r4, r4, lsr #2 // row counter = r4 - r4/4: only that many leading rows are blended
+ clz lr, r3
+ adr r12, L(blend_h_tbl)
+ sub lr, lr, #24 // table index = clz(w) - 24: 0 for w=128 ... 6 for w=2
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr // table entries are offsets relative to the table itself
+ bx r12
+
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20: // w = 2: two rows per iteration
+ vmov.i8 d22, #64
+ add r12, r0, r1 // r12 = second row of dst
+ lsl r1, r1, #1 // both row pointers step two rows at a time
+2:
+ vld1.16 {d2[], d3[]}, [r5, :16]! // mask bytes m0,m1 of the two rows, replicated across d2 and d3
+ vld1.32 {d1[]}, [r2, :32]! // 4 tmp bytes = 2 rows x 2 pixels
+ subs r4, r4, #2
+ vld1.16 {d0[]}, [r0, :16] // dst row 0
+ vzip.8 d2, d3 // d2 = m0,m0,m1,m1,...: mask lined up with the row layout of d0/d1
+ vsub.i8 d4, d22, d2 // 64 - m
+ vld1.16 {d0[1]}, [r12, :16] // dst row 1 into 16-bit lane 1
+ vmull.u8 q8, d1, d2 // tmp * m
+ vmlal.u8 q8, d0, d4 // + dst * (64 - m)
+ vrshrn.i16 d20, q8, #6 // rounding shift right by 6, narrow back to 8 bit
+ vst1.16 {d20[0]}, [r0, :16], r1
+ vst1.16 {d20[1]}, [r12, :16], r1
+ bgt 2b
+ pop {r4-r5,pc}
+40: // w = 4: two rows per iteration
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld2.u8 {d2[], d3[]}, [r5, :16]! // d2 = m0 in every lane, d3 = m1 in every lane
+ vld1.u8 {d1}, [r2, :64]! // 8 tmp bytes = 2 rows x 4 pixels
+ subs r4, r4, #2
+ vext.u8 d2, d2, d3, #4 // d2 = m0 x4 followed by m1 x4
+ vld1.32 {d0[]}, [r0, :32] // dst row 0
+ vsub.i8 d6, d22, d2 // 64 - m
+ vld1.32 {d0[1]}, [r12, :32] // dst row 1 into 32-bit lane 1
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d6
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80: // w = 8: two rows per iteration
+ vmov.i8 q8, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld2.u8 {d2[], d3[]}, [r5, :16]! // d2 = mask of row 0, d3 = mask of row 1
+ vld1.u8 {d4, d5}, [r2, :128]! // tmp row 0 in d4, row 1 in d5
+ vld1.u8 {d0}, [r0, :64]
+ vsub.i8 q9, q8, q1 // d18 = 64 - m0, d19 = 64 - m1
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d18
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d19
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0, :64], r1
+ vst1.u8 {d23}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160: // w = 16: two rows per iteration
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld2.u8 {d28[], d29[]}, [r5, :16]! // d28 = mask of row 0, d29 = mask of row 1
+ vld1.u8 {d2, d3, d4, d5}, [r2, :128]! // tmp row 0 in d2-d3, row 1 in d4-d5
+ vsub.i8 q15, q12, q14 // d30 = 64 - m0, d31 = 64 - m1
+ vld1.u8 {q0}, [r0, :128] // dst row 0
+ subs r4, r4, #2
+ vld1.u8 {q13}, [r12, :128] // dst row 1
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d0, d30
+ vmull.u8 q8, d3, d28
+ vmlal.u8 q8, d1, d30
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d4, d29
+ vmlal.u8 q3, d26, d31
+ vmull.u8 q8, d5, d29
+ vmlal.u8 q8, d27, d31
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {q9}, [r0, :128], r1
+ vst1.u8 {q10}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+640:
+1280: // w = 32, 64, 128: one row per outer iteration, 32 pixels per inner iteration
+ vmov.i8 d20, #64
+ sub r1, r1, r3 // r0 advances by w inside a row, so the row step becomes stride - w
+321:
+ vld1.u8 {d6[]}, [r5]! // mask byte of this row in every lane
+ vsub.i8 d7, d20, d6 // 64 - m
+ mov r12, r3 // r12 = pixels left in this row
+32:
+ vld1.u8 {q8, q9}, [r2, :128]! // 32 tmp pixels
+ vld1.u8 {q0, q1}, [r0, :128] // 32 dst pixels
+ vmull.u8 q15, d16, d6
+ vmlal.u8 q15, d0, d7
+ vmull.u8 q14, d17, d6
+ vmlal.u8 q14, d1, d7
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d7
+ vmull.u8 q14, d19, d6
+ vmlal.u8 q14, d3, d7
+ vrshrn.i16 d2, q15, #6
+ vrshrn.i16 d3, q14, #6
+ subs r12, r12, #32
+ vst1.u8 {q0, q1}, [r0, :128]!
+ bgt 32b
+ add r0, r0, r1 // move to the start of the next row
+ subs r4, r4, #1
+ bgt 321b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_v_8bpc_neon, export=1 // dst = (tmp*m + dst*(64-m) + 32) >> 6, with one obmc_masks byte m per column
+ push {r4,lr} // in: r0 = dst, r1 = dst stride, r2 = tmp (read contiguously), r3 = w (picks the code path)
+ ldr r4, [sp, #8] // first stack argument (8 bytes were just pushed), used as row counter; presumably h
+ movrel lr, X(obmc_masks)
+ add lr, lr, r3 // lr = obmc_masks + w: the per-column masks for this width
+ clz r12, r3
+ adr r3, L(blend_v_tbl) // w is no longer needed, r3 is reused as the table base
+ sub r12, r12, #26 // table index = clz(w) - 26: 0 for w=32 ... 4 for w=2
+ ldr r12, [r3, r12, lsl #2]
+ add r3, r3, r12 // table entries are offsets relative to the table itself
+ bx r3
+
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20: // w = 2: two rows per iteration, only 1 pixel stored per row
+ vmov.i8 d22, #64
+ vld1.8 {d2[]}, [lr] // single mask byte, replicated
+ add r12, r0, r1 // r12 = second row of dst
+ lsl r1, r1, #1
+ vsub.i8 d3, d22, d2 // 64 - m
+2:
+ vld1.16 {d1[0]}, [r2, :16]! // tmp row 0 (2 bytes), r2 += 2
+ vld1.8 {d0[]}, [r0] // first dst pixel of row 0
+ subs r4, r4, #2
+ vld1.8 {d1[1]}, [r2] // byte lane 1 = first tmp pixel of row 1
+ vld1.8 {d0[1]}, [r12] // byte lane 1 = first dst pixel of row 1
+ vmull.u8 q2, d1, d2
+ vmlal.u8 q2, d0, d3
+ vrshrn.i16 d6, q2, #6
+ add r2, r2, #2 // step past tmp row 1
+ vst1.8 {d6[0]}, [r0], r1
+ vst1.8 {d6[1]}, [r12], r1
+ bgt 2b
+ pop {r4,pc}
+40: // w = 4: two rows per iteration, 3 pixels stored per row
+ vmov.i8 d22, #64
+ vld1.32 {d4[]}, [lr, :32] // 4 mask bytes, repeated so they line up with both rows
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d5, d22, d4 // 64 - m
+ sub r1, r1, #2 // compensates for the 2-byte post-increment of the first store per row
+4:
+ vld1.u8 {d2}, [r2, :64]! // 8 tmp bytes = 2 rows x 4 pixels
+ vld1.32 {d0[]}, [r0, :32]
+ vld1.32 {d0[1]}, [r12, :32]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d5
+ vrshrn.i16 d20, q3, #6
+ vst1.16 {d20[0]}, [r0, :16]! // row 0, pixels 0-1
+ vst1.16 {d20[2]}, [r12, :16]! // row 1, pixels 0-1
+ vst1.8 {d20[2]}, [r0], r1 // row 0, pixel 2
+ vst1.8 {d20[6]}, [r12], r1 // row 1, pixel 2
+ bgt 4b
+ pop {r4,pc}
+80: // w = 8: two rows per iteration, 6 pixels stored per row
+ vmov.i8 d16, #64
+ vld1.u8 {d2}, [lr, :64] // 8 mask bytes
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d17, d16, d2 // 64 - m
+ sub r1, r1, #4 // compensates for the 4-byte post-increment of the first store per row
+8:
+ vld1.u8 {d4, d5}, [r2, :128]! // tmp row 0 in d4, row 1 in d5
+ vld1.u8 {d0}, [r0, :64]
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d2, d5
+ vmlal.u8 q10, d1, d17
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.32 {d22[0]}, [r0, :32]! // pixels 0-3
+ vst1.32 {d23[0]}, [r12, :32]!
+ vst1.16 {d22[2]}, [r0, :16], r1 // pixels 4-5
+ vst1.16 {d23[2]}, [r12, :16], r1
+ bgt 8b
+ pop {r4,pc}
+160: // w = 16: two rows per iteration, 12 pixels stored per row
+ vmov.i8 q12, #64
+ vld1.u8 {q14}, [lr, :128] // 16 mask bytes in d28-d29
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 q11, q12, q14 // 64 - m in d22-d23
+ sub r1, r1, #8 // compensates for the 8-byte post-increment of the first store per row
+16:
+ vld1.u8 {q1, q2}, [r2, :128]! // tmp row 0 in q1, row 1 in q2
+ vld1.u8 {q0}, [r0, :128] // dst row 0
+ subs r4, r4, #2
+ vld1.u8 {q13}, [r12, :128] // dst row 1
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d0, d22
+ vmull.u8 q8, d3, d29
+ vmlal.u8 q8, d1, d23
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d4, d28
+ vmlal.u8 q3, d26, d22
+ vmull.u8 q8, d5, d29
+ vmlal.u8 q8, d27, d23
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {d18}, [r0, :64]! // pixels 0-7
+ vst1.u8 {d20}, [r12, :64]!
+ vst1.32 {d19[0]}, [r0, :32], r1 // pixels 8-11
+ vst1.32 {d21[0]}, [r12, :32], r1
+ bgt 16b
+ pop {r4,pc}
+320: // w = 32: one row per iteration, 24 pixels loaded, blended and stored per row
+ vmov.i8 q10, #64
+ vld1.u8 {q2, q3}, [lr, :128] // 32 mask bytes are loaded, only d4-d6 are used below
+ vsub.i8 q11, q10, q2 // 64 - m for pixels 0-15
+ vsub.i8 d24, d20, d6 // 64 - m for pixels 16-23
+32:
+ vld1.u8 {q8, q9}, [r2, :128]! // full 32-pixel tmp row
+ vld1.u8 {d0, d1, d2}, [r0, :64] // 24 dst pixels
+ subs r4, r4, #1
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d24
+ vrshrn.i16 d2, q15, #6
+ vst1.u8 {d0, d1, d2}, [r0, :64], r1
+ bgt 32b
+ pop {r4,pc}
+endfunc
+
+
+// Unfiltered block copy. This has got the same signature as the put_8tap functions (as used here: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride),
+// assumes that the caller has loaded the h argument into r5, that r8 is set to (clz(w)-24),
+// and that the caller has pushed {r4-r11,lr}: every path returns with pop {r4-r11,pc}.
+function put_neon
+ adr r9, L(put_tbl)
+ ldr r8, [r9, r8, lsl #2] // table index r8: 0 for w=128 ... 6 for w=2
+ add r9, r9, r8 // table entries are offsets relative to the table itself
+ bx r9
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 32f - L(put_tbl) + CONFIG_THUMB
+ .word 160f - L(put_tbl) + CONFIG_THUMB
+ .word 8f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2: // w = 2: two rows per iteration, no setup needed so the table jumps straight into the loop
+ vld1.16 {d0[]}, [r2], r3
+ vld1.16 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d1[0]}, [r0, :16], r1
+ bgt 2b
+ pop {r4-r11,pc}
+4: // w = 4: two rows per iteration
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ bgt 4b
+ pop {r4-r11,pc}
+8: // w = 8: two rows per iteration
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bgt 8b
+ pop {r4-r11,pc}
+160: // w = 16: two rows per iteration through separate even/odd row pointers
+ add r8, r0, r1 // r8 = dst row 1
+ lsl r1, r1, #1
+ add r9, r2, r3 // r9 = src row 1
+ lsl r3, r3, #1
+16:
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r8, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+32: // w = 32: one row per iteration
+ vld1.8 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640: // w = 64: one row per iteration, copied as 2 x 32 bytes
+ sub r1, r1, #32 // the first 32-byte access of each row post-increments the pointers
+ sub r3, r3, #32
+64:
+ vld1.8 {q0, q1}, [r2]!
+ vst1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280: // w = 128: one row per iteration, copied as 4 x 32 bytes
+ sub r1, r1, #96 // the first three 32-byte accesses of each row post-increment the pointers
+ sub r3, r3, #96
+128:
+ vld1.8 {q8, q9}, [r2]!
+ vst1.8 {q8, q9}, [r0, :128]!
+ vld1.8 {q10, q11}, [r2]!
+ vst1.8 {q10, q11}, [r0, :128]!
+ vld1.8 {q12, q13}, [r2]!
+ vst1.8 {q12, q13}, [r0, :128]!
+ vld1.8 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+
+// Unfiltered prep: widens 8-bit pixels to 16 bit with a left shift by 4. Register use as seen here: r0 = output, r1 = src, r2 = src stride
+// (NOTE(review): this is not the put_8tap register layout the old comment referred to - confirm). Assumes that the caller has loaded the h argument into r4,
+// that r8 is set to (clz(w)-24), r7 to w*2, and that {r4-r11,lr} were pushed: every path returns with pop {r4-r11,pc}.
+function prep_neon
+ adr r9, L(prep_tbl)
+ ldr r8, [r9, r8, lsl #2] // table index r8: 0 for w=128 ... 5 for w=4 (the table has no w=2 entry)
+ add r9, r9, r8 // table entries are offsets relative to the table itself
+ bx r9
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 160f - L(prep_tbl) + CONFIG_THUMB
+ .word 8f - L(prep_tbl) + CONFIG_THUMB
+ .word 4f - L(prep_tbl) + CONFIG_THUMB
+
+4: // w = 4: two rows per iteration, output written contiguously
+ vld1.32 {d0[]}, [r1], r2 // row 0, replicated into both 32-bit lanes
+ vld1.32 {d2[]}, [r1], r2 // row 1
+ subs r4, r4, #2
+ vshll.u8 q0, d0, #4 // widen to 16 bit and shift left by 4; d0 and d1 both hold row 0
+ vshll.u8 q1, d2, #4 // d2 and d3 both hold row 1
+ vst1.16 {d1, d2}, [r0, :64]! // row 0 followed by row 1
+ bgt 4b
+ pop {r4-r11,pc}
+8: // w = 8: two rows per iteration, output written contiguously
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d0, #4
+ vshll.u8 q1, d2, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+160: // w = 16: two rows per iteration through separate even/odd row pointers
+ add r9, r1, r2 // r9 = src row 1
+ lsl r2, r2, #1
+ add r8, r0, r7 // r8 = output row 1 (r7 = output row size in bytes)
+ lsl r7, r7, #1
+16:
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r9], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d4, #4
+ vshll.u8 q1, d5, #4
+ vshll.u8 q2, d6, #4
+ vshll.u8 q3, d7, #4
+ vst1.16 {q0, q1}, [r0, :128], r7
+ vst1.16 {q2, q3}, [r8, :128], r7
+ bgt 16b
+ pop {r4-r11,pc}
+320: // w = 32: two rows per iteration, each output row written as two 32-byte halves
+ add r8, r0, r3 // r8 = second half of the output row; NOTE(review): needs r3 == 32 here (presumably r3 = w) - confirm
+32:
+ vld1.8 {q0, q1}, [r1], r2 // src row 0
+ subs r4, r4, #2
+ vshll.u8 q8, d0, #4
+ vshll.u8 q9, d1, #4
+ vld1.8 {q2, q3}, [r1], r2 // src row 1
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vshll.u8 q12, d4, #4
+ vst1.16 {q8, q9}, [r0, :128], r7 // row 0, first half; r7 steps to the next output row
+ vshll.u8 q13, d5, #4
+ vst1.16 {q10, q11}, [r8, :128], r7 // row 0, second half
+ vshll.u8 q14, d6, #4
+ vst1.16 {q12, q13}, [r0, :128], r7 // row 1, first half
+ vshll.u8 q15, d7, #4
+ vst1.16 {q14, q15}, [r8, :128], r7 // row 1, second half
+ bgt 32b
+ pop {r4-r11,pc}
+640: // w = 64: one row per iteration, 128 output bytes written as 4 x 32
+ sub r2, r2, #32 // the first 32-byte load of each row post-increments r1
+ add r8, r0, #32 // r0 and r8 write alternating 32-byte chunks
+ mov r6, #64 // so each of them steps by 64 bytes
+64:
+ vld1.8 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshll.u8 q8, d0, #4
+ vshll.u8 q9, d1, #4
+ vld1.8 {q2, q3}, [r1], r2
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vshll.u8 q12, d4, #4
+ vst1.16 {q8, q9}, [r0, :128], r6
+ vshll.u8 q13, d5, #4
+ vshll.u8 q14, d6, #4
+ vst1.16 {q10, q11}, [r8, :128], r6
+ vshll.u8 q15, d7, #4
+ vst1.16 {q12, q13}, [r0, :128], r6
+ vst1.16 {q14, q15}, [r8, :128], r6
+ bgt 64b
+ pop {r4-r11,pc}
+1280: // w = 128: one row per iteration, 256 output bytes written as 8 x 32
+ sub r2, r2, #96 // the first three 32-byte loads of each row post-increment r1
+ add r8, r0, #32 // r0 and r8 write alternating 32-byte chunks
+ mov r6, #64 // so each of them steps by 64 bytes
+128:
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q2, q3}, [r1]!
+ vshll.u8 q10, d0, #4
+ vshll.u8 q11, d1, #4
+ vshll.u8 q12, d2, #4
+ vshll.u8 q13, d3, #4
+ vshll.u8 q14, d4, #4
+ vshll.u8 q15, d5, #4
+ vld1.8 {q8, q9}, [r1]!
+ vst1.16 {q10, q11}, [r0, :128], r6
+ vst1.16 {q12, q13}, [r8, :128], r6
+ vshll.u8 q0, d6, #4
+ vshll.u8 q1, d7, #4
+ vshll.u8 q2, d16, #4
+ vshll.u8 q3, d17, #4
+ vshll.u8 q8, d18, #4
+ vshll.u8 q9, d19, #4
+ vld1.8 {q10, q11}, [r1], r2 // last 32 source bytes of the row, then step to the next row
+ vst1.16 {q14, q15}, [r0, :128], r6
+ vst1.16 {q0, q1}, [r8, :128], r6
+ vshll.u8 q12, d20, #4
+ vshll.u8 q13, d21, #4
+ vshll.u8 q14, d22, #4
+ vshll.u8 q15, d23, #4
+ subs r4, r4, #1
+ vst1.16 {q2, q3}, [r0, :128], r6
+ vst1.16 {q8, q9}, [r8, :128], r6
+ vst1.16 {q12, q13}, [r0, :128], r6
+ vst1.16 {q14, q15}, [r8, :128], r6
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // load one wd-bit element per register, alternating between pointers s0 and s1
+ vld1.\wd {\d0[]}, [\s0], \strd // the element is replicated to every lane; the pointer is post-incremented by strd
+ vld1.\wd {\d1[]}, [\s1], \strd
+.ifnb \d2 // d2 and d3 are loaded as a pair when d2 is given
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4 // d4, d5 and d6 are each optional
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load whole registers, alternating between pointers s0 and s1
+ vld1.8 {\d0}, [\s0], \strd // each pointer is post-incremented by strd
+ vld1.8 {\d1}, [\s1], \strd
+.ifnb \d2 // d2 and d3 are loaded as a pair when d2 is given
+ vld1.8 {\d2}, [\s0], \strd
+ vld1.8 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4 // d4, d5 and d6 are each optional
+ vld1.8 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.8 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.8 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with 16-bit elements (2 pixels per row)
+ load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with 32-bit elements (4 pixels per row)
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1_16 r0, r1, r2, r3, r4 // each register becomes: its own top 2 bytes, then the low 6 bytes of the next register
+ vext.8 \r0, \r0, \r1, #6 // with replicated 16-bit inputs (load_16): lane 0 = this row, lanes 1-3 = next row
+ vext.8 \r1, \r1, \r2, #6
+.ifnb \r3 // r3 and r4 are optional; the last register passed is only read
+ vext.8 \r2, \r2, \r3, #6
+ vext.8 \r3, \r3, \r4, #6
+.endif
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4 // each register becomes: its own top 4 bytes, then the low 4 bytes of the next register
+ vext.8 \r0, \r0, \r1, #4 // with replicated 32-bit inputs (load_32): low half = this row, high half = next row
+ vext.8 \r1, \r1, \r2, #4
+.ifnb \r3 // r3 and r4 are optional; the last register passed is only read
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6 // zero-extend 8-bit d registers into 16-bit q registers, given as (q, d) pairs
+ vmovl.u8 \q0, \d0
+ vmovl.u8 \q1, \d1
+.ifnb \q2 // pairs 2 and 3 are converted together when q2 is given
+ vmovl.u8 \q2, \d2
+ vmovl.u8 \q3, \d3
+.endif
+.ifnb \q4 // pairs 4, 5 and 6 are each optional
+ vmovl.u8 \q4, \d4
+.endif
+.ifnb \q5
+ vmovl.u8 \q5, \d5
+.endif
+.ifnb \q6
+ vmovl.u8 \q6, \d6
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3 // 4-tap sum: d = s0*c0 + s1*c1 + s2*c2 + s3*c3, coefficients in the 16-bit lanes of register d0
+ vmul.s16 \d, \s0, d0[0] // d may be the same register as s0: s0 is only read here
+ vmla.s16 \d, \s1, d0[1]
+ vmla.s16 \d, \s2, d0[2]
+ vmla.s16 \d, \s3, d0[3]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 // 8-tap sum of s0..s7 into the register passed as the first argument
+ vmul.s16 \d0, \s0, d0[0] // literal d0[n] / d1[n] are the coefficient lanes of q0, not macro arguments
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 // two 8-tap sums whose input windows are one source register apart
+ vmul.s16 \d0, \s0, d0[0] // first output: taps over s0..s7; literal d0[n] / d1[n] are the coefficient lanes of q0
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+ vmul.s16 \d1, \s1, d0[0] // second output: taps over s1..s8; starts only after the first is complete,
+ vmla.s16 \d1, \s2, d0[1] // so the outputs may reuse the s0 and s1 registers
+ vmla.s16 \d1, \s3, d0[2]
+ vmla.s16 \d1, \s4, d0[3]
+ vmla.s16 \d1, \s5, d1[0]
+ vmla.s16 \d1, \s6, d1[1]
+ vmla.s16 \d1, \s7, d1[2]
+ vmla.s16 \d1, \s8, d1[3]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 // two 8-tap sums whose input windows are two source registers apart
+ vmul.s16 \d0, \s0, d0[0] // first output: taps over s0..s7; literal d0[n] / d1[n] are the coefficient lanes of q0
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+ vmul.s16 \d1, \s2, d0[0] // second output: taps over s2..s9; starts only after the first is complete,
+ vmla.s16 \d1, \s3, d0[1] // so the outputs may reuse the s0 and s1 registers
+ vmla.s16 \d1, \s4, d0[2]
+ vmla.s16 \d1, \s5, d0[3]
+ vmla.s16 \d1, \s6, d1[0]
+ vmla.s16 \d1, \s7, d1[1]
+ vmla.s16 \d1, \s8, d1[2]
+ vmla.s16 \d1, \s9, d1[3]
+.endm
+.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3 // rounding right shift of signed 16-bit q regs, saturated and narrowed to unsigned 8-bit d regs
+ vqrshrun.s16 \d0, \q0, #\shift
+.ifnb \q1 // the second (q, d) pair is optional
+ vqrshrun.s16 \d1, \q1, #\shift
+.endif
+.ifnb \q2 // pairs 2 and 3 are processed together when q2 is given
+ vqrshrun.s16 \d2, \q2, #\shift
+ vqrshrun.s16 \d3, \q3, #\shift
+.endif
+.endm
+.macro vrshr_s16 shift, r0, r1, r2, r3 // in-place rounding right shift of signed 16-bit lanes
+ vrshr.s16 \r0, \r0, #\shift
+.ifnb \r1 // r1 is optional
+ vrshr.s16 \r1, \r1, #\shift
+.endif
+.ifnb \r2 // r2 and r3 are processed together when r2 is given
+ vrshr.s16 \r2, \r2, #\shift
+ vrshr.s16 \r3, \r3, #\shift
+.endif
+.endm
+.macro st_16 strd, reg, lanes // store 16-bit lanes of reg alternately through the fixed pointers r0 and r8, post-incrementing each by strd
+ vst1.16 {\reg[0]}, [r0, :16], \strd
+ vst1.16 {\reg[1]}, [r8, :16], \strd
+.if \lanes > 2 // lanes = 2 stores two rows, anything larger stores four
+ vst1.16 {\reg[2]}, [r0, :16], \strd
+ vst1.16 {\reg[3]}, [r8, :16], \strd
+.endif
+.endm
+.macro st_32 strd, r0, r1 // store the two 32-bit lanes of each argument alternately through the pointers r0 and r8
+ vst1.32 {\r0[0]}, [r0, :32], \strd // the bracketed r0 / r8 are the literal pointer registers, not macro arguments
+ vst1.32 {\r0[1]}, [r8, :32], \strd
+.ifnb \r1 // the second data register is optional
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r8, :32], \strd
+.endif
+.endm
+.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 // store whole registers alternately through the pointers r0 and r8
+ vst1.8 {\r0}, [r0, \align], \strd // the bracketed r0 / r8 are the literal pointer registers, not macro arguments
+ vst1.8 {\r1}, [r8, \align], \strd
+.ifnb \r2 // r2 and r3 are stored as a pair when r2 is given
+ vst1.8 {\r2}, [r0, \align], \strd
+ vst1.8 {\r3}, [r8, \align], \strd
+.endif
+.ifnb \r4 // r4 through r7 are stored together when r4 is given
+ vst1.8 {\r4}, [r0, \align], \strd
+ vst1.8 {\r5}, [r8, \align], \strd
+ vst1.8 {\r6}, [r0, \align], \strd
+ vst1.8 {\r7}, [r8, \align], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3 // final rounding and store for 4-pixel-wide rows; q1/d2/d3 are optional
+.ifc \type, put
+ vqrshrun_s16 6, \q0, \d0, \q1, \d2 // put: round by 6 bits, saturate to 8-bit pixels
+ st_32 \strd, \d0, \d2 // one 32-bit lane (4 pixels) per row
+.else
+ vrshr_s16 2, \q0, \q1 // any other type: keep 16-bit values, round by 2 bits
+ st_reg \strd, :64, \d0, \d1, \d2, \d3 // one d register (4 x 16 bit) per row
+.endif
+.endm
+.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3 // final rounding and store for 8-pixel-wide rows; the q2/q3 pairs are optional
+.ifc \type, put
+ vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 // put: round by 6 bits, saturate to 8-bit pixels
+ st_reg \strd, :64, \d0, \d1, \d2, \d3 // one d register (8 pixels) per row
+.else
+ vrshr_s16 2, \q0, \q1, \q2, \q3 // any other type: keep 16-bit values, round by 2 bits
+ st_reg \strd, :128,\q0, \q1, \q2, \q3 // one q register (8 x 16 bit) per row
+.endif
+.endm
+.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3 // final rounding and store for two 16-pixel rows: q0,q1 = first row, q2,q3 = second
+.ifc \type, put
+ vqrshrun.s16 \d0, \q0, #6 // put: narrow each row into one q register;
+ vqrshrun.s16 \d1, \q1, #6 // the callers in this file pass d0/d1 as the halves of q0 and d4/d5 as the halves of q2
+ vqrshrun.s16 \d4, \q2, #6
+ vqrshrun.s16 \d5, \q3, #6
+ st_reg \strd, :128, \q0, \q2 // 16 pixels per row
+.else
+ vrshr_s16 2, \q0, \q1, \q2, \q3 // any other type: keep 16-bit values, round by 2 bits
+ vst1.16 {\q0, \q1}, [r0, :128], \strd // first row through pointer r0
+ vst1.16 {\q2, \q3}, [r8, :128], \strd // second row through pointer r8
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v // emits the exported entry point op_8tap_type_8bpc_neon
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr} // the shared code returns with pop {r4-r11,pc}
+ movw r8, \type_h // horizontal filter selector (REGULAR / SMOOTH / SHARP)
+ movw r9, \type_v // vertical filter selector
+ b \op\()_8tap_neon // tail-branch into the shared op_8tap_neon body
+endfunc
+.endm
+
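+// Filter type selectors: bits 7-13 hold 15 * (8-tap filter set), bits 0-6 hold 15 * (4-tap filter set).
+// No spaces in these expressions, due to gas-preprocessor.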
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r10
+ mul \my, \my, r10
+ add \mx, \mx, r8 // mx, 8tap_h, 4tap_h
+ add \my, \my, r9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ clz r8, \w
+ tst \mx, #(0x7f << 14)
+ sub r8, r8, #24
+ movrel r10, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r9, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r9
+ tst \my, #(0x7f << 14)
+ add \mx, r10, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r9, L(\type\()_8tap_h_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.8 {d4}, [\src], \s_strd
+ vld1.8 {d6}, [\sr2], \s_strd
+ vmovl.u8 q2, d4
+ vmovl.u8 q3, d6
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmul.s16 d2, d4, d0[0]
+ vmla.s16 d2, d5, d0[1]
+ vmla.s16 d2, d6, d0[2]
+ vmla.s16 d2, d7, d0[3]
+ vrshr.s16 d2, d2, #2
+ vqrshrun.s16 d2, q1, #4
+ vst1.16 {d2[0]}, [\dst, :16], \d_strd
+ vst1.16 {d2[1]}, [\ds2, :16], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.8 {d16}, [\src], \s_strd
+ vld1.8 {d24}, [\sr2], \s_strd
+ vmovl.u8 q8, d16
+ vmovl.u8 q12, d24
+ vext.8 d18, d16, d17, #2
+ vext.8 d20, d16, d17, #4
+ vext.8 d22, d16, d17, #6
+ vext.8 d26, d24, d25, #2
+ vext.8 d28, d24, d25, #4
+ vext.8 d30, d24, d25, #6
+ subs \h, \h, #2
+ vmul.s16 d4, d16, d0[0]
+ vmla.s16 d4, d18, d0[1]
+ vmla.s16 d4, d20, d0[2]
+ vmla.s16 d4, d22, d0[3]
+ vmul.s16 d5, d24, d0[0]
+ vmla.s16 d5, d26, d0[1]
+ vmla.s16 d5, d28, d0[2]
+ vmla.s16 d5, d30, d0[3]
+ vrshr.s16 q2, q2, #2
+.ifc \type, put
+ vqrshrun.s16 d4, q2, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+.endif
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+8:
+ vld1.8 {q8}, [\src], \s_strd
+ vld1.8 {q12}, [\sr2], \s_strd
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q13, d25
+ vmovl.u8 q12, d24
+
+ vmul.s16 q10, q8, d0[0]
+ vmul.s16 q14, q12, d0[0]
+.irpc i, 1234567
+ vext.8 q11, q8, q9, #(2*\i)
+ vext.8 q15, q12, q13, #(2*\i)
+.if \i < 4
+ vmla.s16 q10, q11, d0[\i]
+ vmla.s16 q14, q15, d0[\i]
+.else
+ vmla.s16 q10, q11, d1[\i-4]
+ vmla.s16 q14, q15, d1[\i-4]
+.endif
+.endr
+ subs \h, \h, #2
+ vrshr.s16 q10, q10, #2
+ vrshr.s16 q14, q14, #2
+.ifc \type, put
+ vqrshrun.s16 d20, q10, #4
+ vqrshrun.s16 d28, q14, #4
+ vst1.8 {d20}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q10}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ bgt 8b
+ pop {r4-r11,pc}
+
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ // This could be done without touching q4-q6, by using only
+ // one temporary for vext in the loop. That's slower on A7 and A53,
+ // (but surprisingly, marginally faster on A8 and A73).
+ vpush {q4-q6}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w
+.endif
+161:
+ vld1.8 {d16, d17, d18}, [\src]!
+ vld1.8 {d24, d25, d26}, [\sr2]!
+ mov \mx, \w
+ vmovl.u8 q10, d18
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q14, d26
+ vmovl.u8 q13, d25
+ vmovl.u8 q12, d24
+
+16:
+ vmul.s16 q1, q8, d0[0]
+ vmul.s16 q2, q9, d0[0]
+ vmul.s16 q3, q12, d0[0]
+ vmul.s16 q4, q13, d0[0]
+.irpc i, 1234567
+ vext.8 q5, q8, q9, #(2*\i)
+ vext.8 q6, q9, q10, #(2*\i)
+ vext.8 q11, q12, q13, #(2*\i)
+ vext.8 q15, q13, q14, #(2*\i)
+.if \i < 4
+ vmla.s16 q1, q5, d0[\i]
+ vmla.s16 q2, q6, d0[\i]
+ vmla.s16 q3, q11, d0[\i]
+ vmla.s16 q4, q15, d0[\i]
+.else
+ vmla.s16 q1, q5, d1[\i-4]
+ vmla.s16 q2, q6, d1[\i-4]
+ vmla.s16 q3, q11, d1[\i-4]
+ vmla.s16 q4, q15, d1[\i-4]
+.endif
+.endr
+ vrshr.s16 q1, q1, #2
+ vrshr.s16 q2, q2, #2
+ vrshr.s16 q3, q3, #2
+ vrshr.s16 q4, q4, #2
+ subs \mx, \mx, #16
+.ifc \type, put
+ vqrshrun.s16 d2, q1, #4
+ vqrshrun.s16 d3, q2, #4
+ vqrshrun.s16 d4, q3, #4
+ vqrshrun.s16 d5, q4, #4
+ vst1.8 {q1}, [\dst, :128]!
+ vst1.8 {q2}, [\ds2, :128]!
+.else
+ vst1.16 {q1, q2}, [\dst, :128]!
+ vst1.16 {q3, q4}, [\ds2, :128]!
+.endif
+ ble 9f
+
+ vmov q8, q10
+ vmov q12, q14
+ vld1.8 {d18, d19}, [\src]!
+ vld1.8 {d26, d27}, [\sr2]!
+ vmovl.u8 q10, d19
+ vmovl.u8 q9, d18
+ vmovl.u8 q14, d27
+ vmovl.u8 q13, d26
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q6}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r9, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r9
+ add \my, r10, \my, lsl #3
+
+ adr r9, L(\type\()_8tap_v_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_16 d1, d2, d3, d4, d5
+ bgt 24f
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
+ mul_mla_4 d6, d16, d18, d20, d22
+ vqrshrun_s16 6, q3, d6
+ st_16 \d_strd, d6, 2
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_16 \sr2, \src, \s_strd, d6, d7
+ interleave_1_16 d5, d6, d7
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6
+ vmov d17, d20
+ vmov d19, d22
+ vmov d21, d24
+ vmov d23, d26
+ mul_mla_4 q3, q8, q9, q10, q11
+ vqrshrun_s16 6, q3, d6
+ st_16 \d_strd, d6, 4
+ pop {r4-r11,pc}
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14
+ interleave_1_16 d2, d4, d6, d8, d10
+ interleave_1_16 d10, d12, d14
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12
+ vmov d3, d6
+ vmov d5, d8
+ vmov d7, d10
+ vmov d9, d12
+216:
+ subs \h, \h, #4
+ load_16 \sr2, \src, \s_strd, d16, d18, d20, d22
+ interleave_1_16 d14, d16, d18, d20, d22
+ vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20
+ vmov d11, d14
+ vmov d13, d16
+ vmov d15, d18
+ vmov d17, d20
+ mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8
+ vqrshrun_s16 6, q1, d2
+ st_16 \d_strd, d2, 4
+ ble 0f
+ cmp \h, #2
+ vmov q1, q5
+ vmov q2, q6
+ vmov q3, q7
+ vmov q4, q8
+ vmov q5, q9
+ vmov q6, q10
+ vmov d14, d22
+ beq 26f
+ b 216b
+26:
+ load_16 \sr2, \src, \s_strd, d16, d18
+ interleave_1_16 d14, d16, d18
+ vmovl_u8 q7, d14, q8, d16
+ vmov d11, d14
+ vmov d13, d16
+ mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16
+ vqrshrun_s16 6, q1, d2
+ st_16 \d_strd, d2, 2
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
+ mul_mla_4 q3, q8, q9, q10, q11
+ shift_store_4 \type, \d_strd, q3, d6, d7
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmovl_u8 q12, d5, q13, d6
+ mul_mla_4 q3, q10, q11, q12, q13
+ shift_store_4 \type, \d_strd, q3, d6, d7
+0:
+ pop {r4-r11,pc}
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ vpush {q4}
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
+ interleave_1_32 d2, d4, d6
+ interleave_1_32 d6, d8, d16, d18, d20
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18
+
+48:
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d22, d24, d26, d28
+ interleave_1_32 d20, d22, d24, d26, d28
+ vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26
+ mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13
+ shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d30, d2
+ subs \h, \h, #2
+ interleave_1_32 d28, d30, d2
+ vmovl_u8 q14, d28, q15, d30
+ mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15
+ shift_store_4 \type, \d_strd, q8, d16, d17
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d4, d6
+ subs \h, \h, #2
+ interleave_1_32 d2, d4, d6
+ vmovl_u8 q1, d2, q2, d4
+ mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2
+ shift_store_4 \type, \d_strd, q9, d18, d19
+ ble 0f
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d8, d16, d18, d20
+ interleave_1_32 d6, d8, d16, d18, d20
+ vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
+ mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
+ shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
+ bgt 48b
+0:
+ vpop {q4}
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5
+ mul_mla_4 q1, q8, q9, q10, q11
+ mul_mla_4 q2, q9, q10, q11, q12
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmovl_u8 q13, d6, q14, d7
+ mul_mla_4 q1, q10, q11, q12, q13
+ mul_mla_4 q2, q11, q12, q13, q14
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d22, d24
+ vmovl_u8 q11, d22, q12, d24
+ mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d26, d28
+ vmovl_u8 q13, d26, q14, d28
+ mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
+ shift_store_8 \type, \d_strd, q3, d6, q4, d8
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d30, d2
+ vmovl_u8 q15, d30, q1, d2
+ mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
+ shift_store_8 \type, \d_strd, q8, d16, q9, d18
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d4, d6
+ vmovl_u8 q2, d4, q3, d6
+ mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
+ shift_store_8 \type, \d_strd, q10, d20, q11, d22
+ ble 9f
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
+ vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20
+ mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8
+ mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10
+ shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30
+ bgt 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ vpop {q4}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ cmp \h, #2
+ load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15
+ vmovl.u8 q1, d22
+ vmovl.u8 q2, d24
+ vmovl.u8 q3, d26
+ vmovl.u8 q8, d28
+ vmovl.u8 q9, d30
+ vmovl.u8 q11, d23
+ vmovl.u8 q12, d25
+ vmovl.u8 q13, d27
+ vmovl.u8 q14, d29
+ vmovl.u8 q15, d31
+ mul_mla_4 q1, q1, q2, q3, q8
+ mul_mla_4 q10, q2, q3, q8, q9
+ mul_mla_4 q2, q11, q12, q13, q14
+ mul_mla_4 q11, q12, q13, q14, q15
+ shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmovl.u8 q1, d20
+ vmovl.u8 q10, d21
+ vmovl.u8 q12, d22
+ vmovl.u8 q11, d23
+ mul_mla_4 q2, q3, q8, q9, q1
+ mul_mla_4 q3, q13, q14, q15, q10
+ mul_mla_4 q13, q8, q9, q1, q12
+ mul_mla_4 q14, q14, q15, q10, q11
+ shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r9, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r9
+ add \my, r10, \my, lsl #3
+
+ adr r9, L(\type\()_8tap_hv_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+
+ vld1.8 {d26}, [\src], \s_strd
+ vmovl.u8 q13, d26
+ vext.8 q14, q13, q13, #2
+ vmul.s16 d26, d26, d0
+ vmul.s16 d28, d28, d0
+ vpadd.s16 d26, d26, d28
+ vpadd.s16 d26, d26, d26
+ vrshr.s16 d16, d26, #2
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vmov d17, d26
+ vext.8 d16, d16, d26, #4
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d26, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d26, d2[3]
+
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqmovun.s16 d4, q2
+ subs \h, \h, #2
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d26
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.8 {d26}, [\src], \s_strd
+ vmovl.u8 q13, d26
+ vext.8 q14, q13, q13, #2
+ vmul.s16 d26, d26, d0
+ vmul.s16 d28, d28, d0
+ vpadd.s16 d26, d26, d28
+ vpadd.s16 d26, d26, d26
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d16, d16, d16, #4
+ vmov d17, d26
+ vext.8 d16, d16, d26, #4
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d26, #4
+ vmov d19, d26
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d26, #4
+ vmov d21, d26
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d26, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d20, d3[0]
+ vmlal.s16 q2, d21, d3[1]
+ vmlal.s16 q2, d22, d3[2]
+ vmlal.s16 q2, d26, d3[3]
+
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqmovun.s16 d4, q2
+ subs \h, \h, #2
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d19
+ vmov d18, d20
+ vmov d19, d21
+ vmov d20, d22
+ vmov d21, d26
+ b 28b
+
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_2): // 4-tap horizontal filter producing 2 pixels for each of two rows; result in d26, clobbers q13-q15
+ vld1.8 {d28}, [\sr2], \s_strd // 8 bytes of the sr2 row, pointer post-incremented by the stride
+ vld1.8 {d30}, [\src], \s_strd // 8 bytes of the src row
+ vext.8 d29, d28, d28, #1 // sr2 row shifted by one pixel
+ vext.8 d31, d30, d30, #1 // src row shifted by one pixel
+ vmovl.u8 q13, d28 // d26 = sr2 row pixels 0-3 widened to 16 bit
+ vmovl.u8 q14, d29 // d28 = sr2 row pixels 1-4 widened to 16 bit
+ vmov d27, d28 // save pixels 1-4 before q14 is reused
+ vmovl.u8 q14, d30 // d28 = src row pixels 0-3
+ vmovl.u8 q15, d31 // d30 = src row pixels 1-4
+ vtrn.32 d26, d28 // d26 = {sr2 px 0-1, src px 0-1}, d28 = {sr2 px 2-3, src px 2-3}
+ vtrn.32 d27, d30 // d27 = {sr2 px 1-2, src px 1-2}, d30 = {sr2 px 3-4, src px 3-4}
+ vmul.s16 d26, d26, d0[0] // tap 0
+ vmla.s16 d26, d27, d0[1] // tap 1
+ vmla.s16 d26, d28, d0[2] // tap 2
+ vmla.s16 d26, d30, d0[3] // tap 3
+ vrshr.s16 d26, d26, #2 // rounding shift of the horizontal intermediate
+ vext.8 d27, d26, d26, #4 // d27 = d26 with its two halves swapped (src row first)
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.8 {d30}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d31, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d31, d0[3]
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_4)
+ vmov d17, d26
+ vmov d18, d27
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d26, d2[3]
+ vmull.s16 q3, d17, d2[0]
+ vmlal.s16 q3, d18, d2[1]
+ vmlal.s16 q3, d26, d2[2]
+ vmlal.s16 q3, d27, d2[3]
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqrshrn.s32 d6, q3, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d6, q3
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[0]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d6}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ vmov d17, d26
+ vmov d18, d27
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.8 {d30}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d31, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d31, d0[3]
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_4)
+ vmov d17, d26
+ vmov d18, d27
+ bl L(\type\()_8tap_filter_4)
+ vmov d19, d26
+ vmov d20, d27
+ bl L(\type\()_8tap_filter_4)
+ vmov d21, d26
+ vmov d22, d27
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d20, d3[0]
+ vmlal.s16 q2, d21, d3[1]
+ vmlal.s16 q2, d22, d3[2]
+ vmlal.s16 q2, d26, d3[3]
+ vmull.s16 q3, d17, d2[0]
+ vmlal.s16 q3, d18, d2[1]
+ vmlal.s16 q3, d19, d2[2]
+ vmlal.s16 q3, d20, d2[3]
+ vmlal.s16 q3, d21, d3[0]
+ vmlal.s16 q3, d22, d3[1]
+ vmlal.s16 q3, d26, d3[2]
+ vmlal.s16 q3, d27, d3[3]
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqrshrn.s32 d6, q3, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d6, q3
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[0]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d6}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ vmov d17, d19
+ vmov d18, d20
+ vmov d19, d21
+ vmov d20, d22
+ vmov d21, d26
+ vmov d22, d27
+ b 48b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_4): // 4-tap horizontal filter producing 4 pixels for each of two rows; d26 = sr2 row, d27 = src row
+ vld1.8 {d30}, [\sr2], \s_strd // 8 bytes of the sr2 row, pointer post-incremented by the stride
+ vld1.8 {d31}, [\src], \s_strd // 8 bytes of the src row
+ vmovl.u8 q14, d30 // q14 (d28, d29) = sr2 row widened to 16 bit
+ vext.8 d27, d28, d29, #2 // pixels 1-4
+ vext.8 d30, d28, d29, #4 // pixels 2-5
+ vext.8 d1, d28, d29, #6 // pixels 3-6; d1 serves as scratch, only the taps in d0 are used here
+ vmul.s16 d26, d28, d0[0] // tap 0
+ vmla.s16 d26, d27, d0[1] // tap 1
+ vmla.s16 d26, d30, d0[2] // tap 2
+ vmla.s16 d26, d1, d0[3] // tap 3
+
+ vmovl.u8 q14, d31 // q14 = src row widened to 16 bit
+ vext.8 d30, d28, d29, #2 // pixels 1-4
+ vext.8 d31, d28, d29, #4 // pixels 2-5
+ vext.8 d1, d28, d29, #6 // pixels 3-6
+ vmul.s16 d27, d28, d0[0] // tap 0
+ vmla.s16 d27, d30, d0[1] // tap 1
+ vmla.s16 d27, d31, d0[2] // tap 2
+ vmla.s16 d27, d1, d0[3] // tap 3
+ vrshr.s16 d26, d26, #2 // rounding shift of the sr2 row result
+ vrshr.s16 d27, d27, #2 // rounding shift of the src row result
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ vpush {q4-q7}
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.8 {q14}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vrshr.s16 q3, q10, #2
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q4, q10
+ vmov q5, q11
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q12, d6, d2[0]
+ vmull.s16 q13, d7, d2[0]
+ vmull.s16 q14, d8, d2[0]
+ vmull.s16 q15, d9, d2[0]
+ vmlal.s16 q12, d8, d2[1]
+ vmlal.s16 q13, d9, d2[1]
+ vmlal.s16 q14, d10, d2[1]
+ vmlal.s16 q15, d11, d2[1]
+ vmlal.s16 q12, d10, d2[2]
+ vmlal.s16 q13, d11, d2[2]
+ vmlal.s16 q14, d20, d2[2]
+ vmlal.s16 q15, d21, d2[2]
+ vmlal.s16 q12, d20, d2[3]
+ vmlal.s16 q13, d21, d2[3]
+ vmlal.s16 q14, d22, d2[3]
+ vmlal.s16 q15, d23, d2[3]
+ vqrshrn.s32 d24, q12, #\shift_hv
+ vqrshrn.s32 d25, q13, #\shift_hv
+ vqrshrn.s32 d28, q14, #\shift_hv
+ vqrshrn.s32 d29, q15, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d28, q14
+ vst1.8 {d24}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q12}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q3, q5
+ vmov q4, q10
+ vmov q5, q11
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.8 {q14}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vrshr.s16 q3, q10, #2
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q4, q10
+ vmov q5, q11
+ bl L(\type\()_8tap_filter_8)
+ vmov q6, q10
+ vmov q7, q11
+ bl L(\type\()_8tap_filter_8)
+ vmov q8, q10
+ vmov q9, q11
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q12, d6, d2[0]
+ vmull.s16 q13, d7, d2[0]
+ vmull.s16 q14, d8, d2[0]
+ vmull.s16 q15, d9, d2[0]
+ vmlal.s16 q12, d8, d2[1]
+ vmlal.s16 q13, d9, d2[1]
+ vmlal.s16 q14, d10, d2[1]
+ vmlal.s16 q15, d11, d2[1]
+ vmlal.s16 q12, d10, d2[2]
+ vmlal.s16 q13, d11, d2[2]
+ vmlal.s16 q14, d12, d2[2]
+ vmlal.s16 q15, d13, d2[2]
+ vmlal.s16 q12, d12, d2[3]
+ vmlal.s16 q13, d13, d2[3]
+ vmlal.s16 q14, d14, d2[3]
+ vmlal.s16 q15, d15, d2[3]
+ vmlal.s16 q12, d14, d3[0]
+ vmlal.s16 q13, d15, d3[0]
+ vmlal.s16 q14, d16, d3[0]
+ vmlal.s16 q15, d17, d3[0]
+ vmlal.s16 q12, d16, d3[1]
+ vmlal.s16 q13, d17, d3[1]
+ vmlal.s16 q14, d18, d3[1]
+ vmlal.s16 q15, d19, d3[1]
+ vmlal.s16 q12, d18, d3[2]
+ vmlal.s16 q13, d19, d3[2]
+ vmlal.s16 q14, d20, d3[2]
+ vmlal.s16 q15, d21, d3[2]
+ vmlal.s16 q12, d20, d3[3]
+ vmlal.s16 q13, d21, d3[3]
+ vmlal.s16 q14, d22, d3[3]
+ vmlal.s16 q15, d23, d3[3]
+ vqrshrn.s32 d24, q12, #\shift_hv
+ vqrshrn.s32 d25, q13, #\shift_hv
+ vqrshrn.s32 d28, q14, #\shift_hv
+ vqrshrn.s32 d29, q15, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d28, q14
+ vst1.8 {d24}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q12}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q3, q5
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_8): // 8-tap horizontal filter producing 8 pixels for each of two rows; q10 = sr2 row, q11 = src row, clobbers q12-q15
+ vld1.8 {q14}, [\sr2], \s_strd // 16 bytes of the sr2 row, pointer post-incremented by the stride
+ vld1.8 {q15}, [\src], \s_strd // 16 bytes of the src row
+ vmovl.u8 q12, d28 // sr2 row pixels 0-7 widened to 16 bit
+ vmovl.u8 q13, d29 // sr2 row pixels 8-15 widened to 16 bit
+ vmul.s16 q10, q12, d0[0] // tap 0
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i) // row shifted by i pixels
+ vmla.s16 q10, q14, d0[\i] // taps 1-3 come from d0
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i) // row shifted by i pixels
+ vmla.s16 q10, q14, d1[\i-4] // taps 4-7 come from d1
+.endr
+ vmovl.u8 q12, d30 // src row pixels 0-7 widened to 16 bit
+ vmovl.u8 q13, d31 // src row pixels 8-15 widened to 16 bit
+ vmul.s16 q11, q12, d0[0] // tap 0
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i) // row shifted by i pixels
+ vmla.s16 q11, q14, d0[\i] // taps 1-3
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i) // row shifted by i pixels
+ vmla.s16 q11, q14, d1[\i-4] // taps 4-7
+.endr
+ vrshr.s16 q10, q10, #2 // rounding shift of the sr2 row result
+ vrshr.s16 q11, q11, #2 // rounding shift of the src row result
+ bx lr
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1
+ push {r4-r11,lr} // bilinear put/prep entry point; 9 registers saved, so stack arguments start at sp + 36
+ ldrd r4, r5, [sp, #36] // first two stack arguments
+ ldrd r6, r7, [sp, #44] // next two stack arguments (which macro parameters these are depends on the filter_fn instantiation)
+ vdup.8 d1, \mx // d1 = mx, weight of the right neighbour
+ vdup.8 d3, \my // d3 = my, weight of the lower neighbour
+ rsb r8, \mx, #16 // 16 - mx
+ rsb r9, \my, #16 // 16 - my
+ vdup.8 d0, r8 // d0 = 16 - mx, weight of the current pixel (horizontal)
+ vdup.8 d2, r9 // d2 = 16 - my, weight of the current pixel (vertical)
+.ifc \type, prep
+ lsl \d_strd, \w, #1 // prep: output stride is w * 2 bytes
+.endif
+ clz r8, \w // r8 = clz(w) - 24 becomes the jump table index: 0 for w = 128 up to 6 for w = 2
+ cmp \mx, #0
+ sub r8, r8, #24
+ bne L(\type\()_bilin_h) // mx != 0: horizontal (or 2D) filtering
+ cmp \my, #0
+ bne L(\type\()_bilin_v) // only my != 0: vertical filtering
+ b \type\()_neon // mx == my == 0: tail call into the routine defined elsewhere (presumably the unfiltered copy)
+
+L(\type\()_bilin_h): // horizontal bilinear filter; d0 = 16 - mx, d1 = mx, r8 = jump table index
+ cmp \my, #0
+ bne L(\type\()_bilin_hv) // my != 0 as well: take the 2D path
+
+ adr r9, L(\type\()_bilin_h_tbl)
+ ldr r8, [r9, r8, lsl #2] // table entries are offsets relative to the table itself
+ add r9, r9, r8
+ bx r9 // CONFIG_THUMB in the entries presumably sets the Thumb bit for this bx
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \d_strd, \d_strd, #1 // two rows are handled per iteration, so both strides are doubled
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.32 {d4[]}, [\src], \s_strd // 4 pixels of row 0, replicated into both halves
+ vld1.32 {d6[]}, [\sr2], \s_strd // 4 pixels of row 1
+ vext.8 d5, d4, d4, #1 // row 0 shifted by one pixel
+ vext.8 d7, d6, d6, #1 // row 1 shifted by one pixel
+ vtrn.16 q2, q3 // d4 = {row 0 px 0-1, row 1 px 0-1, ...}, d5 = the same shifted by one pixel
+ subs \h, \h, #2
+ vmull.u8 q3, d4, d0 // px * (16 - mx)
+ vmlal.u8 q3, d5, d1 // + next px * mx
+ vqrshrn.u16 d4, q3, #4 // round, shift by 4 and narrow back to 8 bit
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd // 2 pixels of row 0
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd // 2 pixels of row 1
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \d_strd, \d_strd, #1 // two rows per iteration
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.8 {d4}, [\src], \s_strd // 8 pixels of row 0
+ vld1.8 {d6}, [\sr2], \s_strd // 8 pixels of row 1
+ vext.8 d5, d4, d4, #1 // row 0 shifted by one pixel
+ vext.8 d7, d6, d6, #1 // row 1 shifted by one pixel
+ vtrn.32 q2, q3 // d4 = {row 0 px 0-3, row 1 px 0-3}, d5 = {row 0 px 1-4, row 1 px 1-4}
+ subs \h, \h, #2
+ vmull.u8 q3, d4, d0 // px * (16 - mx)
+ vmlal.u8 q3, d5, d1 // + next px * mx; d6 = row 0 sums, d7 = row 1 sums
+.ifc \type, put
+ vqrshrn.u16 d4, q3, #4 // round, shift by 4 and narrow back to 8 bit
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d6}, [\dst, :64], \d_strd // prep: store the 16 bit sums without shifting (weights add up to 16)
+ vst1.16 {d7}, [\ds2, :64], \d_strd
+.endif
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \d_strd, \d_strd, #1 // two rows per iteration
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.8 {q8}, [\src], \s_strd // 16 pixels of row 0 (9 are needed)
+ vld1.8 {q10}, [\sr2], \s_strd // 16 pixels of row 1
+ vext.8 q9, q8, q8, #1 // row 0 shifted by one pixel
+ vext.8 q11, q10, q10, #1 // row 1 shifted by one pixel
+ subs \h, \h, #2
+ vmull.u8 q8, d16, d0 // row 0: px * (16 - mx)
+ vmull.u8 q10, d20, d0 // row 1: px * (16 - mx)
+ vmlal.u8 q8, d18, d1 // row 0: + next px * mx
+ vmlal.u8 q10, d22, d1 // row 1: + next px * mx
+.ifc \type, put
+ vqrshrn.u16 d16, q8, #4 // round, shift by 4 and narrow back to 8 bit
+ vqrshrn.u16 d18, q10, #4
+ vst1.8 {d16}, [\dst, :64], \d_strd
+ vst1.8 {d18}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q8}, [\dst, :128], \d_strd // prep: store the 16 bit sums without shifting
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+.endif
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+
+ sub \s_strd, \s_strd, \w // each row pass advances the source pointers by w + 8 bytes,
+ sub \s_strd, \s_strd, #8 // so only the remainder of the doubled stride is added at 9: below
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1 // put: same adjustment for the destination,
+ sub \d_strd, \d_strd, \w // which advances by w bytes per row pass
+.endif
+161:
+ vld1.8 {d16}, [\src]! // first 8 pixels of row 0
+ vld1.8 {d22}, [\sr2]! // first 8 pixels of row 1
+ mov \mx, \w // mx is free now (weights live in d0/d1); reuse it as the column counter
+
+16:
+ vld1.8 {d17,d18}, [\src]! // next 16 pixels of row 0
+ vld1.8 {d23,d24}, [\sr2]! // next 16 pixels of row 1
+ vext.8 q10, q8, q9, #1 // row 0 shifted by one pixel
+ vext.8 q13, q11, q12, #1 // row 1 shifted by one pixel
+ vmull.u8 q2, d16, d0 // row 0 px 0-7 * (16 - mx)
+ vmull.u8 q3, d17, d0 // row 0 px 8-15 * (16 - mx)
+ vmull.u8 q14, d22, d0 // row 1 px 0-7 * (16 - mx)
+ vmull.u8 q15, d23, d0 // row 1 px 8-15 * (16 - mx)
+ vmlal.u8 q2, d20, d1 // + next px * mx
+ vmlal.u8 q3, d21, d1
+ vmlal.u8 q14, d26, d1
+ vmlal.u8 q15, d27, d1
+ subs \mx, \mx, #16 // 16 columns done
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4 // round, shift by 4 and narrow back to 8 bit
+ vqrshrn.u16 d5, q3, #4
+ vqrshrn.u16 d28, q14, #4
+ vqrshrn.u16 d29, q15, #4
+ vst1.8 {q2}, [\dst, :128]!
+ vst1.8 {q14}, [\ds2, :128]!
+.else
+ vst1.16 {q2, q3}, [\dst, :128]! // prep: store the 16 bit sums without shifting
+ vst1.16 {q14, q15}, [\ds2, :128]!
+.endif
+ ble 9f
+
+ vmov d16, d18 // carry the last 8 loaded pixels over as the start of the next 16
+ vmov d22, d24
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd // step to the next pair of rows
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_v): // vertical bilinear filter; d2 = 16 - my, d3 = my, r8 = jump table index
+ cmp \h, #4
+ adr r9, L(\type\()_bilin_v_tbl)
+ ldr r8, [r9, r8, lsl #2] // table entries are offsets relative to the table itself
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2 // flags are consumed by the bgt below (the NEON load in between does not alter them)
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \s_strd, \s_strd, #1 // two rows per step
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.16 {d16[]}, [\src], \s_strd // row 0 (2 pixels, replicated)
+ bgt 24f // h > 2: use the four-rows-per-iteration loop
+22:
+ vld1.16 {d17[]}, [\sr2], \s_strd // row 1
+ vld1.16 {d18[]}, [\src], \s_strd // row 2
+ vext.8 d16, d16, d17, #6 // d16 = {row 0, row 1, ...}
+ vext.8 d17, d17, d18, #6 // d17 = {row 1, row 2, ...}
+ vmull.u8 q2, d16, d2 // upper row * (16 - my)
+ vmlal.u8 q2, d17, d3 // + lower row * my
+ vqrshrn.u16 d4, q2, #4 // round, shift by 4 and narrow back to 8 bit
+ vst1.16 {d4[0]}, [\dst, :16]
+ vst1.16 {d4[1]}, [\ds2, :16]
+ pop {r4-r11,pc}
+24: // 2x4, 2x6, 2x8, ... v
+ vld1.16 {d17[]}, [\sr2], \s_strd // row 1
+ vld1.16 {d18[]}, [\src], \s_strd // row 2
+ vld1.16 {d19[]}, [\sr2], \s_strd // row 3
+ vld1.16 {d20[]}, [\src], \s_strd // row 4
+ sub \h, \h, #4
+ vext.8 d16, d16, d17, #6 // d16 = {row 0, row 1, ...}
+ vext.8 d17, d17, d18, #6 // d17 = {row 1, row 2, ...}
+ vext.8 d18, d18, d19, #6 // d18 = {row 2, row 3, ...}
+ vext.8 d19, d19, d20, #6 // d19 = {row 3, row 4, ...}
+ vtrn.32 d16, d18 // d16 = {row 0, row 1, row 2, row 3}
+ vtrn.32 d17, d19 // d17 = {row 1, row 2, row 3, row 4}
+ vmull.u8 q2, d16, d2 // upper rows * (16 - my)
+ vmlal.u8 q2, d17, d3 // + lower rows * my
+ cmp \h, #2 // remaining rows: < 2 done, == 2 finish via 22:, > 2 loop again
+ vqrshrn.u16 d4, q2, #4
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ vst1.16 {d4[2]}, [\dst, :16], \d_strd
+ vst1.16 {d4[3]}, [\ds2, :16], \d_strd
+ blt 0f
+ vmov d16, d20 // the last loaded row becomes row 0 of the next pass
+ beq 22b
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+ vld1.32 {d16[]}, [\src], \s_strd // row 0 (4 pixels, replicated)
+4:
+ vld1.32 {d17[]}, [\sr2], \s_strd // row 1
+ vld1.32 {d18[]}, [\src], \s_strd // row 2
+ vext.8 d16, d16, d17, #4 // d16 = {row 0, row 1}
+ vext.8 d17, d17, d18, #4 // d17 = {row 1, row 2}
+ vmull.u8 q2, d16, d2 // upper rows * (16 - my)
+ vmlal.u8 q2, d17, d3 // + lower rows * my
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4 // round, shift by 4 and narrow back to 8 bit
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd // prep: store the 16 bit sums without shifting
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18 // the last loaded row becomes row 0 of the next pass
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+ vld1.8 {d16}, [\src], \s_strd // row 0
+8:
+ vld1.8 {d17}, [\sr2], \s_strd // row 1
+ vld1.8 {d18}, [\src], \s_strd // row 2
+ vmull.u8 q2, d16, d2 // output row 0: row 0 * (16 - my)
+ vmull.u8 q3, d17, d2 // output row 1: row 1 * (16 - my)
+ vmlal.u8 q2, d17, d3 // + row 1 * my
+ vmlal.u8 q3, d18, d3 // + row 2 * my
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4 // round, shift by 4 and narrow back to 8 bit
+ vqrshrn.u16 d6, q3, #4
+ vst1.8 {d4}, [\dst, :64], \d_strd
+ vst1.8 {d6}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q2}, [\dst, :128], \d_strd // prep: store the 16 bit sums without shifting
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18 // the last loaded row becomes row 0 of the next pass
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h // my is free now (weights live in d2/d3); keep the height for the next column strip
+1:
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ add \sr2, \src, \s_strd // sr2 = second input row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {q8}, [\src], \s_strd // row 0, 16 pixels
+2:
+ vld1.8 {q9}, [\sr2], \s_strd // row 1
+ vld1.8 {q10}, [\src], \s_strd // row 2
+ vmull.u8 q12, d16, d2 // output row 0: row 0 * (16 - my)
+ vmull.u8 q13, d17, d2
+ vmull.u8 q14, d18, d2 // output row 1: row 1 * (16 - my)
+ vmull.u8 q15, d19, d2
+ vmlal.u8 q12, d18, d3 // + row 1 * my
+ vmlal.u8 q13, d19, d3
+ vmlal.u8 q14, d20, d3 // + row 2 * my
+ vmlal.u8 q15, d21, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d24, q12, #4 // round, shift by 4 and narrow back to 8 bit
+ vqrshrn.u16 d25, q13, #4
+ vqrshrn.u16 d28, q14, #4
+ vqrshrn.u16 d29, q15, #4
+ vst1.8 {q12}, [\dst, :128], \d_strd
+ vst1.8 {q14}, [\ds2, :128], \d_strd
+.else
+ vst1.16 {q12, q13}, [\dst, :128], \d_strd // prep: store the 16 bit sums without shifting
+ vst1.16 {q14, q15}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q8, q10 // the last loaded row becomes row 0 of the next pass
+ b 2b
+9:
+ subs \w, \w, #16 // one 16 pixel wide strip done
+ ble 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src // rewind src by h rows ...
+ mls \dst, \d_strd, \my, \dst // rewind dst by h rows
+ sub \src, \src, \s_strd, lsl #1 // ... plus the 2 extra rows src advanced (h + 2 in total)
+ mov \h, \my // restore the height
+ add \src, \src, #16 // move on to the next strip
+.ifc \type, put
+ add \dst, \dst, #16 // 16 output bytes per strip row
+.else
+ add \dst, \dst, #32 // prep: 16 outputs of 2 bytes each
+.endif
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv): // 2D bilinear filter: horizontal pass to 16 bit, then vertical pass on the 16 bit rows
+ vmovl.u8 q2, d2 // q2 = 16 - my widened to 16 bit
+ vmovl.u8 q3, d3 // q3 = my widened to 16 bit
+ adr r9, L(\type\()_bilin_hv_tbl)
+ ldr r8, [r9, r8, lsl #2] // table entries are offsets relative to the table itself
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd // sr2 = second input row
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ vld1.32 {d28[]}, [\src], \s_strd // row 0 (4 pixels, replicated)
+ vext.8 d29, d28, d28, #1 // row 0 shifted by one pixel
+ vmull.u8 q8, d28, d0 // horizontal pass of row 0: px * (16 - mx)
+ vmlal.u8 q8, d29, d1 // + next px * mx; low half of d16 holds the 2 needed results
+
+2:
+ vld1.32 {d28[]}, [\sr2], \s_strd // row 1
+ vld1.32 {d30[]}, [\src], \s_strd // row 2
+ vext.8 d29, d28, d28, #1 // row 1 shifted by one pixel
+ vext.8 d31, d30, d30, #1 // row 2 shifted by one pixel
+ vtrn.16 d28, d30 // d28 = {row 1 px 0-1, row 2 px 0-1, ...}
+ vtrn.16 d29, d31 // d29 = the same shifted by one pixel
+ vmull.u8 q9, d28, d0 // horizontal pass of rows 1 and 2
+ vmlal.u8 q9, d29, d1 // d18 and d19 both = {row 1, row 2}
+
+ vtrn.32 d16, d18 // d16 = {row 0, row 1}
+
+ vmul.u16 d20, d16, d4 // upper rows * (16 - my)
+ vmla.u16 d20, d19, d6 // + lower rows * my
+ vqrshrn.u16 d20, q10, #8 // round, shift by 8 (4 per pass) and narrow back to 8 bit
+ subs \h, \h, #2
+ vst1.16 {d20[0]}, [\dst, :16], \d_strd
+ vst1.16 {d20[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vtrn.32 d19, d16 // move row 2 into the low half of d16 as row 0 of the next pass
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd // sr2 = second input row
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {d28}, [\src], \s_strd // row 0
+ vext.8 d29, d28, d28, #1 // row 0 shifted by one pixel
+ vmull.u8 q8, d28, d0 // horizontal pass of row 0
+ vmlal.u8 q8, d29, d1 // d16 holds the 4 needed results
+
+4:
+ vld1.8 {d28}, [\sr2], \s_strd // row 1
+ vld1.8 {d30}, [\src], \s_strd // row 2
+ vext.8 d29, d28, d28, #1 // row 1 shifted by one pixel
+ vext.8 d31, d30, d30, #1 // row 2 shifted by one pixel
+ vtrn.32 d28, d30 // d28 = {row 1 px 0-3, row 2 px 0-3}
+ vtrn.32 d29, d31 // d29 = {row 1 px 1-4, row 2 px 1-4}
+ vmull.u8 q9, d28, d0 // horizontal pass of rows 1 and 2
+ vmlal.u8 q9, d29, d1 // d18 = row 1, d19 = row 2
+
+ vmov d17, d18 // q8 = {row 0, row 1}
+
+ vmul.u16 q10, q8, q2 // upper rows * (16 - my)
+ vmla.u16 q10, q9, q3 // + lower rows * my
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d20, q10, #8 // round, shift by 8 and narrow back to 8 bit
+ vst1.32 {d20[0]}, [\dst, :32], \d_strd
+ vst1.32 {d20[1]}, [\ds2, :32], \d_strd
+.else
+ vrshr.u16 q10, q10, #4 // prep: rounding shift by 4 only, output stays 16 bit
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d19 // row 2 becomes row 0 of the next pass
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h // my is free now (weights live in q2/q3); keep the height for the next column strip
+
+1:
+ add \sr2, \src, \s_strd // sr2 = second input row
+ add \ds2, \dst, \d_strd // ds2 = second output row
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {q12}, [\src], \s_strd // row 0, 16 pixels loaded (9 are needed)
+ vext.8 q13, q12, q12, #1 // row 0 shifted by one pixel
+ vmull.u8 q8, d24, d0 // horizontal pass of row 0
+ vmlal.u8 q8, d26, d1
+
+2:
+ vld1.8 {q12}, [\sr2], \s_strd // row 1
+ vld1.8 {q14}, [\src], \s_strd // row 2
+ vext.8 q13, q12, q12, #1 // row 1 shifted by one pixel
+ vext.8 q15, q14, q14, #1 // row 2 shifted by one pixel
+ vmull.u8 q9, d24, d0 // horizontal pass of row 1
+ vmlal.u8 q9, d26, d1
+ vmull.u8 q10, d28, d0 // horizontal pass of row 2
+ vmlal.u8 q10, d30, d1
+
+ vmul.u16 q8, q8, q2 // output row 0: row 0 * (16 - my)
+ vmla.u16 q8, q9, q3 // + row 1 * my
+ vmul.u16 q9, q9, q2 // output row 1: row 1 * (16 - my)
+ vmla.u16 q9, q10, q3 // + row 2 * my
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d16, q8, #8 // round, shift by 8 and narrow back to 8 bit
+ vqrshrn.u16 d18, q9, #8
+ vst1.8 {d16}, [\dst, :64], \d_strd
+ vst1.8 {d18}, [\ds2, :64], \d_strd
+.else
+ vrshr.u16 q8, q8, #4 // prep: rounding shift by 4 only, output stays 16 bit
+ vrshr.u16 q9, q9, #4
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q8, q10 // row 2 becomes row 0 of the next pass
+ b 2b
+9:
+ subs \w, \w, #8 // one 8 pixel wide strip done
+ ble 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src // rewind src by h rows ...
+ mls \dst, \d_strd, \my, \dst // rewind dst by h rows
+ sub \src, \src, \s_strd, lsl #1 // ... plus the 2 extra rows src advanced (h + 2 in total)
+ mov \h, \my // restore the height
+ add \src, \src, #8 // move on to the next strip
+.ifc \type, put
+ add \dst, \dst, #8 // 8 output bytes per strip row
+.else
+ add \dst, \dst, #16 // prep: 8 outputs of 2 bytes each
+.endif
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10 // instantiate the put functions; the macro header is not in view here, confirm the register-to-parameter order there
+filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6 // instantiate the prep functions; different register assignment and a final constant of 6 instead of 10
+
+.macro load_filter_ptr src
+ asr r12, \src, #10 // filter index = position >> 10 (arithmetic shift, the index may be negative)
+ add r12, r11, r12, lsl #3 // r12 = r11 + index * 8; each filter is 8 bytes, r11 is the table base set up by the caller
+.endm
+
+.macro load_filter_coef dst, src, inc
+ add \src, \src, \inc // advance the position for the next filter
+ vld1.8 {\dst}, [r12, :64] // load the 8 coefficients at the pointer computed by load_filter_ptr
+.endm
+
+.macro load_filter_row dst, src, inc
+ load_filter_ptr \src // r12 = address of the filter for the current position
+ load_filter_coef \dst, \src, \inc // load it and step the position by inc
+.endm
+
+function warp_filter_horz_neon
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q7}, [r2], r3 // 16 source bytes of this row; r2 is post-incremented by r3
+ vmov.i8 q6, #128 // constant for the sign flip below
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d1, r5, r7 // filter 1
+ load_filter_row d2, r5, r7 // filter 2
+ load_filter_ptr r5 // filter 3
+ veor q7, q7, q6 // subtract by 128 to allow using vmull
+ load_filter_coef d3, r5, r7 // filter 3
+ vext.8 d12, d14, d15, #1 // filter 1 pixels
+ vext.8 d13, d14, d15, #2 // filter 2 pixels
+ load_filter_ptr r5 // filter 4
+ vmull.s8 q2, d14, d0 // filter 0 output
+ vmull.s8 q3, d12, d1 // filter 1 output
+ load_filter_coef d0, r5, r7 // filter 4
+ load_filter_ptr r5 // filter 5
+ vext.8 d12, d14, d15, #3 // filter 3 pixels
+ vmull.s8 q4, d13, d2 // filter 2 output
+ vext.8 d13, d14, d15, #4 // filter 4 pixels
+ vpadd.i16 d4, d4, d5 // pixel 0 (4x16)
+ vpadd.i16 d5, d6, d7 // pixel 1 (4x16)
+ load_filter_coef d1, r5, r7 // filter 5
+ load_filter_ptr r5 // filter 6
+ vmull.s8 q5, d12, d3 // filter 3 output
+ vext.8 d12, d14, d15, #5 // filter 5 pixels
+ vmull.s8 q3, d13, d0 // filter 4 output
+ load_filter_coef d0, r5, r7 // filter 6
+ vext.8 d13, d14, d15, #6 // filter 6 pixels
+ load_filter_ptr r5 // filter 7
+ vpadd.i16 d8, d8, d9 // pixel 2 (4x16)
+ vpadd.i16 d9, d10, d11 // pixel 3 (4x16)
+ vmull.s8 q5, d12, d1 // filter 5 output
+ load_filter_coef d1, r5, r7 // filter 7
+ vext.8 d14, d14, d15, #7 // filter 7 pixels
+ vpadd.i16 d6, d6, d7 // pixel 4 (4x16)
+ vpadd.i16 d10, d10, d11 // pixel 5 (4x16)
+ vmull.s8 q6, d13, d0 // filter 6 output
+ vmull.s8 q7, d14, d1 // filter 7 output
+
+ sub r5, r5, r7, lsl #3 // undo the 8 per-pixel increments of the position
+
+ vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16)
+ vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16)
+ vpadd.i16 d12, d12, d13 // pixel 6 (4x16)
+ vpadd.i16 d14, d14, d15 // pixel 7 (4x16)
+ vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16)
+ vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16)
+ vpadd.i16 d4, d4, d5 // pixel 0-3
+ vpadd.i16 d5, d6, d10 // pixel 4-7
+
+ add r5, r5, r8 // step the position to the next row; the 8 sums are returned in q2
+
+ bx lr
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+ push {r4-r11,lr} // 9 core registers = 36 bytes
+ vpush {q4-q7} // + 64 bytes, so the stack arguments start at sp + 100
+ ldrd r4, r5, [sp, #100] // r4 = abcd, r5 = mx
+ ldr r6, [sp, #108] // r6 = my
+ ldrd r8, r9, [r4] // the four int16 values of abcd as two 32 bit words
+ sxth r7, r8 // r7 = abcd[0], horizontal position step per pixel
+ asr r8, r8, #16 // r8 = abcd[1], horizontal position step per row
+ asr r4, r9, #16 // r4 = abcd[3], vertical position step per row
+ sxth r9, r9 // r9 = abcd[2], vertical position step per column
+ mov r10, #8 // 8 output rows
+ sub r2, r2, r3, lsl #1 // src -= 3 * src_stride ...
+ sub r2, r2, r3
+ sub r2, r2, #3 // ... and 3 pixels to the left
+ movrel r11, X(mc_warp_filter), 64*8 // r11 = filter table base used by load_filter_ptr, offset by 64 filters of 8 bytes
+.ifnb \t
+ lsl r1, r1, #1 // t variant: dst stride doubled (presumably given in 16 bit units)
+.endif
+ add r5, r5, #512 // + half of 1 << 10, so the asr #10 in load_filter_ptr rounds to nearest
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon // prime the window: horizontally filtered rows 0-6 go to q8-q14
+ vrshr.s16 q8, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q9, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q10, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q11, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q12, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q13, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q14, q2, #3
+
+1:
+ bl warp_filter_horz_neon // eighth row of the window
+ vrshr.s16 q15, q2, #3
+
+ load_filter_row d8, r6, r9 // vertical filter for column 0
+ load_filter_row d9, r6, r9 // column 1
+ load_filter_row d10, r6, r9 // column 2
+ load_filter_row d11, r6, r9 // column 3
+ load_filter_row d12, r6, r9 // column 4
+ load_filter_row d13, r6, r9 // column 5
+ load_filter_row d14, r6, r9 // column 6
+ load_filter_row d15, r6, r9 // column 7
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 // defined elsewhere; presumably leaves tap k of all 8 columns in d(8+k)
+ vmovl.s8 q1, d8 // widen taps 0-5 to 16 bit; each source register is read before it is overwritten
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3 // undo the 8 per-column increments of the position
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2 // q0 accumulates columns 0-3
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3 // q1 accumulates columns 4-7
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14 // widen taps 6 and 7 now that q2/q3 are free
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ vmov.i16 q7, #128 // presumably restores the 128 offset removed in warp_filter_horz_neon
+.else
+ vmov.i16 q7, #0x800 // t variant: 128 << 4
+.endif
+
+ vmov q8, q9 // slide the 8 row window up by one row, interleaved with the output stage
+ vmov q9, q10
+ vqrshrn.s32 d0, q0, #\shift // round, shift and narrow columns 0-3 to 16 bit
+ vmov q10, q11
+ vqrshrn.s32 d1, q1, #\shift // same for columns 4-7
+ vmov q11, q12
+ vadd.i16 q0, q0, q7 // add the offset selected above
+ vmov q12, q13
+.ifb \t
+ vqmovun.s16 d0, q0 // plain variant: saturate to unsigned 8 bit
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1 // one output row done
+.ifnb \t
+ vst1.16 {q0}, [r0, :128], r1 // t variant: 8 outputs of 16 bit
+.else
+ vst1.8 {d0}, [r0, :64], r1 // plain variant: 8 output bytes
+.endif
+
+ add r6, r6, r4 // step the vertical position to the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+warp , 11 // plain variant (empty t): 8 bit pixel output, final shift 11
+warp t, 7 // t variant: 16 bit output, final shift 7
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+ push {r4-r11,lr} // 9 registers saved, so the stack arguments start at sp + 36; r0-r3 = bw, bh, iw, ih
+ ldrd r4, r5, [sp, #36] // r4 = x, r5 = y
+ ldrd r6, r7, [sp, #44] // r6 = dst, r7 = dst_stride
+ ldrd r8, r9, [sp, #52] // r8 = ref, r9 = ref_stride
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub r12, r3, #1 // ih - 1
+ cmp r5, r3
+ sub lr, r2, #1 // iw - 1
+ it lt
+ movlt r12, r5 // min(y, ih - 1)
+ cmp r4, r2
+ bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+ it lt
+ movlt lr, r4 // min(x, iw - 1)
+ bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
+ mla r8, r12, r9, r8 // ref += iclip() * stride
+ add r8, r8, lr // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add r10, r5, r1 // y + bh
+ neg r5, r5 // -y
+ sub r10, r10, r3 // y + bh - ih
+ sub r12, r1, #1 // bh - 1
+ cmp r10, r1
+ bic r5, r5, r5, asr #31 // max(-y, 0)
+ it ge
+ movge r10, r12 // min(y + bh - ih, bh-1)
+ cmp r5, r1
+ bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+ it ge
+ movge r5, r12 // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add r11, r4, r0 // x + bw
+ neg r4, r4 // -x
+ sub r11, r11, r2 // x + bw - iw
+ sub lr, r0, #1 // bw - 1
+ cmp r11, r0
+ bic r4, r4, r4, asr #31 // max(-x, 0)
+ it ge
+ movge r11, lr // min(x + bw - iw, bw-1)
+ cmp r4, r0
+ bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+ it ge
+ movge r4, lr // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub r1, r1, r5 // bh - top_ext
+ mla r6, r5, r7, r6 // dst += top_ext * dst_stride
+ sub r2, r0, r4 // bw - left_ext
+ sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext
+ sub r2, r2, r11 // center_w = bw - left_ext - right_ext
+
+ mov r0, r6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ vld1.8 {d0[], d1[]}, [r8] // replicate the first source pixel of the row across q0
+ mov r12, r6 // out = dst
+ mov r3, r4 // remaining = left_ext
+1:
+ subs r3, r3, #16
+ vst1.8 {q0}, [r12, :128]! // 16 bytes per store, so this may run past left_ext; the center copy below rewrites from dst + left_ext
+ bgt 1b
+.endif
+ mov lr, r8 // in = ref
+ add r12, r6, r4 // out = dst + left_ext
+ mov r3, r2 // remaining = center_w
+1:
+ vld1.8 {q0, q1}, [lr]! // copy 32 bytes per iteration
+ subs r3, r3, #32
+.if \need_left
+ vst1.8 {q0, q1}, [r12]! // no alignment hint: dst + left_ext may be unaligned
+.else
+ vst1.8 {q0, q1}, [r12, :128]! // left_ext is 0 on this path, so the store goes to dst itself
+.endif
+ bgt 1b
+.if \need_right
+ add r3, r8, r2 // in + center_w
+ sub r3, r3, #1 // in + center_w - 1
+ add r12, r6, r4 // dst + left_ext
+ vld1.8 {d0[], d1[]}, [r3] // replicate the last center pixel of the row across q0
+ add r12, r12, r2 // out = dst + left_ext + center_w
+ mov r3, r11 // remaining = right_ext
+1:
+ subs r3, r3, #16
+ vst1.8 {q0}, [r12]! // 16 bytes per store
+ bgt 1b
+.endif
+
+ subs r1, r1, #1 // center_h--
+ add r6, r6, r7 // next dst row
+ add r8, r8, r9 // next ref row
+ bgt 0b
+.endm
+
+ cmp r4, #0 // left_ext == 0?
+ beq 2f
+ // need_left
+ cmp r11, #0 // right_ext == 0?
+ beq 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cmp r11, #0 // right_ext == 0?
+ beq 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+ cmp r10, #0 // bottom_ext == 0? the flags are consumed by the beq below
+ // Storing the original dst in r0 overwrote bw, recalculate it here
+ add r2, r2, r4 // center_w + left_ext
+ add r2, r2, r11 // bw = center_w + left_ext + right_ext
+
+ beq 3f
+ // need_bottom
+ sub r8, r6, r7 // ref = dst - stride
+ mov r4, r2 // column counter = bw
+1:
+ vld1.8 {q0, q1}, [r8, :128]! // 32 bytes of the last row written by v_loop
+ mov r3, r10 // row counter = bottom_ext
+2:
+ subs r3, r3, #1
+ vst1.8 {q0, q1}, [r6, :128], r7 // replicate them downwards
+ bgt 2b
+ mls r6, r7, r10, r6 // dst -= bottom_ext * stride
+ subs r4, r4, #32 // bw -= 32
+ add r6, r6, #32 // dst += 32
+ bgt 1b
+
+3:
+ cmp r5, #0 // top_ext == 0?
+ beq 3f
+ // need_top
+ mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
+1:
+ vld1.8 {q0, q1}, [r0, :128]! // 32 bytes of the first center row (r0 = saved dst)
+ mov r3, r5 // row counter = top_ext
+2:
+ subs r3, r3, #1
+ vst1.8 {q0, q1}, [r6, :128], r7 // replicate them into the rows above
+ bgt 2b
+ mls r6, r7, r5, r6 // dst -= top_ext * stride
+ subs r2, r2, #32 // bw -= 32
+ add r6, r6, #32 // dst += 32
+ bgt 1b
+
+3:
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/mc16.S b/third_party/dav1d/src/arm/32/mc16.S
new file mode 100644
index 0000000000..b7d845e219
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc16.S
@@ -0,0 +1,3658 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d00, d01, d1, d10, d11 // Averages 16 int16 values from [r2] and [r3] into \d0 and \d1; the four d-register arguments are not used by this variant
+ vld1.16 {q0, q1}, [r2, :128]! // first input, pointer post-incremented
+ vld1.16 {q2, q3}, [r3, :128]! // second input, pointer post-incremented
+ vqadd.s16 q0, q0, q2 // saturating sum of both inputs
+ vqadd.s16 q1, q1, q3
+ vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vshl.s16 \d0, q0, q13 // -(intermediate_bits+1)
+ vshl.s16 \d1, q1, q13 // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d00, d01, d1, d10, d11 // Blends 16 int16 values from [r2] (in1) and [r3] (in2) as in2 + (((in2 - in1) * q4) >> 4), then rounds, biases and clamps into \d0 and \d1
+ vld1.16 {q0, q1}, [r2, :128]! // in1, pointer post-incremented
+ vld1.16 {q2, q3}, [r3, :128]! // in2, pointer post-incremented
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ vsubl.s16 \d0, d4, d0 // in2 - in1, widened to 32 bit
+ vsubl.s16 q0, d5, d1 // q0/q1 are reused as 32 bit scratch from here on
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4 // scale the difference by the 32 bit factor in q4
+ vmul.s32 q0, q0, q4
+ vmul.s32 \d1, \d1, q4
+ vmul.s32 q1, q1, q4
+ vshr.s32 \d0, \d0, #4 // arithmetic shift right by 4
+ vshr.s32 q0, q0, #4
+ vshr.s32 \d1, \d1, #4
+ vshr.s32 q1, q1, #4
+ vaddw.s16 \d0, \d0, d4 // add in2 back
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0 // narrow to 16 bit; \d00/\d01 and \d10/\d11 must be the halves of \d0 and \d1
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro mask d0, d00, d01, d1, d10, d11 // Blends 16 int16 values from [r2] (in1) and [r3] (in2) with 16 per-pixel mask bytes m from [r6] as in2 + (((in2 - in1) * -m) >> 6); clobbers q4-q7
+ vld1.8 {q7}, [r6, :128]! // 16 mask bytes, pointer post-incremented
+ vld1.16 {q0, q1}, [r2, :128]! // in1, pointer post-incremented
+ vneg.s8 q7, q7 // -m
+ vld1.16 {q2, q3}, [r3, :128]! // in2, pointer post-incremented
+ vmovl.s8 q6, d14 // widen -m to 16 bit
+ vmovl.s8 q7, d15
+ vmovl.s16 q4, d12 // widen -m to 32 bit; q4-q7 end up holding one factor per pixel
+ vmovl.s16 q5, d13
+ vmovl.s16 q6, d14
+ vmovl.s16 q7, d15
+ vsubl.s16 \d0, d4, d0 // in2 - in1, widened to 32 bit
+ vsubl.s16 q0, d5, d1 // q0/q1 are reused as 32 bit scratch from here on
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4 // (in2 - in1) * -m
+ vmul.s32 q0, q0, q5
+ vmul.s32 \d1, \d1, q6
+ vmul.s32 q1, q1, q7
+ vshr.s32 \d0, \d0, #6 // arithmetic shift right by 6
+ vshr.s32 q0, q0, #6
+ vshr.s32 \d1, \d1, #6
+ vshr.s32 q1, q1, #6
+ vaddw.s16 \d0, \d0, d4 // add in2 back
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0 // narrow to 16 bit; \d00/\d01 and \d10/\d11 must be the halves of \d0 and \d1
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro bidir_fn type, bdmax // Emits the exported <type>_16bpc_neon function around the avg/w_avg/mask macro; bdmax names the register that holds bitdepth_max
+function \type\()_16bpc_neon, export=1
+ push {r4-r7,lr} // 20 bytes pushed, so the stack arguments start at sp + 20
+ ldrd r4, r5, [sp, #20] // first two stack arguments: r4 selects the width path below, r5 is the row counter
+ ldr r6, [sp, #28] // third stack argument (bitdepth_max for avg, multiplier for w_avg, mask pointer for mask)
+ clz r4, r4
+.ifnc \type, avg
+ ldr r7, [sp, #32] // fourth stack argument
+ vmov.i16 q14, #0 // lower clamp bound used by w_avg/mask
+ vdup.16 q15, r7 // bitdepth_max
+.endif
+.ifc \type, w_avg
+ vpush {q4} // q4 holds the multiplier for w_avg
+.endif
+.ifc \type, mask
+ vpush {q4-q7} // q4-q7 are clobbered by the mask macro
+.endif
+ clz r7, \bdmax
+ sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov lr, #1
+ movw r12, #2*PREP_BIAS
+ lsl lr, lr, r7 // 1 << intermediate_bits
+ neg r12, r12 // -2*PREP_BIAS
+ add r7, r7, #1
+ sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits
+ neg r7, r7 // -(intermediate_bits+1)
+ vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vdup.16 q13, r7 // -(intermediate_bits+1)
+.else
+ mov r12, #PREP_BIAS
+ lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits
+ neg r7, r7 // -intermediate_bits
+ vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits
+ vdup.16 q13, r7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ vdup.32 q4, r6 // splat the third stack argument to 32 bit lanes
+ vneg.s32 q4, q4 // the w_avg macro multiplies by the negated value
+.endif
+ adr r7, L(\type\()_tbl)
+ sub r4, r4, #24 // table index = clz(r4) - 24
+ \type q8, d16, d17, q9, d18, d19 // compute the first 16 output pixels before dispatching
+ ldr r4, [r7, r4, lsl #2] // handler offset relative to the table
+ add r7, r7, r4
+ bx r7
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_tbl) + CONFIG_THUMB
+
+40: // 4 pixels per row: the 16 computed pixels cover four rows
+ add r7, r0, r1 // r7 = second destination row
+ lsl r1, r1, #1 // each pointer steps two rows per store
+4:
+ subs r5, r5, #4
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r7, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r7, :64], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 4b
+80: // 8 pixels per row: the 16 computed pixels cover two rows
+ add r7, r0, r1 // r7 = second destination row
+ lsl r1, r1, #1
+8:
+ vst1.16 {q8}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q9}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 8b
+160: // 16 pixels per row: two rows per iteration, both stored through r0
+16:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q10, q11}, [r0, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 16b
+320: // 32 pixels per row: one row per iteration
+ add r7, r0, #32 // r7 = second half of the row (32 bytes in)
+32:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 32b
+640: // 64 pixels per row: one row per iteration, written as four 32 byte stores
+ add r7, r0, #32
+ mov r12, #64 // step within a row for each of the two pointers
+ sub r1, r1, #64 // the last store of a row advances by stride minus the 64 bytes already stepped
+64:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 64b
+1280: // 128 pixels per row: one row per iteration, written as eight 32 byte stores
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #192 // stride minus the three 64 byte steps already taken within the row
+128:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 128b
+0: // common exit: restore whatever was pushed on entry
+.ifc \type, mask
+ vpop {q4-q7}
+.endif
+.ifc \type, w_avg
+ vpop {q4}
+.endif
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+bidir_fn avg, r6 // avg: bitdepth_max is the third stack argument (r6)
+bidir_fn w_avg, r7 // w_avg: r6 carries the multiplier, bitdepth_max is the fourth stack argument (r7)
+bidir_fn mask, r7 // mask: r6 carries the mask pointer, bitdepth_max is the fourth stack argument (r7)
+
+
+.macro w_mask_fn type // Emits w_mask_<type>_16bpc_neon (type is 444, 422 or 420): blends tmp1 [r2] and tmp2 [r3] into dst [r0] with a mask derived from abs(tmp1 - tmp2), and writes that mask through r6
+function w_mask_\type\()_16bpc_neon, export=1
+ push {r4-r10,lr}
+ vpush {q4-q7} // 32 + 64 = 96 bytes pushed, so the stack arguments start at sp + 96
+ ldrd r4, r5, [sp, #96] // r4 selects the width path below, r5 is the row counter
+ ldrd r6, r7, [sp, #104] // r6 = mask output pointer, r7 = the "sign" term of the comments below
+ ldr r8, [sp, #112]
+ clz r9, r4
+ adr lr, L(w_mask_\type\()_tbl)
+ vdup.16 q15, r8 // bitdepth_max
+ sub r9, r9, #24 // table index = clz(r4) - 24
+ clz r8, r8 // clz(bitdepth_max)
+ ldr r9, [lr, r9, lsl #2]
+ add r9, lr, r9 // r9 = address of the selected handler
+ sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+ mov r10, #PREP_BIAS*64 // splatted into q13 inside each loop, since q13 is reused as scratch there
+ neg r8, r8 // -sh
+ movw r12, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
+ vdup.32 q14, r8 // -sh
+ vdup.16 q0, r12
+.if \type == 444
+ vmov.i8 q1, #64
+.elseif \type == 422
+ vdup.8 d4, r7
+ vmov.i8 d2, #129
+ vsub.i16 d2, d2, d4
+.elseif \type == 420
+ vdup.16 q2, r7
+ vmov.i16 q1, #0x100
+ vsub.i16 q1, q1, q2
+.endif
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1 // each destination pointer steps two rows
+ bx r9
+
+ .align 2
+L(w_mask_\type\()_tbl):
+ .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4: // 4 pixels per row: 16 pixels (four rows) per iteration
+ vld1.16 {q2, q3}, [r2, :128]! // tmp1 (four rows at once)
+ vld1.16 {q4, q5}, [r3, :128]! // tmp2 (four rows at once)
+ subs r5, r5, #4
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12 // widen 64-m to 32 bit for the multiply-accumulate
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {q6}, [r6, :128]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+ vst1.8 {d12}, [r6, :64]!
+.elseif \type == 420
+ vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.i16 d13, d14, d15
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {d4}, [r0, :64], r1 // rows 0 and 2 go through r0, rows 1 and 3 through r12
+ vst1.16 {d5}, [r12, :64], r1
+ vst1.16 {d6}, [r0, :64], r1
+ vst1.16 {d7}, [r12, :64], r1
+ bgt 4b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+8: // 8 pixels per row: two rows per iteration
+ vld1.16 {q2, q3}, [r2, :128]! // tmp1
+ vld1.16 {q4, q5}, [r3, :128]! // tmp2
+ subs r5, r5, #2
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12 // widen 64-m to 32 bit for the multiply-accumulate
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {q6}, [r6, :128]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+ vst1.8 {d12}, [r6, :64]!
+.elseif \type == 420
+ vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 8b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+1280: // widths 16 and up share one path: two rows at a time, 8 pixels of each row per inner iteration
+640:
+320:
+160:
+ sub r1, r1, r4, lsl #1 // doubled stride minus the 2*r4 bytes the inner loop advances per row
+.if \type == 444
+ add lr, r6, r4 // lr = mask pointer for the second row
+.elseif \type == 422
+ add lr, r6, r4, lsr #1 // lr = mask pointer for the second row (half as many bytes per row)
+.endif
+ add r7, r2, r4, lsl #1 // r7 = second row of tmp1 (the sign from r7 is already folded into q1/d2)
+ add r9, r3, r4, lsl #1 // r9 = second row of tmp2
+161:
+ mov r8, r4 // inner loop counter, decremented by 8 per iteration
+16:
+ vld1.16 {q2}, [r2, :128]! // tmp1
+ vld1.16 {q4}, [r3, :128]! // tmp2
+ vld1.16 {q3}, [r7, :128]! // tmp1, second row
+ vld1.16 {q5}, [r9, :128]! // tmp2, second row
+ subs r8, r8, #8
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12 // widen 64-m to 32 bit for the multiply-accumulate
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {d12}, [r6, :64]!
+ vst1.8 {d13}, [lr, :64]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+ vst1.32 {d12[0]}, [r6, :32]!
+ vst1.32 {d12[1]}, [lr, :32]!
+.elseif \type == 420
+ vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ bgt 16b
+ subs r5, r5, #2
+ add r2, r2, r4, lsl #1 // skip the row that was consumed through the second-row pointer
+ add r3, r3, r4, lsl #1
+ add r7, r7, r4, lsl #1
+ add r9, r9, r4, lsl #1
+.if \type == 444
+ add r6, r6, r4
+ add lr, lr, r4
+.elseif \type == 422
+ add r6, r6, r4, lsr #1
+ add lr, lr, r4, lsr #1
+.endif
+ add r0, r0, r1 // move both destination pointers on to the next pair of rows
+ add r12, r12, r1
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+endfunc
+.endm
+
+w_mask_fn 444 // one mask byte stored per pixel
+w_mask_fn 422 // mask values of horizontally adjacent pixels are combined
+w_mask_fn 420 // mask values of each 2x2 group of pixels are combined
+
+function blend_16bpc_neon, export=1 // Blends tmp [r2] into dst [r0, stride r1] in place: dst += ((dst - tmp) * -m + 32) >> 6, with one mask byte m per pixel read through r5
+ push {r4-r5,lr} // 12 bytes pushed, so the stack arguments start at sp + 12
+ ldrd r4, r5, [sp, #12] // r4 = row counter, r5 = mask pointer
+ clz lr, r3 // r3 selects the width path
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26 // table index = clz(r3) - 26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
+
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40: // 4 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+4:
+ vld1.8 {d4}, [r5, :64]! // 8 mask bytes (two rows)
+ vld1.16 {q1}, [r2, :128]! // 8 tmp pixels (two rows)
+ vld1.16 {d0}, [r0, :64]
+ vneg.s8 d4, d4 // -m
+ subs r4, r4, #2
+ vld1.16 {d1}, [r12, :64]
+ vmovl.s8 q2, d4
+ vshl.i16 q2, q2, #9 // -m << 9
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80: // 8 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+8:
+ vld1.8 {q8}, [r5, :128]! // 16 mask bytes (two rows)
+ vld1.16 {q2, q3}, [r2, :128]! // 16 tmp pixels (two rows)
+ vneg.s8 q9, q8 // -m
+ vld1.16 {q0}, [r0, :128]
+ vld1.16 {q1}, [r12, :128]
+ vmovl.s8 q8, d18
+ vmovl.s8 q9, d19
+ vshl.i16 q8, q8, #9 // -m << 9
+ vshl.i16 q9, q9, #9
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ subs r4, r4, #2
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q9
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160: // 16 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+16:
+ vld1.8 {q12, q13}, [r5, :128]! // 32 mask bytes (two rows)
+ vld1.16 {q8, q9}, [r2, :128]! // tmp, first row
+ subs r4, r4, #2
+ vneg.s8 q14, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vneg.s8 q15, q13
+ vld1.16 {q10, q11}, [r2, :128]! // tmp, second row
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ vmovl.s8 q15, d31
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+ vshl.i16 q15, q15, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vqrdmulh.s16 q11, q11, q15
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320: // 32 pixels per row, one row per iteration
+ add r12, r0, #32 // r12 = second half of the row (32 bytes in)
+32:
+ vld1.8 {q12, q13}, [r5, :128]! // 32 mask bytes (one row)
+ vld1.16 {q8, q9}, [r2, :128]! // tmp, first half of the row
+ subs r4, r4, #1
+ vneg.s8 q14, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vneg.s8 q15, q13
+ vld1.16 {q10, q11}, [r2, :128]! // tmp, second half of the row
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ vmovl.s8 q15, d31
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+ vshl.i16 q15, q15, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vqrdmulh.s16 q11, q11, q15
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_h_16bpc_neon, export=1 // Blends tmp [r2] into dst [r0, stride r1] in place using one obmc_masks byte per row, so every pixel of a row shares the same mask value
+ push {r4-r5,lr} // 12 bytes pushed, so the stack argument is at sp + 12
+ ldr r4, [sp, #12] // r4 = stack argument, used both as table offset and as row count
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4 // r5 = &obmc_masks[r4]
+ sub r4, r4, r4, lsr #2 // only r4 - (r4 >> 2) rows are processed
+ clz lr, r3 // r3 selects the width path
+ adr r12, L(blend_h_tbl)
+ sub lr, lr, #24 // table index = clz(r3) - 24
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr
+ bx r12
+
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20: // 2 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+2:
+ vld2.8 {d4[], d5[]}, [r5, :16]! // d4 = mask of the first row, d5 = mask of the second row, each splatted
+ vld1.16 {d2}, [r2, :64]! // 4 tmp pixels (two rows)
+ vext.8 d4, d4, d5, #6 // low bytes: two copies of the first row mask, then the second row mask
+ subs r4, r4, #2
+ vneg.s8 d4, d4 // -m
+ vld1.32 {d0[]}, [r0, :32]
+ vld1.32 {d0[1]}, [r12, :32]
+ vmovl.s8 q2, d4
+ vshl.i16 d4, d4, #9 // -m << 9
+ vsub.i16 d2, d0, d2 // a - b
+ vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6
+ vadd.i16 d0, d0, d2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[1]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r5,pc}
+40: // 4 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+4:
+ vld2.8 {d4[], d5[]}, [r5, :16]! // d4 = mask of the first row, d5 = mask of the second row, each splatted
+ vld1.16 {q1}, [r2, :128]! // 8 tmp pixels (two rows)
+ vext.8 d4, d4, d5, #4 // four copies of the first row mask, then four of the second
+ subs r4, r4, #2
+ vneg.s8 d4, d4 // -m
+ vld1.16 {d0}, [r0, :64]
+ vld1.16 {d1}, [r12, :64]
+ vmovl.s8 q2, d4
+ vshl.i16 q2, q2, #9 // -m << 9
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80: // 8 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+8:
+ vld2.8 {d16[], d17[]}, [r5, :16]! // d16 = mask of the first row, d17 = mask of the second row, each splatted
+ vld1.16 {q2, q3}, [r2, :128]! // 16 tmp pixels (two rows)
+ vneg.s8 q9, q8 // -m
+ vld1.16 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vmovl.s8 q8, d18
+ vmovl.s8 q9, d19
+ vld1.16 {q1}, [r12, :128]
+ vshl.i16 q8, q8, #9 // -m << 9
+ vshl.i16 q9, q9, #9
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q9
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160: // 16 pixels per row, two rows per iteration
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+16:
+ vld2.8 {d24[], d25[]}, [r5, :16]! // d24 = mask of the first row, d25 = mask of the second row, each splatted
+ vld1.16 {q8, q9}, [r2, :128]! // tmp, first row
+ subs r4, r4, #2
+ vneg.s8 q13, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vmovl.s8 q12, d26
+ vld1.16 {q10, q11}, [r2, :128]! // tmp, second row
+ vmovl.s8 q13, d27
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q12
+ vqrdmulh.s16 q10, q10, q13
+ vqrdmulh.s16 q11, q11, q13
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vadd.i16 q3, q3, q11
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+1280: // widths 32 and up share one path: one row at a time, 32 pixels per inner iteration
+640:
+320:
+ sub r1, r1, r3, lsl #1 // stride minus the 2*r3 bytes the inner loop advances per row
+321:
+ vld1.8 {d24[]}, [r5]! // mask byte for this row, splatted
+ mov r12, r3 // inner loop counter, decremented by 32 per iteration
+ vneg.s8 d24, d24 // -m
+ vmovl.s8 q12, d24
+ vshl.i16 q12, q12, #9 // -m << 9
+32:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r12, r12, #32
+ vld1.16 {q10, q11}, [r2, :128]!
+ vld1.16 {q2, q3}, [r0, :128]
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ sub r0, r0, #32 // rewind dst to the start of the 32 pixels just loaded
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q12
+ vqrdmulh.s16 q10, q10, q12
+ vqrdmulh.s16 q11, q11, q12
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ subs r4, r4, #1
+ add r0, r0, r1 // advance dst to the next row
+ bgt 321b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_v_16bpc_neon, export=1 // Blends tmp [r2] into dst [r0, stride r1] in place using one obmc_masks byte per column; each path stores only part of the row it loads
+ push {r4,lr} // 8 bytes pushed, so the stack argument is at sp + 8
+ ldr r4, [sp, #8] // r4 = row counter
+ movrel lr, X(obmc_masks)
+ add lr, lr, r3 // lr = &obmc_masks[r3]
+ clz r12, r3 // r3 selects the width path
+ adr r3, L(blend_v_tbl)
+ sub r12, r12, #26 // table index = clz(r3) - 26
+ ldr r12, [r3, r12, lsl #2]
+ add r3, r3, r12
+ bx r3
+
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20: // 2 pixel wide input: only the first pixel of each row is blended and stored
+ add r12, r0, r1 // r12 = second destination row
+ lsl r1, r1, #1
+ vld1.8 {d4[]}, [lr] // single mask byte, splatted
+ vneg.s8 d4, d4 // -m
+ vmovl.s8 q2, d4
+ vshl.i16 d4, d4, #9 // -m << 9
+2:
+ vld1.32 {d2[]}, [r2, :32]! // tmp, first row (4 bytes)
+ vld1.16 {d0[]}, [r0, :16]
+ subs r4, r4, #2
+ vld1.16 {d2[1]}, [r2, :16] // first tmp pixel of the second row into lane 1
+ vld1.16 {d0[1]}, [r12, :16]
+ add r2, r2, #4 // step over the second tmp row
+ vsub.i16 d2, d0, d2 // a - b
+ vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6
+ vadd.i16 d0, d0, d2
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d0[1]}, [r12, :16], r1
+ bgt 2b
+ pop {r4,pc}
+40: // 4 pixel wide input: the first 3 pixels of each row are stored
+ vld1.32 {d4[]}, [lr, :32] // 4 mask bytes, repeated for the second row
+ add r12, r0, r1 // r12 = second destination row
+ vneg.s8 d4, d4 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q2, d4
+ sub r1, r1, #4 // the first store of each row already advances the pointer by 4 bytes
+ vshl.i16 q2, q2, #9 // -m << 9
+4:
+ vld1.16 {q1}, [r2, :128]! // 8 tmp pixels (two rows)
+ vld1.16 {d0}, [r0, :64]
+ vld1.16 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.32 {d0[0]}, [r0, :32]! // pixels 0-1
+ vst1.32 {d1[0]}, [r12, :32]!
+ vst1.16 {d0[2]}, [r0, :16], r1 // pixel 2
+ vst1.16 {d1[2]}, [r12, :16], r1
+ bgt 4b
+ pop {r4,pc}
+80: // 8 pixel wide input: the first 6 pixels of each row are stored
+ vld1.8 {d16}, [lr, :64] // 8 mask bytes, shared by both rows
+ add r12, r0, r1 // r12 = second destination row
+ vneg.s8 d16, d16 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q8, d16
+ sub r1, r1, #8 // the first store of each row already advances the pointer by 8 bytes
+ vshl.i16 q8, q8, #9 // -m << 9
+8:
+ vld1.16 {q2, q3}, [r2, :128]! // 16 tmp pixels (two rows)
+ vld1.16 {q0}, [r0, :128]
+ vld1.16 {q1}, [r12, :128]
+ subs r4, r4, #2
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q8
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {d0}, [r0, :64]! // pixels 0-3
+ vst1.16 {d2}, [r12, :64]!
+ vst1.32 {d1[0]}, [r0, :32], r1 // pixels 4-5
+ vst1.32 {d3[0]}, [r12, :32], r1
+ bgt 8b
+ pop {r4,pc}
+160: // 16 pixel wide input: the first 12 pixels of each row are loaded, blended and stored
+ vld1.8 {q12}, [lr, :128] // 16 mask bytes, shared by both rows
+ add r12, r0, r1 // r12 = second destination row
+ vneg.s8 q13, q12 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q12, d26
+ vmovl.s8 q13, d27
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 d26, d26, #9 // only the low half is needed for pixels 8-11
+16:
+ vld1.16 {q8, q9}, [r2, :128]! // tmp, first row
+ vld1.16 {d0, d1, d2}, [r0, :64]
+ subs r4, r4, #2
+ vld1.16 {q10, q11}, [r2, :128]! // tmp, second row
+ vsub.i16 q8, q0, q8 // a - b
+ vld1.16 {d4, d5, d6}, [r12, :64]
+ vsub.i16 d18, d2, d18
+ vsub.i16 q10, q2, q10
+ vsub.i16 d22, d6, d22
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 d18, d18, d26
+ vqrdmulh.s16 q10, q10, q12
+ vqrdmulh.s16 d22, d22, d26
+ vadd.i16 q0, q0, q8
+ vadd.i16 d2, d2, d18
+ vadd.i16 q2, q2, q10
+ vst1.16 {d0, d1, d2}, [r0, :64], r1
+ vadd.i16 d6, d6, d22
+ vst1.16 {d4, d5, d6}, [r12, :64], r1
+ bgt 16b
+ pop {r4,pc}
+320: // 32 pixel wide input: the first 24 pixels of each row are blended, one row per iteration
+ vld1.8 {d24, d25, d26}, [lr, :64] // 24 mask bytes
+ vneg.s8 q14, q12 // -m
+ vneg.s8 d30, d26
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ sub r1, r1, #32 // the first store of each row already advances the pointer by 32 bytes
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+32:
+ vld1.16 {q8, q9}, [r2, :128]! // tmp pixels 0-15
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r4, r4, #1
+ vld1.16 {q10}, [r2, :128] // tmp pixels 16-23
+ vsub.i16 q8, q0, q8 // a - b
+ vld1.16 {q2}, [r0, :128]
+ sub r0, r0, #32 // rewind dst to the start of the row
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128]!
+ add r2, r2, #32 // step over the second half of the tmp row
+ vst1.16 {q2}, [r0, :128], r1
+ bgt 32b
+ pop {r4,pc}
+endfunc
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that r9 is set to (clz(w)-24).
+function put_neon // Copies r5 rows of pixels from [r2, stride r3] to [r0, stride r1]; pops {r4-r11,pc} without a matching push here, so the caller presumably pushed {r4-r11,lr} before branching in
+ adr r10, L(put_tbl)
+ ldr r9, [r10, r9, lsl #2] // handler offset relative to the table, indexed by r9
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 320f - L(put_tbl) + CONFIG_THUMB
+ .word 16f - L(put_tbl) + CONFIG_THUMB
+ .word 80f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2: // 2 pixels (4 bytes) per row, two rows per iteration
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1 // both lanes of d1 hold the same data after the splat load
+ bgt 2b
+ pop {r4-r11,pc}
+4: // 4 pixels (8 bytes) per row, two rows per iteration
+ vld1.16 {d0}, [r2], r3
+ vld1.16 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bgt 4b
+ pop {r4-r11,pc}
+80: // 8 pixels (16 bytes) per row, two rows per iteration through separate pointers
+ add r8, r0, r1 // r8 = second destination row
+ lsl r1, r1, #1
+ add r9, r2, r3 // r9 = second source row
+ lsl r3, r3, #1
+8:
+ vld1.16 {q0}, [r2], r3
+ vld1.16 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r8, :128], r1
+ bgt 8b
+ pop {r4-r11,pc}
+16: // 16 pixels (32 bytes) per row, one row per iteration
+ vld1.16 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+320: // 32 pixels (64 bytes) per row
+ sub r1, r1, #32 // strides minus the 32 bytes already advanced within the row
+ sub r3, r3, #32
+32:
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640: // 64 pixels (128 bytes) per row
+ sub r1, r1, #96 // strides minus the 96 bytes already advanced within the row
+ sub r3, r3, #96
+64:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280: // 128 pixels (256 bytes) per row
+ sub r1, r1, #224 // strides minus the 224 bytes already advanced within the row
+ sub r3, r3, #224
+128:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2]!
+ vst1.16 {q14, q15}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
+// r8 to w*2.
+function prep_neon // Converts r4 rows of pixels from [r1, stride r2] into (pixel << intermediate_bits) - PREP_BIAS, written contiguously to [r0]; pops {r4-r11,pc} without a matching push here, so the caller presumably pushed {r4-r11,lr}
+ adr r10, L(prep_tbl)
+ ldr r9, [r10, r9, lsl #2] // handler offset relative to the table, indexed by r9
+ vdup.16 q15, r7 // intermediate_bits
+ vmov.i16 q14, #PREP_BIAS
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 16f - L(prep_tbl) + CONFIG_THUMB
+ .word 80f - L(prep_tbl) + CONFIG_THUMB
+ .word 40f - L(prep_tbl) + CONFIG_THUMB
+
+40: // 4 pixels per row, two rows per iteration through separate source pointers
+ add r9, r1, r2 // r9 = second source row
+ lsl r2, r2, #1
+4:
+ vld1.16 {d0}, [r1], r2
+ vld1.16 {d1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15 // pixel << intermediate_bits
+ vsub.i16 q0, q0, q14 // - PREP_BIAS
+ vst1.16 {q0}, [r0, :128]!
+ bgt 4b
+ pop {r4-r11,pc}
+80: // 8 pixels per row, two rows per iteration through separate source pointers
+ add r9, r1, r2 // r9 = second source row
+ lsl r2, r2, #1
+8:
+ vld1.16 {q0}, [r1], r2
+ vld1.16 {q1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vshl.s16 q1, q1, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+16: // 16 pixels per row, two rows per iteration
+ vld1.16 {q0, q1}, [r1], r2
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ subs r4, r4, #2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 16b
+ pop {r4-r11,pc}
+320: // 32 pixels per row, one row per iteration
+ sub r2, r2, #32 // source stride minus the 32 bytes already advanced within the row
+32:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ pop {r4-r11,pc}
+640: // 64 pixels per row, one row per iteration
+ sub r2, r2, #96 // source stride minus the 96 bytes already advanced within the row
+64:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 64b
+ pop {r4-r11,pc}
+1280: // 128 pixels per row, one row per iteration, handled as two interleaved 64 pixel halves
+ sub r2, r2, #224 // source stride minus the 224 bytes already advanced within the row
+128:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1]!
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q0, q1}, [r1]! // second half of the row: each register pair is reloaded right after being stored
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q2, q3}, [r1]!
+ vsub.i16 q11, q11, q14
+ vshl.s16 q0, q0, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // Load 2..7 single wd-bit elements, alternating between pointers s0 and s1; each element is replicated to all lanes of its destination
+ vld1.\wd {\d0[]}, [\s0], \strd // d0 <- element at s0, then s0 += strd
+ vld1.\wd {\d1[]}, [\s1], \strd // d1 <- element at s1, then s1 += strd
+.ifnb \d2 // d2 and d3 are optional but only as a pair (d3 is not checked separately)
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4 // d4, d5 and d6 are each optional on their own
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // Load 2..7 whole registers, alternating between pointers s0 and s1 (even-numbered destinations from s0, odd from s1)
+ vld1.16 {\d0}, [\s0], \strd // d0 <- [s0], then s0 += strd
+ vld1.16 {\d1}, [\s1], \strd // d1 <- [s1], then s1 += strd
+.ifnb \d2 // d2 and d3 are optional but only as a pair (d3 is not checked separately)
+ vld1.16 {\d2}, [\s0], \strd
+ vld1.16 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4 // d4, d5 and d6 are each optional on their own
+ vld1.16 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.16 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.16 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5 // Load 1..3 register pairs: first pair from s0, second from s1, third from s0 again
+ vld1.16 {\d0, \d1}, [\s0], \strd // d0:d1 <- [s0], then s0 += strd
+.ifnb \d2 // second pair is optional; d3 is assumed to be given whenever d2 is
+ vld1.16 {\d2, \d3}, [\s1], \strd
+.endif
+.ifnb \d4 // third pair is optional; d5 is assumed to be given whenever d4 is
+ vld1.16 {\d4, \d5}, [\s0], \strd
+.endif
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice specialised to 32-bit elements
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 // all arguments are forwarded unchanged; blank ones stay blank
+.endm
+.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5 // Alias for load_regpair; the callers visible in this file pass q register pairs, i.e. 16 16-bit values per load
+ load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5 // all arguments are forwarded unchanged; blank ones stay blank
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4 // Make each register hold its own bytes 4 and up followed by the first 4 bytes of the next register in the list
+ vext.8 \r0, \r0, \r1, #4 // r0 <- r0 bytes from offset 4, then the low 4 bytes of r1
+ vext.8 \r1, \r1, \r2, #4 // r1 is combined with r2 in the same way; r2 itself is only read here
+.ifnb \r3 // r3 and r4 are optional as a pair; r4 is only read, never written
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmin_u16 c, r0, r1, r2, r3 // Clamp 1..4 registers in place to at most c, per unsigned 16-bit lane
+ vmin.u16 \r0, \r0, \c // r0 <- min(r0, c)
+.ifnb \r1 // r1 is optional on its own
+ vmin.u16 \r1, \r1, \c
+.endif
+.ifnb \r2 // r2 and r3 are optional as a pair (r3 is not checked separately)
+ vmin.u16 \r2, \r2, \c
+ vmin.u16 \r3, \r3, \c
+.endif
+.endm
+.macro vsub_i16 c, r0, r1, r2, r3 // Subtract c in place from 1..4 registers, per 16-bit lane
+ vsub.i16 \r0, \r0, \c // r0 <- r0 - c
+.ifnb \r1 // r1 is optional on its own
+ vsub.i16 \r1, \r1, \c
+.endif
+.ifnb \r2 // r2 and r3 are optional as a pair (r3 is not checked separately)
+ vsub.i16 \r2, \r2, \c
+ vsub.i16 \r3, \r3, \c
+.endif
+.endm
+.macro vmull_vmlal_4 d, s0, s1, s2, s3 // 4-tap dot product: d = s0*c0 + s1*c1 + s2*c2 + s3*c3, signed 16-bit inputs widened to 32-bit sums
+ vmull.s16 \d, \s0, d0[0] // the coefficients always come from the fixed register d0 (lanes 0..3); it is not a macro argument
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+.endm
+.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 // 8-tap dot product: d = sum of s0..s7 times coefficients 0..7, signed 16-bit inputs widened to 32-bit sums
+ vmull.s16 \d, \s0, d0[0] // coefficients 0..3 come from the fixed register d0 (not a macro argument)
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+ vmlal.s16 \d, \s4, d1[0] // coefficients 4..7 come from the fixed register d1
+ vmlal.s16 \d, \s5, d1[1]
+ vmlal.s16 \d, \s6, d1[2]
+ vmlal.s16 \d, \s7, d1[3]
+.endm
+.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3 // Narrow 1..4 vectors of signed 32-bit sums to unsigned 16 bit: rounding right shift by the immediate shift, with unsigned saturation
+ vqrshrun.s32 \d0, \q0, #\shift // d0 <- saturate_u16(round(q0 >> shift))
+.ifnb \q1 // the q1/d1 pair is optional on its own
+ vqrshrun.s32 \d1, \q1, #\shift
+.endif
+.ifnb \q2 // q2/d2 and q3/d3 are optional together (only q2 is checked)
+ vqrshrun.s32 \d2, \q2, #\shift
+ vqrshrun.s32 \d3, \q3, #\shift
+.endif
+.endm
+.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3 // Narrow 1..4 vectors from 32-bit to 16-bit lanes, keeping the low half of each lane (no saturation)
+ vmovn.i32 \d0, \q0 // d0 <- low 16 bits of every 32-bit lane of q0
+.ifnb \q1 // the q1/d1 pair is optional on its own
+ vmovn.i32 \d1, \q1
+.endif
+.ifnb \q2 // q2/d2 and q3/d3 are optional together (only q2 is checked)
+ vmovn.i32 \d2, \q2
+ vmovn.i32 \d3, \q3
+.endif
+.endm
+.macro vrshl_s32 shift, r0, r1, r2, r3 // Rounding shift of 2 or 4 vectors of signed 32-bit lanes, in place, by the per-lane counts in register shift (finalize passes negative counts, i.e. a right shift)
+ vrshl.s32 \r0, \r0, \shift // r0 and r1 are both mandatory
+ vrshl.s32 \r1, \r1, \shift
+.ifnb \r2 // r2 and r3 are optional as a pair (r3 is not checked separately)
+ vrshl.s32 \r2, \r2, \shift
+ vrshl.s32 \r3, \r3, \shift
+.endif
+.endm
+.macro vst1_32 strd, r0, r1 // Store the two 32-bit lanes of one or two d registers to two output pointers; NOTE: the pointers are the literal core registers r0 and r9, not macro arguments (presumably dst and ds2 of the caller - confirm at the filter_fn invocation)
+ vst1.32 {\r0[0]}, [r0, :32], \strd // lane 0 of the first vector -> [r0] (32-bit aligned), then r0 += strd
+ vst1.32 {\r0[1]}, [r9, :32], \strd // lane 1 of the first vector -> [r9] (32-bit aligned), then r9 += strd
+.ifnb \r1 // the second vector is optional
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r9, :32], \strd
+.endif
+.endm
+.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 // Store 2, 4 or 8 whole registers, alternating between two output pointers; NOTE: the pointers are the literal core registers r0 and r9, not macro arguments (presumably dst and ds2 of the caller - confirm at the filter_fn invocation)
+ vst1.16 {\r0}, [r0, \align], \strd // first vector -> [r0] with the given alignment hint, then r0 += strd
+ vst1.16 {\r1}, [r9, \align], \strd // second vector -> [r9], then r9 += strd
+.ifnb \r2 // third and fourth vectors are optional as a pair
+ vst1.16 {\r2}, [r0, \align], \strd
+ vst1.16 {\r3}, [r9, \align], \strd
+.endif
+.ifnb \r4 // fifth to eighth vectors are optional as a group of four (only the fifth is checked)
+ vst1.16 {\r4}, [r0, \align], \strd
+ vst1.16 {\r5}, [r9, \align], \strd
+ vst1.16 {\r6}, [r0, \align], \strd
+ vst1.16 {\r7}, [r9, \align], \strd
+.endif
+.endm
+.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3 // Convert 2 or 4 vectors of 32-bit filter sums to 16-bit results: type put gives rounded, clamped values; any other type gives shifted, bias-adjusted values. Uses the fixed registers q14/q15, which the caller must have set up
+.ifc \type, put
+ vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 // rounding >> 6 with unsigned saturation, narrowed into d0..d3
+ vmin_u16 q15, \q0, \q1 // clamp to q15 (loaded with bitdepth_max by the 8tap code in this file); NOTE(review): this works on q0/q1, so callers appear to pass d0..d3 as halves of q0/q1
+.else
+ vrshl_s32 q14, \q0, \q1, \q2, \q3 // rounding shift by q14 = -(6-intermediate_bits), i.e. a right shift
+ vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 // keep the low 16 bits of every sum, written to d0..d3
+ vsub_i16 q15, \q0, \q1 // subtract q15 = PREP_BIAS from the packed 16-bit results in q0/q1
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 // Finalize 2 or 4 rows of four 32-bit sums and store each resulting d register as one output row
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 // 32-bit sums -> 16-bit results in d0..d3 (d2/d3 only when q2/q3 are given)
+ vst1_reg \strd, :64, \d0, \d1, \d2, \d3 // 64-bit aligned stores, rows alternating between the pointers r0 and r9
+.endm
+.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 // Finalize four vectors of 32-bit sums and store the two resulting q registers as two output rows
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 // 32-bit sums -> 16-bit results in d0..d3; callers in this file pass d0..d3 as the halves of q0 and q1
+ vst1_reg \strd, :128, \q0, \q1 // q0 -> [r0] and q1 -> [r9], 128-bit aligned, each pointer then advanced by strd
+.endm
+.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 // Finalize four vectors of 32-bit sums and store the resulting q0:q1 pair as a single output row
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 // 32-bit sums -> 16-bit results in d0..d3; the caller in this file passes d0..d3 as the halves of q0 and q1
+ vst1.16 {\q0, \q1}, [r0, :128], \strd // both registers go to the literal pointer r0 (not a macro argument), which is then advanced by strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v // Emit the exported entry point <op>_8tap_<type>_16bpc_neon, a thin stub that selects the filter types and jumps to the shared <op>_8tap_neon body
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr} // saved here, restored by the pop {r4-r11,pc} at the exits of the shared body
+ movw r9, \type_h // horizontal filter type constant; the shared body adds r9 to mx
+ movw r10, \type_v // vertical filter type constant; the shared body adds r10 to my
+ b \op\()_8tap_neon // plain branch, not a call: the shared body returns directly to the original caller
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15) // bits 7 and up: offset added to the 8-tap filter index; bits 0-6: offset added to the 4-tap filter index
+#define SMOOTH ((1*15<<7)|4*15) // same layout as REGULAR: 1*15 in the 8-tap field, 4*15 in the 4-tap field
+#define SHARP ((2*15<<7)|3*15) // 2*15 in the 8-tap field; the 4-tap field is 3*15, the same value as REGULAR
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r11
+ mul \my, \my, r11
+ add \mx, \mx, r9 // mx, 8tap_h, 4tap_h
+ add \my, \my, r10 // my, 8tap_v, 4tap_v
+
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ vdup.16 q15, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ tst \mx, #(0x7f << 14)
+ sub r9, r9, #24
+ add lr, \bdmax, #6 // 6 + intermediate_bits
+ rsb r12, \bdmax, #6 // 6 - intermediate_bits
+ movrel r11, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r10
+ tst \my, #(0x7f << 14)
+ add \mx, r11, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r10, L(\type\()_8tap_h_tbl)
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vdup.16 q13, \bdmax // intermediate_bits
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q13, q13 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.16 {q2}, [\src], \s_strd
+ vld1.16 {q3}, [\sr2], \s_strd
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmull.s16 q1, d4, d0[0]
+ vmlal.s16 q1, d5, d0[1]
+ vmlal.s16 q1, d6, d0[2]
+ vmlal.s16 q1, d7, d0[3]
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vqmovun.s32 d2, q1
+ vrshl.s16 d2, d2, d26 // -intermediate_bits
+ vmin.u16 d2, d2, d30
+ vst1.32 {d2[0]}, [\dst, :32], \d_strd
+ vst1.32 {d2[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q11}, [\sr2], \s_strd
+ vext.8 d18, d16, d17, #2
+ vext.8 d19, d16, d17, #4
+ vext.8 d20, d16, d17, #6
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d21, d22, d23, #6
+ subs \h, \h, #2
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmlal.s16 q2, d20, d0[3]
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d4, q2
+ vmovn.s32 d5, q3
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ vpush {q4-q5}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+81:
+ vld1.16 {q8, q9}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ mov \mx, \w
+
+8:
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q3, d20, d0[0]
+ vmull.s16 q4, d21, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q8, q9, #(2*\i)
+ vext.8 q5, q10, q11, #(2*\i)
+.if \i < 4
+ vmlal.s16 q1, d24, d0[\i]
+ vmlal.s16 q2, d25, d0[\i]
+ vmlal.s16 q3, d10, d0[\i]
+ vmlal.s16 q4, d11, d0[\i]
+.else
+ vmlal.s16 q1, d24, d1[\i-4]
+ vmlal.s16 q2, d25, d1[\i-4]
+ vmlal.s16 q3, d10, d1[\i-4]
+ vmlal.s16 q4, d11, d1[\i-4]
+.endif
+.endr
+ subs \mx, \mx, #8
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q4, q4, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d2, q1
+ vqmovun.s32 d3, q2
+ vqmovun.s32 d4, q3
+ vqmovun.s32 d5, q4
+ vrshl.s16 q1, q1, q13 // -intermediate_bits
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q1, q1, q15
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d2, q1
+ vmovn.s32 d3, q2
+ vmovn.s32 d4, q3
+ vmovn.s32 d5, q4
+ vsub.i16 q1, q1, q13 // PREP_BIAS
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {q1}, [\dst, :128]!
+ vst1.16 {q2}, [\ds2, :128]!
+ ble 9f
+
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [\src]!
+ vld1.16 {q11}, [\sr2]!
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 81b
+ vpop {q4-q5}
+ pop {r4-r11,pc}
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+ add \my, r11, \my, lsl #3
+
+.ifc \type, prep
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ vmov.i16 q15, #PREP_BIAS
+.endif
+ adr r10, L(\type\()_8tap_v_tbl)
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ bgt 24f
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vqrshrun_s32 6, q8, d16
+ vmin_u16 d30, d16
+ vst1_32 \d_strd, d16
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d3, d4, d5, d6
+ vqrshrun_s32 6, q8, d16, q9, d17
+ vmin_u16 q15, q8
+ vst1_32 \d_strd, d16, d17
+ pop {r4-r11,pc}
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16
+ interleave_1_32 d2, d3, d4, d5, d6
+ interleave_1_32 d6, d7, d16
+216:
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d17, d18, d19, d20
+ interleave_1_32 d16, d17, d18, d19, d20
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19
+ vqrshrun_s32 6, q13, d26, q1, d27
+ vmin_u16 q15, q13
+ vst1_32 \d_strd, d26, d27
+ ble 0f
+ cmp \h, #2
+ vmov q1, q3
+ vmov q2, q8
+ vmov q3, q9
+ vmov d16, d20
+ beq 26f
+ b 216b
+26:
+ load_32 \sr2, \src, \s_strd, d17, d18
+ interleave_1_32 d16, d17, d18
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vqrshrun_s32 6, q13, d26
+ vmin_u16 d30, d26
+ vst1_32 \d_strd, d26
+0:
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d2, d3, d4, d5
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmull_vmlal_4 q8, d3, d4, d5, d6
+ vmull_vmlal_4 q9, d4, d5, d6, d7
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+0:
+ pop {r4-r11,pc}
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22
+
+48:
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d23, d24, d25, d26
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25
+ vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
+ ble 0f
+ cmp \h, #2
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov d22, d26
+ beq 46f
+ b 48b
+46:
+ load_reg \sr2, \src, \s_strd, d23, d24
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3
+0:
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9
+ vmull_vmlal_4 q10, d2, d4, d6, d16
+ vmull_vmlal_4 q11, d3, d5, d7, d17
+ vmull_vmlal_4 q12, d4, d6, d16, d18
+ vmull_vmlal_4 q13, d5, d7, d17, d19
+ shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmull_vmlal_4 q1, d6, d16, d18, d20
+ vmull_vmlal_4 q2, d7, d17, d19, d21
+ vmull_vmlal_4 q12, d16, d18, d20, d22
+ vmull_vmlal_4 q13, d17, d19, d21, d23
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q12, q13
+ vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24
+ vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25
+ vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26
+ vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q1, q2
+ vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2
+ vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3
+ vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4
+ vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5
+ shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9
+ ble 9f
+ vmov q5, q9
+ vmov q6, q10
+ vmov q7, q11
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q1
+ vmov q11, q2
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ vpush {q6-q7}
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+
+ load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11
+16:
+ load_16s16 \src, \src, \s_strd, q12, q13
+ subs \h, \h, #1
+ vmull_vmlal_4 q1, d12, d16, d20, d24
+ vmull_vmlal_4 q2, d13, d17, d21, d25
+ vmull_vmlal_4 q3, d14, d18, d22, d26
+ vmull_vmlal_4 q6, d15, d19, d23, d27
+ shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5
+ ble 0f
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov q11, q13
+ b 16b
+0:
+ vpop {q6-q7}
+ pop {r4-r11,pc}
+
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+4:
+ add \my, r11, \my, lsl #3
+
+ adr r10, L(\type\()_8tap_hv_tbl)
+ neg r12, r12 // -(6-intermediate_bits)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.ifc \type, put
+ neg r8, lr // -(6+intermediate_bits)
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vdup.32 q13, r8 // -(6+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d24, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vmin.u16 d4, d4, d30
+ subs \h, \h, #2
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d24
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d24, #4
+ vmov d19, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d24, #4
+ vmov d21, d24
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d24, #4
+ vmull.s16 q3, d16, d2[0]
+ vmlal.s16 q3, d17, d2[1]
+ vmlal.s16 q3, d18, d2[2]
+ vmlal.s16 q3, d19, d2[3]
+ vmlal.s16 q3, d20, d3[0]
+ vmlal.s16 q3, d21, d3[1]
+ vmlal.s16 q3, d22, d3[2]
+ vmlal.s16 q3, d24, d3[3]
+
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d6, q3
+ vmin.u16 d6, d6, d30
+ subs \h, \h, #2
+ vst1.32 {d6[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov q8, q9
+ vmov q9, q10
+ vmov d20, d22
+ vmov d21, d24
+ b 28b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_2):
+ vld1.16 {q11}, [\sr2], \s_strd
+ vld1.16 {q12}, [\src], \s_strd
+ vext.8 d23, d22, d23, #2
+ vext.8 d25, d24, d25, #2
+ vtrn.32 q11, q12
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d23, d0[1]
+ vmlal.s16 q3, d24, d0[2]
+ vmlal.s16 q3, d25, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d17, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d17, d2[0]
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q2, d19, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+ vmull.s16 q3, d18, d2[0]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q3, d24, d2[2]
+ vmlal.s16 q3, d25, d2[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d17, d19
+ vmov q9, q12
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+480: // 4x8, 4x16, 4x32 hv
+ vpush {d13-d15}
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d13, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q7, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d13, d2[0]
+ vmlal.s16 q2, d14, d2[1]
+ vmlal.s16 q2, d15, d2[2]
+ vmlal.s16 q2, d16, d2[3]
+ vmlal.s16 q2, d17, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q2, d19, d3[2]
+ vmlal.s16 q2, d24, d3[3]
+ vmull.s16 q3, d14, d2[0]
+ vmlal.s16 q3, d15, d2[1]
+ vmlal.s16 q3, d16, d2[2]
+ vmlal.s16 q3, d17, d2[3]
+ vmlal.s16 q3, d18, d3[0]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q3, d24, d3[2]
+ vmlal.s16 q3, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d13, d15
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q12
+ b 48b
+0:
+ vpop {d13-d15}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_4):
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d20, d21, #2
+ vext.8 d25, d20, d21, #4
+ vext.8 d21, d20, d21, #6
+ vmull.s16 q3, d20, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q10
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d16, q2
+ vmovn.i32 d17, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d16, d2[0]
+ vmull.s16 q3, d17, d2[0]
+ vmull.s16 q13, d18, d2[0]
+ vmull.s16 q14, d19, d2[0]
+.ifc \type, put
+ vdup.32 q8, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q13, d20, d2[1]
+ vmlal.s16 q14, d21, d2[1]
+ vmlal.s16 q2, d20, d2[2]
+ vmlal.s16 q3, d21, d2[2]
+ vmlal.s16 q13, d22, d2[2]
+ vmlal.s16 q14, d23, d2[2]
+ vmlal.s16 q2, d22, d2[3]
+ vmlal.s16 q3, d23, d2[3]
+ vmlal.s16 q13, d24, d2[3]
+ vmlal.s16 q14, d25, d2[3]
+.ifc \type, put
+ vdup.16 q9, \bdmax // bitdepth_max
+ vrshl.s32 q2, q2, q8 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q8 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q8 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q8 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q9, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q9 // PREP_BIAS
+ vsub.i16 q3, q3, q9 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+0:
+ pop {r4-r11,pc}
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d8, q2
+ vmovn.i32 d9, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q5, q11
+ vmov q6, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q7, q11
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d8, d2[0]
+ vmull.s16 q3, d9, d2[0]
+ vmull.s16 q13, d10, d2[0]
+ vmull.s16 q14, d11, d2[0]
+.ifc \type, put
+ vdup.32 q4, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d10, d2[1]
+ vmlal.s16 q3, d11, d2[1]
+ vmlal.s16 q13, d12, d2[1]
+ vmlal.s16 q14, d13, d2[1]
+ vmlal.s16 q2, d12, d2[2]
+ vmlal.s16 q3, d13, d2[2]
+ vmlal.s16 q13, d14, d2[2]
+ vmlal.s16 q14, d15, d2[2]
+ vmlal.s16 q2, d14, d2[3]
+ vmlal.s16 q3, d15, d2[3]
+ vmlal.s16 q13, d16, d2[3]
+ vmlal.s16 q14, d17, d2[3]
+ vmlal.s16 q2, d16, d3[0]
+ vmlal.s16 q3, d17, d3[0]
+ vmlal.s16 q13, d18, d3[0]
+ vmlal.s16 q14, d19, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q13, d20, d3[1]
+ vmlal.s16 q14, d21, d3[1]
+ vmlal.s16 q2, d20, d3[2]
+ vmlal.s16 q3, d21, d3[2]
+ vmlal.s16 q13, d22, d3[2]
+ vmlal.s16 q14, d23, d3[2]
+ vmlal.s16 q2, d22, d3[3]
+ vmlal.s16 q3, d23, d3[3]
+ vmlal.s16 q13, d24, d3[3]
+ vmlal.s16 q14, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q4 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q4 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q4 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q4 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q5, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q5 // PREP_BIAS
+ vsub.i16 q3, q3, q5 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_8):
+ vld1.16 {q13, q14}, [\sr2], \s_strd
+ vmull.s16 q2, d26, d0[0]
+ vmull.s16 q3, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d24, d0[\i]
+ vmlal.s16 q3, d25, d0[\i]
+.else
+ vmlal.s16 q2, d24, d1[\i - 4]
+ vmlal.s16 q3, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q12, r12 // -(6-intermediate_bits)
+ vld1.16 {q13, q14}, [\src], \s_strd
+ vrshl.s32 q2, q2, q12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q12 // -(6-intermediate_bits)
+ vmovn.i32 d4, q2
+ vmovn.i32 d5, q3
+
+ vmull.s16 q3, d26, d0[0]
+ vmull.s16 q11, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q3, d24, d0[\i]
+ vmlal.s16 q11, d25, d0[\i]
+.else
+ vmlal.s16 q3, d24, d1[\i - 4]
+ vmlal.s16 q11, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q13, r12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6-intermediate_bits)
+ vrshl.s32 q11, q11, q13 // -(6-intermediate_bits)
+
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q11
+ vmov q11, q2
+ bx lr
+endfunc
+
+function \type\()_bilin_16bpc_neon, export=1 // 16 bpc bilinear filter; selects the h, v or hv path from mx/my
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36] // stack arguments; 36 = 9 pushed registers * 4 bytes
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ vdup.16 q1, \mx // q1 = mx, weight of the pixel to the right
+ vdup.16 q3, \my // q3 = my, weight of the row below
+ rsb r9, \mx, #16
+ rsb r10, \my, #16
+ vdup.16 q0, r9 // q0 = 16 - mx
+ vdup.16 q2, r10 // q2 = 16 - my
+.ifc \type, prep
+ lsl \d_strd, \w, #1 // prep: output stride in bytes = 2 * w
+.endif
+ clz \bdmax, \bdmax // bitdepth_max
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ cmp \mx, #0
+ sub r9, r9, #24 // jump table index = clz(w) - 24 (0 for w == 128, 6 for w == 2)
+ rsb r11, \bdmax, #4 // 4 - intermediate_bits
+ add r12, \bdmax, #4 // 4 + intermediate_bits
+ bne L(\type\()_bilin_h)
+ cmp \my, #0
+ bne L(\type\()_bilin_v)
+ b \type\()_neon // mx == 0 and my == 0: tail-branch to \type\()_neon
+
+L(\type\()_bilin_h):
+ cmp \my, #0
+ bne L(\type\()_bilin_hv)
+
+ adr r10, L(\type\()_bilin_h_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2] // table entry = offset of the width-specific code from the table
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.16 q14, \bdmax // intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q14, q14 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd // second row pointers; two rows are handled per iteration
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.16 {d16}, [\src], \s_strd
+ vld1.16 {d18}, [\sr2], \s_strd
+ vext.8 d17, d16, d16, #2 // same rows shifted left by one pixel
+ vext.8 d19, d18, d18, #2
+ vtrn.32 d16, d18 // pack the two pixels of both rows into d16 (and d17 for the shifted copy)
+ vtrn.32 d17, d19
+ subs \h, \h, #2
+ vmul.i16 d16, d16, d0
+ vmla.i16 d16, d17, d2
+ vrshl.u16 d16, d16, d30 // rounding shift right by 4 - intermediate_bits
+ vrshl.u16 d16, d16, d28 // rounding shift right by intermediate_bits
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q10}, [\sr2], \s_strd
+ vext.8 q9, q8, q8, #2
+ vext.8 q11, q10, q10, #2
+ vmov d17, d20 // q8 = row 0 pixels (low half), row 1 pixels (high half)
+ vmov d19, d22 // q9 = the same two rows shifted by one pixel
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vrshl.u16 q8, q8, q15 // rounding shift right by 4 - intermediate_bits
+.ifc \type, put
+ vrshl.u16 q8, q8, q14 // rounding shift right by intermediate_bits
+.else
+ vsub.i16 q8, q8, q14 // PREP_BIAS
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.16 {d16, d17, d18}, [\src], \s_strd // 12 pixels loaded, 9 of them are used
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vext.8 q9, q8, q9, #2
+ vext.8 q11, q10, q11, #2
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q11, q1
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q10, q10, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q8, q8, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ vpush {q4-q7} // q4-q7 are used below, save them first
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, lsl #1 // the inner loop post-increments by 2 * w bytes per row ...
+ sub \s_strd, \s_strd, #16 // ... plus the 16 bytes of the initial load
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+161:
+ vld1.16 {q4}, [\src]!
+ vld1.16 {q9}, [\sr2]!
+ mov \mx, \w // \mx is reused as the horizontal pixel counter
+
+16:
+ vld1.16 {q5, q6}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ vext.8 q7, q4, q5, #2
+ vext.8 q8, q5, q6, #2
+ vext.8 q12, q9, q10, #2
+ vext.8 q13, q10, q11, #2
+ vmul.i16 q4, q4, q0
+ vmla.i16 q4, q7, q1
+ vmul.i16 q5, q5, q0
+ vmla.i16 q5, q8, q1
+ vmul.i16 q9, q9, q0
+ vmla.i16 q9, q12, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q4, q4, q15
+ vrshl.u16 q5, q5, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ subs \mx, \mx, #16 // 16 pixels per row done
+.ifc \type, put
+ vrshl.u16 q4, q4, q14
+ vrshl.u16 q5, q5, q14
+ vrshl.u16 q9, q9, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q4, q4, q14
+ vsub.i16 q5, q5, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q4, q5}, [\dst, :128]!
+ vst1.16 {q9, q10}, [\ds2, :128]!
+ ble 9f
+
+ vmov q4, q6 // carry the last loaded 8 pixels over as the start of the next 16
+ vmov q9, q11
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr r10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ vdup.16 q15, r11 // 4 - intermediate_bits
+.endif
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vmov.i16 q14, #PREP_BIAS
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.32 {d16[]}, [\src], \s_strd
+ bgt 24f // h > 2: use the four-rows-per-iteration loop
+22:
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4 // d16 = rows 0,1
+ vext.8 d17, d17, d18, #4 // d17 = rows 1,2
+ vmul.i16 d16, d16, d4
+ vmla.i16 d16, d17, d6
+ vrshr.u16 d16, d16, #4
+ vst1.32 {d16[0]}, [\dst, :32]
+ vst1.32 {d16[1]}, [\ds2, :32]
+ pop {r4-r11,pc}
+24: // 2x4, 2x6, 2x8, ... v
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vld1.32 {d19[]}, [\sr2], \s_strd
+ vld1.32 {d20[]}, [\src], \s_strd
+ subs \h, \h, #4
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vext.8 d18, d18, d19, #4
+ vext.8 d19, d19, d20, #4
+ vswp d17, d18 // q8 = rows 0,1,2,3 and q9 = rows 1,2,3,4
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ cmp \h, #2
+ vrshr.u16 q8, q8, #4
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ vst1.32 {d17[0]}, [\dst, :32], \d_strd
+ vst1.32 {d17[1]}, [\ds2, :32], \d_strd
+ blt 0f // fewer than 2 rows left: done
+ vmov d16, d20 // last loaded row becomes the first row of the next pass
+ beq 22b // exactly 2 rows left: finish with the 2x2 code
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {d16}, [\src], \s_strd
+4:
+ vld1.16 {d17}, [\sr2], \s_strd
+ vld1.16 {d19}, [\src], \s_strd
+ vmov d18, d17 // q8 = rows 0,1 and q9 = rows 1,2
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+.else
+ vrshl.u16 q8, q8, q15 // rounding shift right by 4 - intermediate_bits
+ vsub.i16 q8, q8, q14 // PREP_BIAS
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {q8}, [\src], \s_strd
+8:
+ vld1.16 {q9}, [\sr2], \s_strd
+ vld1.16 {q10}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q10, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+ ble 0f
+ vmov q8, q10
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h // \my is reused to remember the full height for each 16 pixel column
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q8, q9}, [\src], \s_strd
+2:
+ vld1.16 {q10, q11}, [\sr2], \s_strd
+ vld1.16 {q12, q13}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q10, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q11, q3
+ vmul.i16 q10, q10, q2
+ vmla.i16 q10, q12, q3
+ vmul.i16 q11, q11, q2
+ vmla.i16 q11, q13, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+ vrshr.u16 q10, q10, #4
+ vrshr.u16 q11, q11, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ vrshl.u16 q11, q11, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vsub.i16 q11, q11, q14
+.endif
+ vst1.16 {q8, q9}, [\dst, :128], \d_strd
+ vst1.16 {q10, q11}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q12
+ vmov q9, q13
+ b 2b
+9:
+ subs \w, \w, #16 // one 16 pixel wide column finished
+ ble 0f
+ asr \s_strd, \s_strd, #1 // back to single-row strides
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src // rewind src by h rows
+ mls \dst, \d_strd, \my, \dst // rewind dst by h rows
+ sub \src, \src, \s_strd, lsl #1 // src advanced 2 rows further than dst (h + 2 rows in total)
+ mov \h, \my
+ add \src, \src, #32 // next column: 16 pixels = 32 bytes
+ add \dst, \dst, #32
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+ adr r10, L(\type\()_bilin_hv_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.32 q14, r12 // 4 + intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s32 q14, q14 // -(4+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vmul.i16 d16, d20, d0 // horizontal pass on the first row
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+ vext.8 d16, d16, d16, #4 // swap halves so the filtered row sits in the upper half
+
+2:
+ vld1.16 {d20}, [\sr2], \s_strd
+ vld1.16 {d22}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vext.8 d23, d22, d22, #2
+ vtrn.32 d20, d22
+ vtrn.32 d21, d23
+ vmul.i16 d18, d20, d0 // horizontal pass on the next two rows
+ vmla.i16 d18, d21, d2
+ vrshl.u16 d18, d18, d30
+
+ vext.8 d16, d16, d18, #4 // d16 = previous row followed by the first new row
+
+ vmull.u16 q8, d16, d4 // vertical pass in 32 bit
+ vmlal.u16 q8, d18, d6
+ vrshl.u32 q8, q8, q14 // rounding shift right by 4 + intermediate_bits
+ vmovn.i32 d16, q8
+ subs \h, \h, #2
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18 // upper half of d18 is the last filtered row
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q10}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vmul.i16 d16, d20, d0 // horizontal pass on the first row
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+
+4:
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vext.8 d23, d22, d23, #2
+ vswp d21, d22 // q10 = both rows, q11 = both rows shifted by one pixel
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vrshl.u16 q9, q9, q15
+
+ vmull.u16 q10, d16, d4 // vertical pass in 32 bit
+ vmlal.u16 q10, d18, d6
+ vmull.u16 q11, d18, d4
+ vmlal.u16 q11, d19, d6
+.ifc \type, put
+ vrshl.u32 q10, q10, q14 // rounding shift right by 4 + intermediate_bits
+ vrshl.u32 q11, q11, q14
+ vmovn.i32 d20, q10
+ vmovn.i32 d21, q11
+.else
+ vrshrn.i32 d20, q10, #4
+ vrshrn.i32 d21, q11, #4
+ vsub.i16 q10, q10, q14 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19 // last filtered row feeds the next iteration
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h // \my is reused to remember the full height for each 8 pixel column
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20, d21, d22}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vmul.i16 q8, q10, q0 // horizontal pass on the first row
+ vmla.i16 q8, q11, q1
+ vrshl.u16 q8, q8, q15
+
+2:
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vld1.16 {d24, d25, d26}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vext.8 q13, q12, q13, #2
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vmul.i16 q10, q12, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+
+ vmull.u16 q11, d16, d4 // vertical pass in 32 bit
+ vmlal.u16 q11, d18, d6
+ vmull.u16 q12, d17, d4
+ vmlal.u16 q12, d19, d6
+ vmull.u16 q8, d18, d4
+ vmlal.u16 q8, d20, d6
+ vmull.u16 q9, d19, d4
+ vmlal.u16 q9, d21, d6
+.ifc \type, put
+ vrshl.u32 q11, q11, q14
+ vrshl.u32 q12, q12, q14
+ vrshl.u32 q8, q8, q14
+ vrshl.u32 q9, q9, q14
+ vmovn.i32 d22, q11
+ vmovn.i32 d23, q12
+ vmovn.i32 d16, q8
+ vmovn.i32 d17, q9
+.else
+ vrshrn.i32 d22, q11, #4
+ vrshrn.i32 d23, q12, #4
+ vrshrn.i32 d16, q8, #4
+ vrshrn.i32 d17, q9, #4
+ vsub.i16 q11, q11, q14
+ vsub.i16 q8, q8, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {q11}, [\dst, :128], \d_strd
+ vst1.16 {q8}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10 // last filtered row feeds the next iteration
+ b 2b
+9:
+ subs \w, \w, #8 // one 8 pixel wide column finished
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src // rewind src by h rows
+ mls \dst, \d_strd, \my, \dst // rewind dst by h rows
+ sub \src, \src, \s_strd, lsl #1 // src advanced 2 rows further than dst (h + 2 rows in total)
+ mov \h, \my
+ add \src, \src, #16 // next column: 8 pixels = 16 bytes
+ add \dst, \dst, #16
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 // expand filter_fn for put with this register assignment
+filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 // expand filter_fn for prep; note the different register order
+
+.macro load_filter_ptr src // r12 = r11 + 8 * (\src >> 10); clobbers r12
+ asr r12, \src, #10 // arithmetic shift keeps the sign of \src
+ add r12, r11, r12, lsl #3 // 8 bytes per table entry, r11 holds the table base
+.endm
+
+.macro load_filter_coef dst, src, inc // load 8 bytes at r12 into \dst and step \src by \inc
+ add \src, \src, \inc // r12 is not updated here; it must have been computed before
+ vld1.8 {\dst}, [r12, :64]
+.endm
+
+.macro load_filter_row dst, src, inc // load_filter_ptr followed by load_filter_coef in one step
+ load_filter_ptr \src
+ load_filter_coef \dst, \src, \inc
+.endm
+
+function warp_filter_horz_neon // filter one row of 8 pixels horizontally; result in q4 (pixels 0-3) and q5 (pixels 4-7) as 32 bit
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q6,q7}, [r2], r3 // 16 source pixels; r2 advances by r3 to the next row
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d2, r5, r7 // filter 1
+ vmovl.s8 q0, d0 // filter 0
+ vext.8 q3, q6, q7, #2*1 // filter 1 pixels
+ vmovl.s8 q1, d2 // filter 1
+
+ vmull.s16 q4, d12, d0 // filter 0 output (0-3)
+ vmull.s16 q5, d13, d1 // filter 0 output (4-7)
+
+ load_filter_ptr r5 // filter 2
+
+ vmull.s16 q2, d6, d2 // filter 1 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 1 output (4-7)
+
+ load_filter_coef d0, r5, r7 // filter 2
+
+ vpadd.i32 d8, d8, d9 // half pixel 0 (2x32)
+ vpadd.i32 d9, d10, d11 // half pixel 0 (2x32)
+
+ load_filter_ptr r5 // filter 3
+
+ vpadd.i32 d4, d4, d5 // half pixel 1 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 1 (2x32)
+
+ vmovl.s8 q0, d0 // filter 2
+ vext.8 q3, q6, q7, #2*2 // filter 2 pixels
+
+ vpadd.i32 d8, d8, d9 // pixel 0 (2x32)
+ vpadd.i32 d9, d4, d5 // pixel 1 (2x32)
+
+ load_filter_coef d2, r5, r7 // filter 3
+
+ vmull.s16 q2, d6, d0 // filter 2 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 2 output (4-7)
+
+ load_filter_ptr r5 // filter 4
+
+ vpadd.i32 d8, d8, d9 // pixel 0,1
+
+ vpadd.i32 d9, d4, d5 // half pixel 2 (2x32)
+ vpadd.i32 d10, d6, d7 // half pixel 2 (2x32)
+
+ vmovl.s8 q1, d2 // filter 3
+ vext.8 q3, q6, q7, #2*3 // filter 3 pixels
+
+ load_filter_coef d0, r5, r7 // filter 4
+
+ vpadd.i32 d9, d9, d10 // pixel 2 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 3 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 3 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 4
+ load_filter_ptr r5 // filter 5
+
+ vpadd.i32 d10, d4, d5 // half pixel 3 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 3 (2x32)
+
+ vext.8 q3, q6, q7, #2*4 // filter 4 pixels
+ load_filter_coef d2, r5, r7 // filter 5
+
+ vpadd.i32 d10, d10, d11 // pixel 3 (2x32)
+
+ vpadd.i32 d9, d9, d10 // pixel 2,3
+
+ vmull.s16 q2, d6, d0 // filter 4 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 4 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 5
+ load_filter_ptr r5 // filter 6
+
+ vpadd.i32 d10, d4, d5 // half pixel 4 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 4 (2x32)
+
+ vext.8 q3, q6, q7, #2*5 // filter 5 pixels
+ load_filter_coef d0, r5, r7 // filter 6
+
+ vpadd.i32 d10, d10, d11 // pixel 4 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 5 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 5 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 6
+ load_filter_ptr r5 // filter 7
+
+ vpadd.i32 d4, d4, d5 // half pixel 5 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 5 (2x32)
+
+ vext.8 q3, q6, q7, #2*6 // filter 6 pixels
+ load_filter_coef d2, r5, r7 // filter 7
+
+ vpadd.i32 d11, d4, d5 // pixel 5 (2x32)
+
+ vmull.s16 q2, d6, d0 // filter 6 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 6 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 7
+
+ vpadd.i32 d10, d10, d11 // pixel 4,5
+
+ vpadd.i32 d4, d4, d5 // half pixel 6 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 6 (2x32)
+
+ vext.8 q3, q6, q7, #2*7 // filter 7 pixels
+
+ vpadd.i32 d11, d4, d5 // pixel 6 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 7 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 7 output (4-7)
+
+ vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits)
+
+ vpadd.i32 d4, d4, d5 // half pixel 7 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 7 (2x32)
+
+ sub r5, r5, r7, lsl #3 // undo the 8 per-pixel increments of r5 by r7
+
+ vpadd.i32 d4, d4, d5 // pixel 7 (2x32)
+
+ add r5, r5, r8 // step r5 by r8 for the next row
+
+ vpadd.i32 d11, d11, d4 // pixel 6,7
+
+ vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits)
+ vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits)
+
+ bx lr // caller narrows q4/q5 to 16 bit
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
+.macro warp t // blank \t: clip to bitdepth_max and store pixels; non-blank \t: store 16 bit values minus PREP_BIAS
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // abcd, mx; 100 = 9 GPRs * 4 + 4 q registers * 16
+ ldrd r6, r7, [sp, #108] // my, bitdepth_max
+ sub sp, sp, #8 // two 32 bit spill slots for the shift amounts
+
+ clz r7, r7
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub r7, r7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg r8, r8 // -(7 + intermediate_bits)
+.endif
+ str r7, [sp] // spill -(7 - intermediate_bits) on stack
+.ifb \t
+ str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack
+.endif
+
+ ldrd r8, r9, [r4] // the four int16_t values of abcd as two 32 bit words
+ sxth r7, r8 // r7 = abcd[0]
+ asr r8, r8, #16 // r8 = abcd[1]
+ asr r4, r9, #16 // r4 = abcd[3]
+ sxth r9, r9 // r9 = abcd[2]
+ mov r10, #8 // 8 output rows
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3 // src -= 3 * src_stride
+ sub r2, r2, #6 // src -= 3 pixels (6 bytes)
+ movrel r11, X(mc_warp_filter), 64*8 // table base biased by 64 entries of 8 bytes
+.ifnb \t
+ lsl r1, r1, #1 // t variant: double the destination stride
+.endif
+ add r5, r5, #512 // rounding bias ahead of the >> 10 in load_filter_ptr
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon // prime q8-q14 with the first 7 horizontally filtered rows
+ vmovn.i32 d16, q4
+ vmovn.i32 d17, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d18, q4
+ vmovn.i32 d19, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d20, q4
+ vmovn.i32 d21, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d22, q4
+ vmovn.i32 d23, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d24, q4
+ vmovn.i32 d25, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d26, q4
+ vmovn.i32 d27, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d28, q4
+ vmovn.i32 d29, q5
+
+1:
+ bl warp_filter_horz_neon // eighth row of the current vertical window
+ vmovn.i32 d30, q4
+ vmovn.i32 d31, q5
+
+ load_filter_row d8, r6, r9 // one vertical filter per output column
+ load_filter_row d9, r6, r9
+ load_filter_row d10, r6, r9
+ load_filter_row d11, r6, r9
+ load_filter_row d12, r6, r9
+ load_filter_row d13, r6, r9
+ load_filter_row d14, r6, r9
+ load_filter_row d15, r6, r9
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
+ vmovl.s8 q1, d8
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3 // undo the 8 increments of r6 by r9
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14 // q2/q3 are free again, reuse them for the last two taps
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ ldr lr, [sp, #4] // -(7 + intermediate_bits)
+ ldr r12, [sp, #120] // bitdepth_max
+ vdup.32 q2, lr // -(7 + intermediate_bits)
+ vdup.16 q3, r12 // bitdepth_max
+.endif
+
+ vmov q8, q9 // slide the window of filtered rows up by one (interleaved with the output math)
+ vmov q9, q10
+.ifb \t
+ vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits)
+ vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits)
+.else
+ vrshrn.s32 d0, q0, #7
+ vrshrn.s32 d1, q1, #7
+ vmov.i16 q3, #PREP_BIAS
+.endif
+ vmov q10, q11
+.ifb \t
+ vqmovun.s32 d0, q0
+ vqmovun.s32 d1, q1
+.else
+ vsub.i16 q0, q0, q3 // PREP_BIAS
+.endif
+ vmov q11, q12
+ vmov q12, q13
+.ifb \t
+ vmin.u16 q0, q0, q3 // bitdepth_max
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1
+ vst1.16 {q0}, [r0, :128], r1
+
+ add r6, r6, r4 // step r6 by abcd[3] for the next row
+ bgt 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+warp // blank \t: emits warp_affine_8x8_16bpc_neon
+warp t // \t = t: emits warp_affine_8x8t_16bpc_neon
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1 // copy the in-bounds part of ref to dst and replicate edge pixels into the extension areas
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36] // x, y
+ ldrd r6, r7, [sp, #44] // dst, dst_stride
+ ldrd r8, r9, [sp, #52] // ref, ref_stride
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub r12, r3, #1 // ih - 1
+ cmp r5, r3
+ sub lr, r2, #1 // iw - 1
+ it lt
+ movlt r12, r5 // min(y, ih - 1)
+ cmp r4, r2
+ bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+ it lt
+ movlt lr, r4 // min(x, iw - 1)
+ bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
+ mla r8, r12, r9, r8 // ref += iclip() * stride
+ add r8, r8, lr, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add r10, r5, r1 // y + bh
+ neg r5, r5 // -y
+ sub r10, r10, r3 // y + bh - ih
+ sub r12, r1, #1 // bh - 1
+ cmp r10, r1
+ bic r5, r5, r5, asr #31 // max(-y, 0)
+ it ge
+ movge r10, r12 // min(y + bh - ih, bh-1)
+ cmp r5, r1
+ bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+ it ge
+ movge r5, r12 // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add r11, r4, r0 // x + bw
+ neg r4, r4 // -x
+ sub r11, r11, r2 // x + bw - iw
+ sub lr, r0, #1 // bw - 1
+ cmp r11, r0
+ bic r4, r4, r4, asr #31 // max(-x, 0)
+ it ge
+ movge r11, lr // min(x + bw - iw, bw-1)
+ cmp r4, r0
+ bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+ it ge
+ movge r4, lr // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub r1, r1, r5 // bh - top_ext
+ mla r6, r5, r7, r6
+ sub r2, r0, r4 // bw - left_ext
+ sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext
+ sub r2, r2, r11 // center_w = bw - left_ext - right_ext
+
+ mov r0, r6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ vld1.16 {d0[], d1[]}, [r8] // broadcast the first source pixel of the row
+ mov r12, r6 // out = dst
+ mov r3, r4
+ vmov q1, q0
+1:
+ subs r3, r3, #16 // 16 pixels per store; may run past left_ext, the center copy below rewrites that part
+ vst1.16 {q0, q1}, [r12, :128]!
+ bgt 1b
+.endif
+ mov lr, r8
+ add r12, r6, r4, lsl #1 // out = dst + left_ext
+ mov r3, r2
+1:
+ vld1.16 {q0, q1}, [lr]!
+ subs r3, r3, #32 // 32 pixels copied per iteration
+ vld1.16 {q2, q3}, [lr]!
+.if \need_left
+ vst1.16 {q0, q1}, [r12]!
+ vst1.16 {q2, q3}, [r12]!
+.else
+ vst1.16 {q0, q1}, [r12, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+.endif
+ bgt 1b
+.if \need_right
+ add r3, r8, r2, lsl #1 // in + center_w
+ sub r3, r3, #2 // in + center_w - 1
+ add r12, r6, r4, lsl #1 // dst + left_ext
+ vld1.16 {d0[], d1[]}, [r3] // broadcast the last center pixel
+ add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w
+ mov r3, r11
+ vmov q1, q0
+1:
+ subs r3, r3, #16
+ vst1.16 {q0, q1}, [r12]!
+ bgt 1b
+.endif
+
+ subs r1, r1, #1 // center_h--
+ add r6, r6, r7
+ add r8, r8, r9
+ bgt 0b
+.endm
+
+ cmp r4, #0
+ beq 2f
+ // need_left
+ cmp r11, #0
+ beq 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cmp r11, #0
+ beq 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+ cmp r10, #0
+ // Storing the original dst in r0 overwrote bw; recalculate it here
+ add r2, r2, r4 // center_w + left_ext
+ add r2, r2, r11 // bw = center_w + left_ext + right_ext
+
+ beq 3f // flags still come from the cmp r10, #0 above (add does not set them)
+ // need_bottom
+ sub r8, r6, r7 // ref = dst - stride
+ mov r4, r2
+ sub r12, r7, #32 // stride minus the 32 bytes of the first post-incremented store
+1:
+ vld1.16 {q0, q1}, [r8, :128]!
+ mov r3, r10
+ vld1.16 {q2, q3}, [r8, :128]!
+2:
+ vst1.16 {q0, q1}, [r6, :128]!
+ subs r3, r3, #1
+ vst1.16 {q2, q3}, [r6, :128], r12
+ bgt 2b
+ mls r6, r7, r10, r6 // dst -= bottom_ext * stride
+ subs r4, r4, #32 // bw -= 32
+ add r6, r6, #64 // dst += 32
+ bgt 1b
+
+3:
+ cmp r5, #0
+ beq 3f
+ // need_top
+ mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
+ sub r12, r7, #32
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ mov r3, r5
+ vld1.16 {q2, q3}, [r0, :128]!
+2:
+ vst1.16 {q0, q1}, [r6, :128]!
+ subs r3, r3, #1
+ vst1.16 {q2, q3}, [r6, :128], r12
+ bgt 2b
+ mls r6, r7, r5, r6 // dst -= top_ext * stride
+ subs r2, r2, #32 // bw -= 32
+ add r6, r6, #64 // dst += 32
+ bgt 1b
+
+3:
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S
new file mode 100644
index 0000000000..b06e109dda
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/msac.S
@@ -0,0 +1,575 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 4
+#define DIF 8
+#define RNG 12
+#define CNT 16
+#define ALLOW_UPDATE_CDF 20
+
+const coeffs // descending multiples of 4 followed by zeros; read starting at entry 15 - n_symbols
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits, align=4 // lane i holds 1 << i; ANDed with a compare result to get one mask bit per lane
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro vld1_align_n d0, q0, q1, src, n // aligned load of \n halfwords from [\src]
+.if \n == 4
+ vld1.16 {\d0}, [\src, :64] // 4 halfwords into \d0
+.elseif \n == 8
+ vld1.16 {\q0}, [\src, :128] // 8 halfwords into \q0
+.else
+ vld1.16 {\q0, \q1}, [\src, :128] // any other \n: 16 halfwords into \q0 and \q1
+.endif
+.endm
+
+.macro vld1_n d0, q0, q1, src, n // load of \n halfwords from [\src] without an alignment hint
+.if \n == 4
+ vld1.16 {\d0}, [\src]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src]
+.else
+ vld1.16 {\q0, \q1}, [\src] // any other \n: 16 halfwords
+.endif
+.endm
+
+.macro vst1_align_n d0, q0, q1, src, n // aligned store of \n halfwords; \src is the destination address register
+.if \n == 4
+ vst1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src, :128]
+.else
+ vst1.16 {\q0, \q1}, [\src, :128] // any other \n: 16 halfwords
+.endif
+.endm
+
+.macro vst1_n d0, q0, q1, src, n // store of \n halfwords without an alignment hint; \src is the destination address register
+.if \n == 4
+ vst1.16 {\d0}, [\src]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src]
+.else
+ vst1.16 {\q0, \q1}, [\src] // any other \n: 16 halfwords
+.endif
+.endm
+
+.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vshr.u16 on the operand set matching \n
+.if \n == 4
+ vshr.u16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vshr.u16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vshr.u16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vadd.i16 on the operand set matching \n
+.if \n == 4
+ vadd.i16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vadd.i16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vadd.i16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vsub.i16 on the operand set matching \n
+.if \n == 4
+ vsub.i16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vsub.i16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vsub.i16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vand on the operand set matching \n
+.if \n == 4
+ vand \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vand \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vand \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // unsigned 16 bit >= compare on the operand set matching \n
+.if \n == 4
+ vcge.u16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vcge.u16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vcge.u16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vrhadd.u16 (rounding halving add) on the operand set matching \n
+.if \n == 4
+ vrhadd.u16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vrhadd.u16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vrhadd.u16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vshl.s16 by a register (negative counts shift right) on the operand set matching \n
+.if \n == 4
+ vshl.s16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vshl.s16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vshl.s16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n // vqdmulh.s16 (saturating doubling multiply, high half) on the operand set matching \n
+.if \n == 4
+ vqdmulh.s16 \d0, \s0, \s3 // \n == 4: first operand triple
+.else
+ vqdmulh.s16 \d1, \s1, \s4 // otherwise: second operand triple
+.if \n == 16
+ vqdmulh.s16 \d2, \s2, \s5 // \n == 16: third operand triple as well
+.endif
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1 // r0 = s, r1 = cdf, r2 = n_symbols; returns the decoded symbol index in r0
+.macro decode_update n // shared decode + CDF update body for vectors of \n halfwords; leaves ret in lr and v[] on the stack
+ push {r4-r10,lr}
+ sub sp, sp, #48 // scratch space: u is stored at sp + 14, the v values from sp + 16
+ add r8, r0, #RNG
+
+ vld1_align_n d0, q0, q1, r1, \n // cdf
+ vld1.16 {d16[]}, [r8, :16] // rng
+ movrel_local r9, coeffs, 30
+ vmov.i16 d30, #0x7f00 // 0x7f00
+ sub r9, r9, r2, lsl #1 // &coeffs[15 - n_symbols]
+ vmvn.i16 q14, #0x3f // 0xffc0
+ add r8, sp, #14
+ vand d22, d16, d30 // rng & 0x7f00
+ vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng
+ vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0
+.if \n > 4
+ vmov d23, d22
+.endif
+
+ vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret)
+ vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r8, r0, #DIF + 2 // address of the upper halfword of the 32 bit dif field
+
+ vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+.if \n == 4
+ vmov.i16 d17, #0
+.endif
+ vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ add r9, sp, #16
+ vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16)
+ movrel_local r8, bits
+ vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access
+
+ vmov d21, d20
+ vld1_align_n q12, q12, q13, r8, \n
+.if \n == 16
+ vmov q11, q10
+.endif
+
+ vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v
+
+ vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask
+.if \n == 16
+ vadd.i16 q10, q10, q11
+.endif
+ vadd.i16 d20, d20, d21 // Aggregate mask bits
+ ldr r4, [r0, #ALLOW_UPDATE_CDF]
+ vpadd.i16 d20, d20, d20
+ lsl r10, r2, #1 // byte offset of cdf[n_symbols]
+ vpadd.i16 d20, d20, d20
+ vmov.u16 r3, d20[0] // combined mask, one bit per lane with c >= v
+ cmp r4, #0
+ rbit r3, r3 // reverse the bits so that clz finds the lowest set mask bit
+ clz lr, r3 // ret
+
+ beq L(renorm) // the ALLOW_UPDATE_CDF field is zero: skip the CDF update
+ // update_cdf
+ ldrh r3, [r1, r10] // count = cdf[n_symbols]
+ vmov.i8 q10, #0xff
+.if \n == 16
+ mov r4, #-5
+.else
+ mvn r12, r2
+ mov r4, #-4
+ cmn r12, #3 // set C if n_symbols <= 2
+.endif
+ vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub r4, r4, r3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr r12, r3, #4 // count >> 4
+ sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+.if \n == 4
+ vdup.16 d20, r4 // -rate
+.else
+ vdup.16 q10, r4 // -rate
+.endif
+
+ sub r3, r3, r3, lsr #5 // count - (count == 32)
+ vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 1 : 0)
+ vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
+ add r3, r3, #1 // count + (count < 32)
+ vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate
+ vst1_align_n d0, q0, q1, r1, \n
+ strh r3, [r1, r10]
+.endm
+
+ decode_update 4
+
+L(renorm): // shared tail; entered with ret in lr and u/v[] in the stack scratch area
+ add r8, sp, #16
+ add r8, r8, lr, lsl #1 // &v[ret]
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+L(renorm2):
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ mvn r7, r7 // ~dif
+ bhs 9f // no borrow from cnt -= d: skip the refill
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f // buf_pos + 4 > buf_end: use the byte-wise refill
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f // buf_pos reached buf_end: stop reading
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ str r6, [r0, #CNT]
+ str r7, [r0, #DIF]
+
+ mov r0, lr // return value = ret
+ add sp, sp, #48 // release the scratch area reserved in decode_update
+
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+ decode_update 8 // expand the decode_update macro for n = 8
+ b L(renorm) // shared tail: renormalize, store the state and return lr
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+ decode_update 16 // expand the decode_update macro for n = 16
+ b L(renorm) // shared tail: renormalize, store the state and return lr
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+ push {r4-r10,lr}
+ vld1.16 {d0}, [r1, :64] // cdf
+ add r4, r0, #RNG
+ vmov.i16 d31, #0x7f00 // 0x7f00
+ movrel_local r5, coeffs, 30-2*3
+ vmvn.i16 d30, #0x3f // 0xffc0
+ ldrh r9, [r1, #6] // count = cdf[n_symbols]
+ vld1.16 {d1[]}, [r4, :16] // rng
+ movrel_local r4, bits
+ vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret)
+ add r5, r0, #DIF + 2
+ vld1.16 {q8}, [r4, :128] // per-halfword mask bits from the bits table
+ mov r2, #-24 // biased accumulator; the result is (r2 + 30) >> 1 at the end
+ vand d20, d0, d30 // cdf & 0xffc0
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48 // scratch space for the u/v halfwords stored below
+ ldr r6, [r0, #CNT] // cnt
+ ldr r7, [r0, #DIF] // dif
+ vmov d3, d2
+1:
+ vand d23, d1, d31 // rng & 0x7f00
+ vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r12, sp, #14
+ vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ vmov.i16 d7, #0
+ vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng
+ add r12, sp, #16
+ vcge.u16 q2, q1, q3 // c >= v
+ vst1.16 {q3}, [r12] // store v values to allow indexed access
+ vand q9, q2, q8 // One bit per halfword set in the mask
+
+ vadd.i16 d18, d18, d19 // Aggregate mask bits
+ vpadd.i16 d18, d18, d18
+ vpadd.i16 d18, d18, d18
+ vmov.u16 r3, d18[0]
+ cmp r10, #0
+ add r2, r2, #5 // pre-add 5; label 9 adds 2*ret - 5, so r2 gains 2*ret per round
+ rbit r3, r3
+ add r8, sp, #16
+ clz lr, r3 // ret
+
+ beq 2f
+ // update_cdf
+ vmov.i8 d22, #0xff
+ mov r4, #-5
+ vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768
+ sub r4, r4, r9, lsr #4 // -((count >> 4) + 5)
+ vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i])
+ vdup.16 d18, r4 // -rate
+
+ sub r9, r9, r9, lsr #5 // count - (count == 32)
+ vsub.i16 d0, d0, d4 // cdf + (i >= val ? 1 : 0)
+ vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate
+ add r9, r9, #1 // count + (count < 32)
+ vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate
+ vst1.16 {d0}, [r1, :64]
+ vand d20, d0, d30 // cdf & 0xffc0
+ strh r9, [r1, #6]
+
+2:
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ vdup.16 d1, r4 // broadcast the new rng for the next round
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ lsl lr, lr, #1 // 2 * ret
+ sub lr, lr, #5 // 2 * ret - 5
+ lsr r12, r7, #16 // dif >> 16
+ adds r2, r2, lr // carry = tok_br < 3 || tok == 15
+ vdup.16 q1, r12 // broadcast the low halfword of dif >> 16 for the next compare
+ bcc 1b // loop if !carry
+ add r2, r2, #30
+ str r6, [r0, #CNT]
+ add sp, sp, #48
+ str r7, [r0, #DIF]
+ lsr r0, r2, #1 // return (r2 + 30) >> 1
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG] // r
+ ldr r6, [r0, #CNT] // cnt, consumed by L(renorm2)
+ sub sp, sp, #48 // undone by the add sp, sp, #48 in the shared tail
+ ldr r7, [r0, #DIF] // dif
+ bic r4, r5, #0xff // r &= 0xff00
+ add r4, r4, #8 // (r & 0xff00) + 8, i.e. 2 * v
+ mov r2, #0 // result = 0 unless dif < vw
+ subs r8, r7, r4, lsl #15 // dif - vw
+ lsr r4, r4, #1 // v
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1 // result = 1 when dif < vw
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2 // the shared tail returns lr
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG] // r
+ ldr r6, [r0, #CNT] // cnt, consumed by L(renorm2)
+ sub sp, sp, #48 // undone by the add sp, sp, #48 in the shared tail
+ ldr r7, [r0, #DIF] // dif
+ lsr r4, r5, #8 // r >> 8
+ bic r1, r1, #0x3f // f &= ~63
+ mul r4, r4, r1 // (r >> 8) * f
+ mov r2, #0 // result = 0 unless dif < vw
+ lsr r4, r4, #7 // ((r >> 8) * f) >> 7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1 // result = 1 when dif < vw
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2 // the shared tail returns lr
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+ push {r4-r10,lr}
+ ldr r9, [r1] // cdf[0-1]
+ ldr r5, [r0, #RNG] // r
+ movw lr, #0xffc0
+ ldr r6, [r0, #CNT] // cnt, consumed by L(renorm2)
+ sub sp, sp, #48 // undone by the add sp, sp, #48 in the shared tail
+ ldr r7, [r0, #DIF] // dif
+ lsr r4, r5, #8 // r >> 8
+ and r2, r9, lr // f &= ~63
+ mul r4, r4, r2 // (r >> 8) * f
+ mov r2, #0 // result = 0 unless dif < vw
+ lsr r4, r4, #7 // ((r >> 8) * f) >> 7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ itee lo
+ movlo r2, #1 // result = 1 when dif < vw
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ cmp r10, #0 // flags stay valid until the beq below
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2 // bit; also the value the shared tail returns
+
+ beq L(renorm2) // CDF update disabled: go straight to the shared tail
+
+ lsr r2, r9, #16 // count = cdf[1]
+ uxth r9, r9 // cdf[0]
+
+ sub r3, r2, r2, lsr #5 // count - (count >= 32)
+ lsr r2, r2, #4 // count >> 4
+ add r10, r3, #1 // count + (count < 32)
+ add r2, r2, #4 // rate = (count >> 4) | 4
+
+ sub r9, r9, lr // cdf[0] -= bit
+ sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub r9, r9, r3 // cdf[0]
+
+ strh r9, [r1] // store the updated cdf[0]
+ strh r10, [r1, #2] // store the updated count in cdf[1]
+
+ b L(renorm2)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/refmvs.S b/third_party/dav1d/src/arm/32/refmvs.S
new file mode 100644
index 0000000000..7f31db11eb
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/refmvs.S
@@ -0,0 +1,303 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
+
+function splat_mv_neon, export=1
+ push {r4, lr}
+ vld1.8 {q3}, [r1] // 16 bytes from rmv
+ ldr r4, [sp, #8] // bh4 per the prototype above: first stack argument, past the two pushed registers
+ clz r3, r3 // clz(bw4)
+ adr lr, L(splat_tbl)
+ sub r3, r3, #26 // table index = clz(bw4) - 26
+ vext.8 q2, q3, q3, #12
+ ldr r3, [lr, r3, lsl #2] // offset of the store routine relative to L(splat_tbl)
+ add r2, r2, r2, lsl #1 // bx4 * 3
+ vext.8 q0, q2, q3, #4
+ add r3, lr, r3 // address of the store routine
+ vext.8 q1, q2, q3, #8
+ lsl r2, r2, #2 // bx4 * 12
+ vext.8 q2, q2, q3, #12
+ vmov q3, q0
+1:
+ ldr r1, [r0], #4 // *rr++
+ subs r4, r4, #1 // bh4--; the flags are consumed by the bgt in the store routine
+ add r1, r1, r2 // advance by bx4 * 12 bytes
+ bx r3 // jump to the store routine selected above
+
+ .align 2
+L(splat_tbl):
+ .word 320f - L(splat_tbl) + CONFIG_THUMB
+ .word 160f - L(splat_tbl) + CONFIG_THUMB
+ .word 80f - L(splat_tbl) + CONFIG_THUMB
+ .word 40f - L(splat_tbl) + CONFIG_THUMB
+ .word 20f - L(splat_tbl) + CONFIG_THUMB
+ .word 10f - L(splat_tbl) + CONFIG_THUMB
+
+10:
+ vst1.8 {d0}, [r1] // 8 bytes
+ vstr s2, [r1, #8] // + 4 bytes = 12 bytes
+ bgt 1b
+ pop {r4, pc}
+20:
+ vst1.8 {q0}, [r1] // 16 bytes
+ vstr d2, [r1, #16] // + 8 bytes = 24 bytes
+ bgt 1b
+ pop {r4, pc}
+40:
+ vst1.8 {q0, q1}, [r1]! // 32 bytes
+ vst1.8 {q2}, [r1] // + 16 bytes = 48 bytes
+ bgt 1b
+ pop {r4, pc}
+320:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]! // falls through into 160
+160:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]! // falls through into 80
+80:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]
+ bgt 1b
+ pop {r4, pc}
+endfunc
+
+const mv_tbls, align=4
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // case 0: every index out of range, so vtbl yields zeros
+ .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 // case 1: source bytes 0-3 and 8, repeated
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 // case 2: source bytes 4-7 and 9, repeated
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 // case 3: same pattern as case 2
+endconst
+
+const mask_mult, align=4
+ .byte 1, 2, 1, 2, 0, 0, 0, 0
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+// refmvs_block **rr, const uint8_t *ref_sign,
+// int col_end8, int row_end8,
+// int col_start8, int row_start8)
+function save_tmvs_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36] // col_end8, row_end8
+ ldrd r6, r7, [sp, #44] // col_start8, row_start8
+
+ vmov.i8 d30, #0
+ vld1.8 {d31}, [r3]
+ adr r8, L(save_tmvs_tbl)
+ movrel_local lr, mask_mult
+ movrel_local r12, mv_tbls
+ vld1.8 {d29}, [lr]
+ vext.8 d31, d30, d31, #7 // [0, ref_sign]
+ mov r3, #5
+ mul r1, r1, r3 // stride *= 5
+ sub r5, r5, r7 // h = row_end8 - row_start8
+ lsl r7, r7, #1 // row_start8 <<= 1
+1:
+ mov r3, #5
+ mov r11, #12*2
+ and r9, r7, #30 // (y & 15) * 2
+ ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
+ add r9, r9, #12 // &b[... + 1]
+ mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
+ mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
+
+ mla r3, r6, r3, r0 // &rp[x]
+
+ push {r2,r4,r6} // the inner loop reuses r2, r4 and r6 as scratch
+
+2:
+ ldrb r11, [r9, #10] // cand_b->bs
+ add lr, r9, #8
+ vld1.8 {d0, d1}, [r9] // cand_b->mv
+ add r11, r8, r11, lsl #3
+ vld1.16 {d2[]}, [lr] // cand_b->ref
+ ldrh lr, [r11] // bw8
+ mov r2, r8
+ add r9, r9, lr, lsl #1 // cand_b += bw8*2
+ cmp r9, r10
+ vmov d4, d0
+ bge 3f
+
+ ldrb r2, [r9, #10] // cand_b->bs
+ add lr, r9, #8
+ vld1.8 {d6, d7}, [r9] // cand_b->mv
+ add r2, r8, r2, lsl #3
+ vld1.16 {d2[1]}, [lr] // cand_b->ref
+ ldrh lr, [r2] // bw8
+ add r9, r9, lr, lsl #1 // cand_b += bw8*2
+ vmov d5, d6
+
+3:
+ vabs.s16 q2, q2 // abs(mv[].xy)
+ vtbl.8 d2, {d31}, d2 // ref_sign[ref]
+ vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12
+ vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2}
+ vceq.i32 q2, q2, #0 // abs(mv[].xy) < 4096
+ vmovn.i32 d4, q2 // abs() condition to 16 bit
+ vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1]
+ vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0]
+ vmov.u16 r4, d2[0] // Extract case for first block
+ vmov.u16 r6, d2[1]
+ ldr r11, [r11, #4] // Fetch jump table entry
+ ldr r2, [r2, #4]
+ add r4, r12, r4, lsl #4
+ add r6, r12, r6, lsl #4
+ vld1.8 {d2, d3}, [r4] // Load permutation table base on case
+ vld1.8 {d4, d5}, [r6]
+ add r11, r8, r11 // Find jump table target
+ add r2, r8, r2
+ vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block
+ vtbl.8 d17, {d0, d1}, d3
+ vtbl.8 d18, {d6, d7}, d4
+ vtbl.8 d19, {d6, d7}, d5
+ vmov q0, q8
+
+ // q1 follows on q0 (q8), with another 3 full repetitions of the pattern.
+ vext.8 q1, q8, q8, #1
+ vext.8 q10, q9, q9, #1
+ // q2 ends with 3 complete repetitions of the pattern.
+ vext.8 q2, q8, q1, #4
+ vext.8 q11, q9, q10, #4
+
+ blx r11 // store routine for the first block; it returns via bx lr
+ bge 4f // if (cand_b >= end)
+ vmov q0, q9
+ vmov q1, q10
+ vmov q2, q11
+ cmp r9, r10
+ blx r2 // store routine for the second block
+ blt 2b // if (cand_b < end)
+
+4:
+ pop {r2,r4,r6} // restore rr, col_end8 and col_start8
+
+ subs r5, r5, #1 // h--
+ add r7, r7, #2 // y += 2
+ add r0, r0, r1 // rp += stride
+ bgt 1b
+
+ pop {r4-r11,pc}
+
+ .align 2
+L(save_tmvs_tbl):
+ .word 16 * 12
+ .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 16 * 12
+ .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 8 * 12
+ .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 4 * 12
+ .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 2 * 12
+ .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+ .word 1 * 12
+ .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB
+
+10:
+ add r4, r3, #4
+ vst1.32 {d0[0]}, [r3]
+ vst1.8 {d0[4]}, [r4]
+ add r3, r3, #5
+ bx lr
+20:
+ add r4, r3, #8
+ vst1.8 {d0}, [r3]
+ vst1.16 {d1[0]}, [r4]
+ add r3, r3, #2*5
+ bx lr
+40:
+ add r4, r3, #16
+ vst1.8 {q0}, [r3]
+ vst1.32 {d2[0]}, [r4]
+ add r3, r3, #4*5
+ bx lr
+80:
+ add r4, r3, #(8*5-16)
+ // This writes 6 full entries plus 2 extra bytes
+ vst1.8 {q0, q1}, [r3]
+ // Write the last few, overlapping with the first write.
+ vst1.8 {q2}, [r4]
+ add r3, r3, #8*5
+ bx lr
+160:
+ add r4, r3, #6*5
+ add r6, r3, #12*5
+ // This writes 6 full entries plus 2 extra bytes
+ vst1.8 {q0, q1}, [r3]
+ // Write another 6 full entries, slightly overlapping with the first set
+ vst1.8 {q0, q1}, [r4]
+ add r4, r3, #(16*5-16)
+ // Write 8 bytes (one full entry) after the first 12
+ vst1.8 {d0}, [r6]
+ // Write the last 3 entries
+ vst1.8 {q2}, [r4]
+ add r3, r3, #16*5
+ bx lr
+endfunc
diff --git a/third_party/dav1d/src/arm/32/util.S b/third_party/dav1d/src/arm/32/util.S
new file mode 100644
index 0000000000..c3710d3767
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/util.S
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_32_UTIL_S
+#define DAV1D_SRC_ARM_32_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
+.macro movrel_local rd, val, offset=0
+#if defined(PIC)
+ ldr \rd, 90001f
+ b 90002f
+90001:
+ .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
+90002:
+ add \rd, \rd, pc
+#else
+ movw \rd, #:lower16:\val+\offset
+ movt \rd, #:upper16:\val+\offset
+#endif
+.endm
+
+.macro movrel rd, val, offset=0
+#if defined(PIC) && defined(__APPLE__)
+ ldr \rd, 1f
+ b 2f
+1:
+ .word 3f - (2f + 8 - 4 * CONFIG_THUMB)
+2:
+ ldr \rd, [pc, \rd]
+.if \offset < 0
+ sub \rd, \rd, #-(\offset)
+.elseif \offset > 0
+ add \rd, \rd, #\offset
+.endif
+ .non_lazy_symbol_pointer
+3:
+ .indirect_symbol \val
+ .word 0
+ .text
+#else
+ movrel_local \rd, \val, \offset
+#endif
+.endm
+
+// This macro clobbers r7 (and r12 on windows) and stores data at the
+// bottom of the stack; sp is the start of the space allocated that
+// the caller can use.
+.macro sub_sp_align space
+#if CONFIG_THUMB
+ mov r7, sp
+ and r7, r7, #15
+#else
+ and r7, sp, #15
+#endif
+ sub sp, sp, r7
+ // Now the stack is aligned, store the amount of adjustment back
+ // on the stack, as we don't want to waste a register as frame
+ // pointer.
+ str r7, [sp, #-16]!
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub r7, sp, #4096
+ ldr r12, [r7]
+ sub r7, r7, #(\space - 4096)
+ mov sp, r7
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro add_sp_align space
+.if \space >= 4096
+ add sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ add sp, sp, #(\space)%4096
+.endif
+ ldr r7, [sp], #16
+ // Add back the original stack adjustment
+ add sp, sp, r7
+.endm
+
+.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \q0, \q2
+ vtrn.32 \q1, \q3
+
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7
+ vswp \d0, \d4
+ vswp \d1, \d5
+ vswp \d2, \d6
+ vswp \d3, \d7
+
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
+.macro transpose_4x8b q0, q1, r0, r1, r2, r3
+ vtrn.16 \q0, \q1
+
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vswp \r1, \r4 // vtrn.64 \q0, \q2
+ vswp \r3, \r6 // vtrn.64 \q1, \q3
+
+ vtrn.32 \q0, \q1
+ vtrn.32 \q2, \q3
+.endm
+
+.macro transpose_4x4h q0, q1, r0, r1, r2, r3
+ vtrn.32 \q0, \q1
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+#endif /* DAV1D_SRC_ARM_32_UTIL_S */
diff --git a/third_party/dav1d/src/arm/64/cdef.S b/third_party/dav1d/src/arm/64/cdef.S
new file mode 100644
index 0000000000..32b258aba8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef.S
@@ -0,0 +1,520 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ sub \s1, \s1, #2
+ sub \s2, \s2, #2
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr s1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr s3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str \rw\()0, [x0]
+ str d1, [x0, #2*\w]
+ add x0, x0, #2*\stride
+ str \rw\()2, [x0]
+ str d3, [x0, #2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr h1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr h3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str \rw\()0, [x0]
+ str s1, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+ add x0, x0, #2*\stride
+ str \rw\()2, [x0]
+ str s3, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr h1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr h3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \rw\()2, [x0, #4]
+ str s3, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr \rn\()1, [\s2]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \rw\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+ ld1 {\dst\().s}[0], [\src], \incr
+.else
+ ld1 {\dst\().8b}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func w, stride, rn, rw
+function cdef_padding\w\()_8bpc_neon, export=1
+ cmp w7, #0xf // fully edged
+ b.eq cdef_padding\w\()_edged_8bpc_neon
+ movi v30.8h, #0x80, lsl #8
+ mov v31.16b, v30.16b
+ sub x0, x0, #2*(2*\stride+2)
+ tst w7, #4 // CDEF_HAVE_TOP
+ b.ne 1f
+ // !CDEF_HAVE_TOP
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add x9, x4, x2
+ pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
+
+ // Middle section
+3:
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ld1 {v0.h}[0], [x3], #2
+ ldr h2, [x1, #\w]
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ str s0, [x0]
+ stur \rw\()1, [x0, #4]
+ str s2, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ld1 {v0.h}[0], [x3], #2
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s0, [x0]
+ stur \rw\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+ b 3f
+2:
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldr h1, [x1, #\w]
+ load_n_incr v0, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr v0, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+
+3:
+ tst w7, #8 // CDEF_HAVE_BOTTOM
+ b.ne 1f
+ // !CDEF_HAVE_BOTTOM
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ ret
+1:
+ // CDEF_HAVE_BOTTOM
+ add x9, x5, x2
+ pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1
+endfunc
+.endm
+
+padding_func 8, 16, d, q
+padding_func 4, 8, s, d
+
+// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg
+function cdef_padding\w\()_edged_8bpc_neon, export=1
+ sub x4, x4, #2
+ sub x5, x5, #2
+ sub x0, x0, #(2*\stride+2)
+
+.if \w == 4
+ ldr d0, [x4]
+ ldr d1, [x4, x2]
+ st1 {v0.8b, v1.8b}, [x0], #16
+.else
+ add x9, x4, x2
+ ldr d0, [x4]
+ ldr s1, [x4, #8]
+ ldr d2, [x9]
+ ldr s3, [x9, #8]
+ str d0, [x0]
+ str s1, [x0, #8]
+ str d2, [x0, #\stride]
+ str s3, [x0, #\stride+8]
+ add x0, x0, #2*\stride
+.endif
+
+0:
+ ld1 {v0.h}[0], [x3], #2
+ ldr h2, [x1, #\w]
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ str h0, [x0]
+ stur \reg\()1, [x0, #2]
+ str h2, [x0, #2+\w]
+ add x0, x0, #\stride
+ b.gt 0b
+
+.if \w == 4
+ ldr d0, [x5]
+ ldr d1, [x5, x2]
+ st1 {v0.8b, v1.8b}, [x0], #16
+.else
+ add x9, x5, x2
+ ldr d0, [x5]
+ ldr s1, [x5, #8]
+ ldr d2, [x9]
+ ldr s3, [x9, #8]
+ str d0, [x0]
+ str s1, [x0, #8]
+ str d2, [x0, #\stride]
+ str s3, [x0, #\stride+8]
+.endif
+ ret
+endfunc
+.endm
+
+padding_func_edged 8, 16, d
+padding_func_edged 4, 8, s
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d1, d2, w
+.if \w == 8
+ add x6, x2, w9, sxtb // x + off
+ sub x9, x2, w9, sxtb // x - off
+ ld1 {\d1\().d}[0], [x6] // p0
+ add x6, x6, #16 // += stride
+ ld1 {\d2\().d}[0], [x9] // p1
+ add x9, x9, #16 // += stride
+ ld1 {\d1\().d}[1], [x6] // p0
+ ld1 {\d2\().d}[1], [x9] // p1
+.else
+ add x6, x2, w9, sxtb // x + off
+ sub x9, x2, w9, sxtb // x - off
+ ld1 {\d1\().s}[0], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[0], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[1], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[1], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[2], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[2], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[3], [x6] // p0
+ ld1 {\d2\().s}[3], [x9] // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+ umin v3.16b, v3.16b, \s1\().16b
+ umax v4.16b, v4.16b, \s1\().16b
+ umin v3.16b, v3.16b, \s2\().16b
+ umax v4.16b, v4.16b, \s2\().16b
+.endif
+ uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
+ uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
+ ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
+ ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
+ uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+ cmhi v18.16b, v0.16b, \s1\().16b // px > p0
+ cmhi v22.16b, v0.16b, \s2\().16b // px > p1
+ umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
+ umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
+ dup v19.16b, \tap // taps[k]
+ neg v16.16b, v17.16b // -imin()
+ neg v20.16b, v21.16b // -imin()
+ bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
+ bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
+ mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
+ mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint8_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h);
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_8bpc_neon
+.if \pri
+ movrel x8, pri_taps
+ and w9, w3, #1
+ add x8, x8, w9, uxtw #1
+.endif
+ movrel x9, directions\w
+ add x5, x9, w5, uxtw #1
+ movi v30.8b, #7
+ dup v28.8b, w6 // damping
+
+.if \pri
+ dup v25.16b, w3 // threshold
+.endif
+.if \sec
+ dup v27.16b, w4 // threshold
+.endif
+ trn1 v24.8b, v25.8b, v27.8b
+ clz v24.8b, v24.8b // clz(threshold)
+ sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
+ uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
+ neg v24.8b, v24.8b // -shift
+.if \sec
+ dup v26.16b, v24.b[1]
+.endif
+.if \pri
+ dup v24.16b, v24.b[0]
+.endif
+
+1:
+.if \w == 8
+ add x12, x2, #16
+ ld1 {v0.d}[0], [x2] // px
+ ld1 {v0.d}[1], [x12] // px
+.else
+ add x12, x2, #1*8
+ add x13, x2, #2*8
+ add x14, x2, #3*8
+ ld1 {v0.s}[0], [x2] // px
+ ld1 {v0.s}[1], [x12] // px
+ ld1 {v0.s}[2], [x13] // px
+ ld1 {v0.s}[3], [x14] // px
+.endif
+
+ // We need 9-bits or two 8-bit accumulators to fit the sum.
+ // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
+ // Start sum at -1 instead of 0 to help handle rounding later.
+ movi v1.16b, #255 // sum
+ movi v2.16b, #0 // sum
+.if \min
+ mov v3.16b, v0.16b // min
+ mov v4.16b, v0.16b // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov w11, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrb w9, [x5] // off1
+
+ load_px_8 v5, v6, \w
+.endif
+
+.if \sec
+ add x5, x5, #4 // +2*2
+ ldrb w9, [x5] // off2
+ load_px_8 v28, v29, \w
+.endif
+
+.if \pri
+ ldrb w10, [x8] // *pri_taps
+
+ handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
+.endif
+
+.if \sec
+ add x5, x5, #8 // +2*4
+ ldrb w9, [x5] // off3
+ load_px_8 v5, v6, \w
+
+ handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
+
+ handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
+
+ sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
+ subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
+ add x8, x8, #1 // pri_taps++ (pointer)
+.endif
+ b.ne 2b
+
+ // Perform halving adds since the value won't fit otherwise.
+ // To handle the offset for negative values, use both halving w/ and w/o rounding.
+ srhadd v5.16b, v1.16b, v2.16b // sum >> 1
+ shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
+ cmlt v1.16b, v5.16b, #0 // sum < 0
+ bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
+
+ srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
+
+ usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
+.if \min
+ umin v0.16b, v0.16b, v4.16b
+ umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
+.endif
+.if \w == 8
+ st1 {v0.d}[0], [x0], x1
+ add x2, x2, #2*16 // tmp += 2*tmp_stride
+ subs w7, w7, #2 // h -= 2
+ st1 {v0.d}[1], [x0], x1
+.else
+ st1 {v0.s}[0], [x0], x1
+ add x2, x2, #4*8 // tmp += 4*tmp_stride
+ st1 {v0.s}[1], [x0], x1
+ subs w7, w7, #4 // h -= 4
+ st1 {v0.s}[2], [x0], x1
+ st1 {v0.s}[3], [x0], x1
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub x5, x5, #2
+.if \pri
+ sub x8, x8, #2
+.endif
+
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
diff --git a/third_party/dav1d/src/arm/64/cdef16.S b/third_party/dav1d/src/arm/64/cdef16.S
new file mode 100644
index 0000000000..ecf864a26d
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef16.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// Copy two rows of 16 bit pixels, read from s1 and s2, into the padded
+// buffer at x0, whose rows are "stride" halfwords apart.
+//
+// w7 holds the edge flags. If the left edge is available, s1/s2 are moved
+// back by 4 bytes (2 pixels) so the left columns are read from the source
+// rows themselves; likewise the 2 pixels right of the row are read from the
+// source if the right edge is available. For each missing side, s31 (two
+// halfwords) is stored in place of those 2 pixels.
+//
+// reg is the register size prefix (q or d) used for the w pixel wide part
+// of each row. If ret is nonzero, the macro returns from the enclosing
+// function after the stores; otherwise x0 is advanced past the second row
+// and execution continues at the end of the macro (local label 3).
+// Clobbers v0-v3 and modifies s1/s2 (when the left edge exists) and x0.
+// NOTE(review): v31 must be set up by the user of the macro; within this
+// file cdef_padding*_16bpc_neon fills it with 0x8000 in each halfword.
+.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ sub \s1, \s1, #4
+ sub \s2, \s2, #4
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ // Each row is loaded as w pixels followed by 4 more (2 left + w + 2 right in total).
+ ldr \reg\()0, [\s1]
+ ldr d1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr d3, [\s2, #2*\w]
+ str \reg\()0, [x0]
+ str d1, [x0, #2*\w]
+ add x0, x0, #2*\stride
+ str \reg\()2, [x0]
+ str d3, [x0, #2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ // Only 2 extra pixels (the tail of the row) are loaded; the last 2 are filled from s31.
+ ldr \reg\()0, [\s1]
+ ldr s1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr s3, [\s2, #2*\w]
+ str \reg\()0, [x0]
+ str s1, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+ add x0, x0, #2*\stride
+ str \reg\()2, [x0]
+ str s3, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ // The first 2 pixels of each output row are filled from s31; the source data goes at byte offset 4.
+ ldr \reg\()0, [\s1]
+ ldr s1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr s3, [\s2, #2*\w]
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \reg\()2, [x0, #4]
+ str s3, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ // Only the w pixels themselves are read; both sides are filled from s31.
+ ldr \reg\()0, [\s1]
+ ldr \reg\()1, [\s2]
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \reg\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+.endif
+3:
+.endm
+
+// Load one row of w (4 or 8) halfwords from [src] into dst, then
+// post-increment src by incr.
+.macro load_n_incr_16 dst, src, incr, w
+.if \w == 4
+ ld1 {\dst\().4h}, [\src], \incr
+.else
+ ld1 {\dst\().8h}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// Registers as used below: x0 = tmp, x1 = src, x2 = src_stride, x3 = left,
+// x4 = top, x5 = bottom, w6 = h, w7 = edges.
+//
+// Fills the padded tmp buffer (rows of "stride" halfwords): 2 rows above,
+// h rows of the block and 2 rows below, each row with 2 pixels on either
+// side of the w block pixels. Any part outside the available edges is
+// filled with 0x8000 (held in v30/v31).
+// reg is the register size prefix (q or d) that holds one row of w pixels.
+.macro padding_func_16 w, stride, reg
+function cdef_padding\w\()_16bpc_neon, export=1
+ movi v30.8h, #0x80, lsl #8
+ mov v31.16b, v30.16b
+ // Move tmp back by 2 rows and 2 pixels, to the top left of the padded area.
+ sub x0, x0, #2*(2*\stride+2)
+ tst w7, #4 // CDEF_HAVE_TOP
+ b.ne 1f
+ // !CDEF_HAVE_TOP
+ // Fill both top rows (2*stride halfwords) with the fill value.
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add x9, x4, x2
+ pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
+
+ // Middle section
+ // One iteration per row, counted down in w6. The 2 left pixels of
+ // each row come from x3 (4 bytes per row) when available, and the
+ // 2 right pixels are read directly after the row in src.
+3:
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ld1 {v0.s}[0], [x3], #4
+ ldr s2, [x1, #2*\w]
+ load_n_incr_16 v1, x1, x2, \w
+ subs w6, w6, #1
+ str s0, [x0]
+ stur \reg\()1, [x0, #4]
+ str s2, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ld1 {v0.s}[0], [x3], #4
+ load_n_incr_16 v1, x1, x2, \w
+ subs w6, w6, #1
+ str s0, [x0]
+ stur \reg\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+ b 3f
+2:
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldr s1, [x1, #2*\w]
+ load_n_incr_16 v0, x1, x2, \w
+ subs w6, w6, #1
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr_16 v0, x1, x2, \w
+ subs w6, w6, #1
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+
+3:
+ tst w7, #8 // CDEF_HAVE_BOTTOM
+ b.ne 1f
+ // !CDEF_HAVE_BOTTOM
+ // Fill both bottom rows with the fill value and return.
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ ret
+1:
+ // CDEF_HAVE_BOTTOM
+ // pad_top_bot_16 is expanded with ret=1 here, so every path through it returns.
+ add x9, x5, x2
+ pad_top_bot_16 x5, x9, \w, \stride, \reg, 1
+endfunc
+.endm
+
+// Padding functions: 8 pixel wide rows fit a q register (tmp stride 16),
+// 4 pixel wide rows fit a d register (tmp stride 8).
+padding_func_16 8, 16, q
+padding_func_16 4, 8, d
+
+// Emit the constant tables declared by the tables macro.
+tables
+
+// 16 bpc filter functions for widths 8 and 4.
+filter 8, 16
+filter 4, 16
+
+// 16 bpc direction search.
+find_dir 16
diff --git a/third_party/dav1d/src/arm/64/cdef_tmpl.S b/third_party/dav1d/src/arm/64/cdef_tmpl.S
new file mode 100644
index 0000000000..d35d7a09ba
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef_tmpl.S
@@ -0,0 +1,511 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// Emit the table "directions<w>": for each direction, two signed byte
+// offsets written as y * stride + x, i.e. positions relative to the
+// current pixel in a buffer with the given row stride (in elements).
+// filter_func indexes this table with 2 bytes per direction and also reads
+// the entries 2 and 6 directions further on, which is why the first six
+// entries are repeated after the initial eight.
+.macro dir_table w, stride
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+// Emit the constant data used by the filter functions: the direction
+// tables for width 8 (stride 16) and width 4 (stride 8), and pri_taps.
+// pri_taps holds two pairs of taps; filter_func picks the pair at byte
+// offset 2 * (pri_strength & 1) (after shifting pri_strength down by
+// bitdepth_min_8 for 16 bpc).
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+ .byte 4, 2, 3, 3
+endconst
+.endm
+
+// Load the two taps at +off and -off around the current position.
+// x2 points at the current pixels in the 16 bit tmp buffer and w9 holds a
+// signed byte offset in elements (sign extended and scaled by 2 bytes).
+// d1 receives the 8 pixels at x + off (p0), d2 those at x - off (p1).
+// For w == 4, two rows of 4 pixels are loaded into each register, the
+// second row 2*8 bytes after the first.
+// Clobbers x6 and x9.
+.macro load_px d1, d2, w
+.if \w == 8
+ add x6, x2, w9, sxtb #1 // x + off
+ sub x9, x2, w9, sxtb #1 // x - off
+ ld1 {\d1\().8h}, [x6] // p0
+ ld1 {\d2\().8h}, [x9] // p1
+.else
+ add x6, x2, w9, sxtb #1 // x + off
+ sub x9, x2, w9, sxtb #1 // x - off
+ ld1 {\d1\().4h}, [x6] // p0
+ add x6, x6, #2*8 // += stride
+ ld1 {\d2\().4h}, [x9] // p1
+ add x9, x9, #2*8 // += stride
+ ld1 {\d1\().d}[1], [x6] // p0
+ ld1 {\d2\().d}[1], [x9] // p1
+.endif
+.endm
+// Accumulate the contribution of one pair of taps (s1 = p0, s2 = p1).
+// Inputs: v0 = px, v1 = running sum, thresh_vec = strength in each lane,
+// shift = negated shift amount in each lane (ushl by a negative amount
+// shifts right), tap = general register holding taps[k].
+// If min is set, v2/v3 are also updated as the running min/max.
+// Clobbers v16-v22.
+// NOTE(review): the min is unsigned and the max signed; presumably this is
+// so that the 0x8000 fill value written by the padding functions is ignored
+// by both - confirm.
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+ umin v2.8h, v2.8h, \s1\().8h
+ smax v3.8h, v3.8h, \s1\().8h
+ umin v2.8h, v2.8h, \s2\().8h
+ smax v3.8h, v3.8h, \s2\().8h
+.endif
+ uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
+ uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
+ ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
+ ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
+ uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
+ sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
+ neg v16.8h, v17.8h // -clip
+ neg v20.8h, v21.8h // -clip
+ smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
+ smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
+ dup v19.8h, \tap // taps[k]
+ smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
+ smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
+ mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
+ mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+//
+// Registers as used below: x0 = dst, x1 = dst_stride, x2 = tmp,
+// w3 = pri_strength, w4 = sec_strength, w5 = dir, w6 = damping, w7 = h.
+// edges is read from [sp] (8 bpc only) and, for 16 bpc, bitdepth_max is
+// read from [sp, #8].
+//
+// Macro parameters: w = block width (8 or 4), bpc = 8 or 16, pri/sec select
+// which passes are compiled in, min enables clamping of the result to the
+// min/max of the pixels involved, suffix is appended to the function name.
+// Each iteration of the outer loop (label 1) filters 8 pixels: one row for
+// w == 8, two rows for w == 4.
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ // With all four edge flags set, hand over to the _edged_8bpc variant.
+ ldr w8, [sp] // edges
+ cmp w8, #0xf
+ b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
+.endif
+.if \pri
+.if \bpc == 16
+ ldr w9, [sp, #8] // bitdepth_max
+ clz w9, w9
+ sub w9, w9, #24 // -bitdepth_min_8
+ neg w9, w9 // bitdepth_min_8
+.endif
+ movrel x8, pri_taps
+.if \bpc == 16
+ lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
+ and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and w9, w3, #1
+.endif
+ // x8 = pointer to the selected pair of primary taps (2 bytes per pair)
+ add x8, x8, w9, uxtw #1
+.endif
+ // x5 = pointer to the table entry for dir (2 bytes per direction)
+ movrel x9, directions\w
+ add x5, x9, w5, uxtw #1
+ movi v30.4h, #15
+ dup v28.4h, w6 // damping
+
+.if \pri
+ dup v25.8h, w3 // threshold
+.endif
+.if \sec
+ dup v27.8h, w4 // threshold
+.endif
+ // Interleave the two thresholds so both shifts are computed at once:
+ // lane 0 = primary, lane 1 = secondary. The lane of a pass that is not
+ // compiled in is computed from an unset register but never used.
+ trn1 v24.4h, v25.4h, v27.4h
+ clz v24.4h, v24.4h // clz(threshold)
+ sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
+ uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
+ neg v24.4h, v24.4h // -shift
+.if \sec
+ dup v26.8h, v24.h[1]
+.endif
+.if \pri
+ dup v24.8h, v24.h[0]
+.endif
+
+1:
+.if \w == 8
+ ld1 {v0.8h}, [x2] // px
+.else
+ add x12, x2, #2*8
+ ld1 {v0.4h}, [x2] // px
+ ld1 {v0.d}[1], [x12] // px
+.endif
+
+ movi v1.8h, #0 // sum
+.if \min
+ mov v2.16b, v0.16b // min
+ mov v3.16b, v0.16b // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov w11, #2 // sec_taps[0]
+
+ // Inner loop, executed twice (k = 0, 1); x5 and x8 advance by one
+ // byte per round.
+2:
+.if \pri
+ ldrb w9, [x5] // off1
+
+ load_px v4, v5, \w
+.endif
+
+.if \sec
+ add x5, x5, #4 // +2*2
+ ldrb w9, [x5] // off2
+ load_px v6, v7, \w
+.endif
+
+.if \pri
+ ldrb w10, [x8] // *pri_taps
+
+ handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
+.endif
+
+.if \sec
+ add x5, x5, #8 // +2*4
+ ldrb w9, [x5] // off3
+ load_px v4, v5, \w
+
+ handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
+
+ handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
+
+ sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
+ subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
+ add x8, x8, #1 // pri_taps++ (pointer)
+.endif
+ b.ne 2b
+
+ cmlt v4.8h, v1.8h, #0 // -(sum < 0)
+ add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
+ srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
+ add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
+.if \min
+ smin v0.8h, v0.8h, v3.8h
+ smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ // Narrow the 16 bit results back to bytes before storing.
+ xtn v0.8b, v0.8h
+.endif
+.if \w == 8
+ add x2, x2, #2*16 // tmp += tmp_stride
+ subs w7, w7, #1 // h--
+.if \bpc == 8
+ st1 {v0.8b}, [x0], x1
+.else
+ st1 {v0.8h}, [x0], x1
+.endif
+.else
+.if \bpc == 8
+ st1 {v0.s}[0], [x0], x1
+.else
+ st1 {v0.d}[0], [x0], x1
+.endif
+ add x2, x2, #2*16 // tmp += 2*tmp_stride
+ subs w7, w7, #2 // h -= 2
+.if \bpc == 8
+ st1 {v0.s}[1], [x0], x1
+.else
+ st1 {v0.d}[1], [x0], x1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub x5, x5, #2
+.if \pri
+ sub x8, x8, #2
+.endif
+
+ // The flags tested here are still those from the subs on w7 above;
+ // the st1/sub instructions in between do not set flags.
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+// Emit the three specialised filter functions for width w and bitdepth
+// bpc, plus the exported entry point, which tail-calls one of them based
+// on which strengths are nonzero: pri_strength (w3) == 0 selects the
+// secondary-only variant, otherwise sec_strength (w4) == 0 selects the
+// primary-only variant, otherwise the combined variant is used.
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ cbnz w3, 1f // pri_strength
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cbnz w4, 1f // sec_strength
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+// Weights for the diagonal costs in find_dir: entry n is 840 / (n + 1).
+const div_table
+ .short 840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+// Weights for the sum_alt costs in find_dir. Loaded as three vectors of
+// four halfwords (v29, v30, v31), so the table has 12 entries; cost_alt
+// applies them to the low half of its first register, the high half of
+// its first register and the low half of its second register.
+const alt_fact
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+// Compute two weighted sums of squares.
+// d1 = sum over the 12 elements of {s1 (8 halfwords), low half of s2} of
+// elem^2 * weight, and likewise d2 from {s3, low half of s4}.
+// The weights are expected in v29, v30, v31 as 32 bit lanes (find_dir
+// loads them from alt_fact and widens them). d1/d2 are scalar s registers.
+// Clobbers v22-v27.
+.macro cost_alt d1, d2, s1, s2, s3, s4
+ smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
+ smull2 v23.4s, \s1\().8h, \s1\().8h
+ smull v24.4s, \s2\().4h, \s2\().4h
+ smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
+ smull2 v26.4s, \s3\().8h, \s3\().8h
+ smull v27.4s, \s4\().4h, \s4\().4h
+ mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
+ mla v22.4s, v23.4s, v30.4s
+ mla v22.4s, v24.4s, v31.4s
+ mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
+ mla v25.4s, v26.4s, v30.4s
+ mla v25.4s, v27.4s, v31.4s
+ addv \d1, v22.4s // *cost_ptr
+ addv \d2, v25.4s // *cost_ptr
+.endm
+
+// One or two steps of the search for the largest cost.
+// State: w0 = best_dir, w1 = best_cost, w3 = n, w4 = cost[n] (must already
+// have been moved to w4 before the macro is expanded).
+// If s2 and s3 are given, cost[n+1] is taken from lane 0 of s2 and also
+// compared, n is advanced by 2, and w4 is preloaded from lane 0 of s3 for
+// the next expansion. With only s1 given, a single comparison is done and
+// n is left unchanged.
+// s1 is not referenced in the body; at the call sites in find_dir it names
+// the register whose lane 0 was moved into w4 beforehand.
+// The comparison is signed (gt), so a later cost only wins if it is
+// strictly larger.
+.macro find_best s1, s2, s3
+.ifnb \s2
+ mov w5, \s2\().s[0]
+.endif
+ cmp w4, w1 // cost[n] > best_cost
+ csel w0, w3, w0, gt // best_dir = n
+ csel w1, w4, w1, gt // best_cost = cost[n]
+.ifnb \s2
+ add w3, w3, #1 // n++
+ cmp w5, w1 // cost[n] > best_cost
+ mov w4, \s3\().s[0]
+ csel w0, w3, w0, gt // best_dir = n
+ csel w1, w5, w1, gt // best_cost = cost[n]
+ add w3, w3, #1 // n++
+.endif
+.endm
+
+// Steps for loading and preparing each row
+// They are split into three macros so that find_dir can interleave them
+// with the arithmetic on the previous row.
+// Step 1: load 8 pixels from [x0] into s1 and advance x0 by the stride in x1.
+.macro dir_load_step1 s1, bpc
+.if \bpc == 8
+ ld1 {\s1\().8b}, [x0], x1
+.else
+ ld1 {\s1\().8h}, [x0], x1
+.endif
+.endm
+
+// Step 2. For 8 bpc: widen the pixels to 16 bit while subtracting v31
+// (find_dir sets every byte of v31 to 128). For 16 bpc: shift each pixel by
+// the per-lane amount in v8 (find_dir sets it to -bitdepth_min_8, i.e. a
+// right shift).
+.macro dir_load_step2 s1, bpc
+.if \bpc == 8
+ usubl \s1\().8h, \s1\().8b, v31.8b
+.else
+ ushl \s1\().8h, \s1\().8h, v8.8h
+.endif
+.endm
+
+// Step 3, 16 bpc only: subtract v31 (find_dir sets every halfword to 128)
+// from the shifted pixels; for 8 bpc this was already done in step 2.
+.macro dir_load_step3 s1, bpc
+// Nothing for \bpc == 8
+.if \bpc != 8
+ sub \s1\().8h, \s1\().8h, v31.8h
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
+//
+// Registers as used below: x0 = img, x1 = stride, x2 = var; for 16 bpc,
+// w3 carries bitdepth_max. Returns best_dir in w0 and stores
+// (best_cost - cost[best_dir ^ 4]) >> 10 to *var.
+//
+// The 8 rows of the 8x8 block are loaded one at a time and accumulated
+// into partial sums (sum_diag, sum_hv, sum_alt) by shifting each row into
+// position with ext. The eight costs are then formed as weighted sums of
+// squares, spilled to a 32 byte array on the stack, and the largest one is
+// selected with find_best.
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+ // v8 is used for the shift amount below, so d8 is saved and restored.
+ str d8, [sp, #-0x10]!
+ clz w3, w3 // clz(bitdepth_max)
+ sub w3, w3, #24 // -bitdepth_min_8
+ dup v8.8h, w3
+.endif
+ sub sp, sp, #32 // cost
+ // NOTE(review): w3 does not appear to be read again before it is set
+ // to 1 ahead of find_best below - confirm whether this mov is needed.
+ mov w3, #8
+.if \bpc == 8
+ movi v31.16b, #128
+.else
+ movi v31.8h, #128
+.endif
+ movi v30.16b, #0
+ movi v1.8h, #0 // v0-v1 sum_diag[0]
+ movi v3.8h, #0 // v2-v3 sum_diag[1]
+ movi v5.8h, #0 // v4-v5 sum_hv[0-1]
+ movi v7.8h, #0 // v6-v7 sum_alt[0]
+ dir_load_step1 v26, \bpc // Setup first row early
+ movi v17.8h, #0 // v16-v17 sum_alt[1]
+ movi v18.8h, #0 // v18-v19 sum_alt[2]
+ dir_load_step2 v26, \bpc
+ movi v19.8h, #0
+ dir_load_step3 v26, \bpc
+ movi v21.8h, #0 // v20-v21 sum_alt[3]
+
+// One expansion per row; i is the row index. v26 holds the current row,
+// v27 the row reversed, v28 the pairwise sums, v29 the pairwise sums
+// reversed. v30 is all zeros and supplies the bytes shifted in by ext.
+.irpc i, 01234567
+ addv h25, v26.8h // [y]
+ rev64 v27.8h, v26.8h
+ addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
+ add v5.8h, v5.8h, v26.8h // sum_hv[1]
+ ext v27.16b, v27.16b, v27.16b, #8 // [-x]
+ rev64 v29.4h, v28.4h // [-(x >> 1)]
+ ins v4.h[\i], v25.h[0] // sum_hv[0]
+.if \i < 6
+ ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
+ ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
+ add v18.8h, v18.8h, v22.8h // sum_alt[2]
+ add v19.4h, v19.4h, v23.4h // sum_alt[2]
+.else
+ add v18.8h, v18.8h, v26.8h // sum_alt[2]
+.endif
+.if \i == 0
+ mov v20.16b, v26.16b // sum_alt[3]
+.elseif \i == 1
+ add v20.8h, v20.8h, v26.8h // sum_alt[3]
+.else
+ ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
+ ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
+ add v20.8h, v20.8h, v24.8h // sum_alt[3]
+ add v21.4h, v21.4h, v25.4h // sum_alt[3]
+.endif
+.if \i == 0
+ mov v0.16b, v26.16b // sum_diag[0]
+ dir_load_step1 v26, \bpc
+ mov v2.16b, v27.16b // sum_diag[1]
+ dir_load_step2 v26, \bpc
+ mov v6.16b, v28.16b // sum_alt[0]
+ dir_load_step3 v26, \bpc
+ mov v16.16b, v29.16b // sum_alt[1]
+.else
+ ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
+ ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
+ ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
+ ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
+.if \i != 7 // Nothing to load for the final row
+ dir_load_step1 v26, \bpc // Start setting up the next row early.
+.endif
+ add v0.8h, v0.8h, v22.8h // sum_diag[0]
+ add v1.8h, v1.8h, v23.8h // sum_diag[0]
+ add v2.8h, v2.8h, v24.8h // sum_diag[1]
+ add v3.8h, v3.8h, v25.8h // sum_diag[1]
+.if \i != 7
+ dir_load_step2 v26, \bpc
+.endif
+ ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
+ ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
+ ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
+ ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
+.if \i != 7
+ dir_load_step3 v26, \bpc
+.endif
+ add v6.8h, v6.8h, v22.8h // sum_alt[0]
+ add v7.4h, v7.4h, v23.4h // sum_alt[0]
+ add v16.8h, v16.8h, v24.8h // sum_alt[1]
+ add v17.4h, v17.4h, v25.4h // sum_alt[1]
+.endif
+.endr
+
+ movi v31.4s, #105
+
+ smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
+ smlal2 v26.4s, v4.8h, v4.8h
+ smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
+ smlal2 v27.4s, v5.8h, v5.8h
+ mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
+ mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
+ addv s4, v26.4s // cost[2]
+ addv s5, v27.4s // cost[6]
+
+ rev64 v1.8h, v1.8h
+ rev64 v3.8h, v3.8h
+ ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
+ ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
+
+ str s4, [sp, #2*4] // cost[2]
+ str s5, [sp, #6*4] // cost[6]
+
+ movrel x4, div_table
+ ld1 {v31.8h}, [x4]
+
+ smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
+ smull2 v23.4s, v0.8h, v0.8h
+ smlal v22.4s, v1.4h, v1.4h
+ smlal2 v23.4s, v1.8h, v1.8h
+ smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
+ smull2 v25.4s, v2.8h, v2.8h
+ smlal v24.4s, v3.4h, v3.4h
+ smlal2 v25.4s, v3.8h, v3.8h
+ uxtl v30.4s, v31.4h // div_table
+ uxtl2 v31.4s, v31.8h
+ mul v22.4s, v22.4s, v30.4s // cost[0]
+ mla v22.4s, v23.4s, v31.4s // cost[0]
+ mul v24.4s, v24.4s, v30.4s // cost[4]
+ mla v24.4s, v25.4s, v31.4s // cost[4]
+ addv s0, v22.4s // cost[0]
+ addv s2, v24.4s // cost[4]
+
+ movrel x5, alt_fact
+ ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
+
+ str s0, [sp, #0*4] // cost[0]
+ str s2, [sp, #4*4] // cost[4]
+
+ uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
+ uxtl v30.4s, v30.4h
+ uxtl v31.4s, v31.4h
+
+ cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
+ cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
+ str s6, [sp, #1*4] // cost[1]
+ str s16, [sp, #3*4] // cost[3]
+
+ mov w0, #0 // best_dir
+ mov w1, v0.s[0] // best_cost
+ mov w3, #1 // n
+
+ str s18, [sp, #5*4] // cost[5]
+ str s20, [sp, #7*4] // cost[7]
+
+ mov w4, v6.s[0]
+
+ // Costs live in lane 0 of: v0 = cost[0], v6 = cost[1], v4 = cost[2],
+ // v16 = cost[3], v2 = cost[4], v18 = cost[5], v5 = cost[6],
+ // v20 = cost[7]. Each find_best below compares two of them in order.
+ find_best v6, v4, v16
+ find_best v16, v2, v18
+ find_best v18, v5, v20
+ find_best v20
+
+ eor w3, w0, #4 // best_dir ^4
+ ldr w4, [sp, w3, uxtw #2]
+ sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
+ lsr w1, w1, #10
+ str w1, [x2] // *var
+
+ add sp, sp, #32
+.if \bpc == 16
+ ldr d8, [sp], 0x10
+.endif
+ ret
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S
new file mode 100644
index 0000000000..aa7f18bf39
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain.S
@@ -0,0 +1,2010 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+// Grain buffer dimensions. GRAIN_WIDTH is used below as the row stride,
+// in bytes, of the grain buffer.
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+// NOTE(review): presumably the dimensions of the subsampled chroma grain;
+// they are not referenced in this part of the file - confirm.
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+// Advance the random state in w2 by "steps" bits at once.
+// The new bits are the low "steps" bits of
+// r ^ (r >> 3) ^ (r >> 12) ^ (r >> 1).
+// With shift=1 the state is shifted right by "steps" and the new bits are
+// inserted so that they end at bit 15. With shift=0 the state is not
+// shifted; the new bits are placed at bit 16 and up instead, leaving the
+// shifting to later code (see read_shift_rand).
+// Clobbers w11-w13.
+.macro increment_seed steps, shift=1
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state
+.endif
+.endm
+
+// Extract a "bits" wide field from the state in x2, starting at bit
+// 16 - bits - age. With a larger age the field sits lower in the state,
+// which is how several values are read after one multi-step
+// increment_seed.
+.macro read_rand dest, bits, age
+ ubfx \dest, x2, #16 - \bits - \age, #\bits
+.endm
+
+// Extract a "bits" wide field starting at bit 17 - bits of the state in
+// x2, then shift the state right by one. Used together with
+// increment_seed ..., shift=0, where the state has not been shifted yet.
+.macro read_shift_rand dest, bits
+ ubfx \dest, x2, #17 - \bits, #\bits
+ lsr w2, w2, #1
+.endm
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+//
+// Produces 8 table entries: the seed is advanced by 4 steps twice, and
+// after each advance four 11 bit indices are read from it and used to load
+// halfwords from the table at x3 into consecutive lanes of v0.
+function get_gaussian_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+// Generate one row of 82 grain values as bytes: 16 each in r0-r4 and the
+// final 2 in the low bytes of r5.
+// Each get_gaussian_neon call yields 8 halfwords in v0, which are shifted
+// with rounding by the per-lane amounts in v31 and narrowed to bytes.
+// r5 also serves as scratch while r0-r4 are being filled.
+// NOTE(review): v31 is not set here; it is expected to be prepared by the
+// code expanding this macro. The bl instructions overwrite x30.
+.macro get_grain_row r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r0\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r2\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r3\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r3\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r4\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r4\().16b, \r5\().8h
+ // The last two values of the row, generated inline.
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {\r5\().h}[0], [x14]
+ ld1 {\r5\().h}[1], [x15]
+ srshl v0.4h, \r5\().4h, v31.4h
+ xtn \r5\().8b, v0.8h
+.endm
+
+// Store one 82 byte grain row (as produced by get_grain_row) to [x0],
+// advancing x0 by 32 + 32 + 16 + 2 = 82 bytes.
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2
+.endm
+
+// Generate one row of 44 grain values as bytes: 16 each in r0 and r1, and
+// the remaining 12 in the low bytes of r2 (8 from a get_gaussian_neon call
+// plus 4 generated inline and narrowed into the upper half with xtn2).
+// Values are shifted with rounding by the per-lane amounts in v31 before
+// narrowing. r2 also serves as scratch while r0/r1 are being filled.
+// NOTE(review): v31 is expected to be prepared by the code expanding this
+// macro. The bl instructions overwrite x30.
+.macro get_grain_row_44 r0, r1, r2
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r0\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r2\().8h
+
+ // The last four values of the row, generated inline.
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ xtn2 \r2\().16b, v0.8h
+.endm
+
+// Store one row produced by get_grain_row_44 to [x0]. 48 bytes are
+// written (all of r0, r1 and r2), and x0 ends up advanced by GRAIN_WIDTH
+// in total, so rows keep the full-width stride.
+.macro store_grain_row_44 r0, r1, r2
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b}, [x0]
+ add x0, x0, #GRAIN_WIDTH-32
+.endm
+
+// Generate two grain values: advance the seed in w2 by 2 steps, load two
+// table entries from x3, shift them with rounding by v31 and narrow to
+// bytes. The results are in the two lowest bytes of v0.
+// Clobbers x11-x15.
+function get_grain_2_neon
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ xtn v0.8b, v0.8h
+ ret
+endfunc
+
+// Call get_grain_2_neon and copy the result to dst, unless dst is
+// already v0.
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+//
+// Further registers read by the code below:
+// w2/x3: random state and table pointer (see read_shift_rand)
+// w4 (and w17 for lag 2, w20/w21 for lag 3): multipliers applied to the
+// previous outputs; note that for lag 2, w17 is a multiplier rather than
+// a previous output
+// w8, w7: rounding constant and shift for the filtered sum
+// w10, w9: rounding constant and shift for the new grain value
+// w5, w6: upper and lower clamp for the result
+// Per entry, v0 is rotated by one byte and the new value is inserted as
+// byte 15, and v1 is rotated by 4 bytes so the next sum moves into lane 0.
+// Clobbers w11-w13.
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11
+ mov w11, v1.s[0]
+ ldrsh w12, [x3, x13, lsl #1]
+ ext v0.16b, v0.16b, v0.16b, #1
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 + grain_scale_shift)
+ add w14, w14, w12
+ // Clamp to [w6, w5].
+ cmp w14, w5
+ csel w14, w14, w5, le
+ cmp w14, w6
+ csel w14, w14, w6, ge
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4
+ ins v0.b[15], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+// Emit output_lag1_neon, output_lag2_neon and output_lag3_neon.
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+// Weighted sum of three 16 byte inputs for the lag 1 filter:
+// v3 * v28 + v0 * v27 + v1 * v29, per byte lane (signed), widened to
+// 32 bit. The 16 results are returned in v4-v7 (four lanes each).
+// As set up by the sum_lag1 macro, v3 is the row above, v0 the row above
+// shifted to the top-left neighbours and v1 shifted to the top-right
+// neighbours; v27-v29 are expected to hold the matching coefficients in
+// every byte.
+// Clobbers v0-v3.
+function sum_lag1_above_neon
+ smull v2.8h, v3.8b, v28.8b
+ smull2 v3.8h, v3.16b, v28.16b
+ smull v4.8h, v0.8b, v27.8b
+ smull2 v5.8h, v0.16b, v27.16b
+ smull v6.8h, v1.8b, v29.8b
+ smull2 v7.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v4.4h
+ saddl2 v1.4s, v2.8h, v4.8h
+ saddl v2.4s, v3.4h, v5.4h
+ saddl2 v3.4s, v3.8h, v5.8h
+ saddw v4.4s, v0.4s, v6.4h
+ saddw2 v5.4s, v1.4s, v6.8h
+ saddw v6.4s, v2.4s, v7.4h
+ saddw2 v7.4s, v3.4s, v7.8h
+ ret
+endfunc
+
+// Body shared by the sum_<type>_<lag>_<edge>_neon functions: computes one
+// vector of up to 16 filtered grain values in v0.
+//
+// Parameters:
+// lag: lag1/lag2/lag3, selects sum_<lag>_above_neon and output_<lag>_neon
+// type: y, uv_420, uv_422 or uv_444
+// uv_layout: 0 for y, otherwise 420/422/444
+// edge: left, mid or right; at the left/right edges some of the entries
+// are plain grain values that do not go through output_<lag>_neon
+// elems: number of entries this vector covers (16, 15 or 9)
+// store: if nonzero, v0 is stored to [x0] (post-incremented by 16)
+// uv_coeff: optional byte lane with the coefficient applied to the
+// co-located data read from x19; otherwise v30 is used
+//
+// The macro ends by reloading x30 from the stack and returning, so the
+// function expanding it must have pushed x30 first.
+//
+// For the chroma types, 16 values are first derived from the data at x19
+// (averaged 2x2 for uv_420, 2x1 for uv_422, copied for uv_444), multiplied
+// by the coefficient and added to the sums in v4-v7.
+// NOTE(review): x19 presumably points into the luma grain - confirm with
+// the callers.
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+ bl sum_\lag\()_above_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ ld1 {v24.16b, v25.16b}, [x12]
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ saddlp v24.8h, v24.16b
+ saddlp v25.8h, v25.16b
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ rshrn v0.8b, v22.8h, #2
+ rshrn2 v0.16b, v23.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ rshrn v0.8b, v22.8h, #1
+ rshrn2 v0.16b, v23.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.16b}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.16b, \uv_coeff
+ smull v2.8h, v0.8b, v1.8b
+ smull2 v3.8h, v0.16b, v1.16b
+.else
+ smull v2.8h, v0.8b, v30.8b
+ smull2 v3.8h, v0.16b, v30.16b
+.endif
+ saddw v4.4s, v4.4s, v2.4h
+ saddw2 v5.4s, v5.4s, v2.8h
+ saddw v6.4s, v6.4s, v3.4h
+ saddw2 v7.4s, v7.4s, v3.8h
+.endif
+// The remaining code does not depend on the chroma type, so where an
+// identical tail has already been emitted for another function, branch to
+// its label instead of emitting it again.
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.ifc \edge, left
+ // Left edge: three plain grain values are generated into bytes 13-15
+ // of v0 and become the previous outputs; only the last entry of the
+ // first group of four is then produced by output_<lag>_neon.
+ increment_seed 4
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ xtn2 v0.16b, v0.8h
+ ext v4.16b, v4.16b, v4.16b, #12
+.ifc \lag, lag3
+ smov w17, v0.b[13]
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.b[14]
+.endif
+ smov w14, v0.b[15]
+
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v6.16b
+.if \elems == 9
+ // Only one more filtered entry (the 9th); the following three are
+ // plain grain values, shifted into v0 together with it.
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ xtn v1.8b, v1.8h
+ ext v0.16b, v0.16b, v1.16b, #7
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v7.16b
+
+.ifc \edge, right
+ // Right edge: three filtered entries, then one plain grain value
+ // whose low byte is shifted into v0 as the last entry.
+ mov w15, #3
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #1
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ st1 {v0.16b}, [x0], #16
+.endif
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+// Define sum_<type>_lag1_<edge>_neon: saves x30 on the stack (the body
+// makes nested calls) and expands sum_lag_n_body for lag1 without storing
+// the result; the body restores x30 and returns, leaving the result in v0.
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+// Lag 1 functions for every type and edge. The y variants come first
+// because sum_lag_n_body makes later variants branch into labels defined
+// by earlier ones (the y ones and, for uv_422 right, the uv_420 one,
+// which is resolved as a forward reference).
+// The right edge covers 15 entries for y/uv_444 and 9 for uv_422/uv_420.
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+// Set up the inputs for sum_<type>_lag1_<edge>_neon from three consecutive
+// 16 byte vectors of the row above (left, mid, right), call it, and copy
+// the result to dst.
+// v3 = mid, v0 = mid shifted right by one byte (last byte of left shifted
+// in), v1 = mid shifted left by one byte (first byte of right shifted in).
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ mov v3.16b, \mid\().16b
+ ext v0.16b, \left\().16b, \mid\().16b, #15
+ ext v1.16b, \mid\().16b, \right\().16b, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ mov \dst\().16b, v0.16b
+.endm
+
+// Convenience wrappers around sum_lag1 with the type fixed to y, uv_444,
+// uv_422 and uv_420 respectively.
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+// Accumulate the contribution of the two rows above x0 for the lag-2 filter,
+// for 16 consecutive positions.
+//
+// In: x0 = pointer into the current row; the rows above are addressed
+// relative to it using GRAIN_WIDTH as the row pitch.
+// v16/v17 = previous and current 16-byte chunk of the row two lines up
+// v19/v20 = previous and current 16-byte chunk of the row one line up
+// v30.b[0-4] = taps for the row two lines up (offsets -2..+2)
+// v30.b[5-9] = taps for the row one line up (offsets -2..+2)
+// Out: v4-v7 = sixteen 32-bit sums (v4 = lanes 0-3 ... v7 = lanes 12-15).
+// v16/v17 and v19/v20 are advanced by one chunk (v16 <- v17 <- newly
+// loaded chunk, likewise v19 <- v20), ready for the next call.
+// Clobbers x12, x13, v0-v3, v18, v21-v29.
+function sum_lag2_above_neon
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v18.16b}, [x12] // load top right
+ ld1 {v21.16b}, [x13]
+
+ // Row two lines up: neighbours at -2, -1, +1, +2 (centre tap handled below).
+ ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[0]
+ ext v23.16b, v16.16b, v17.16b, #15
+ dup v27.16b, v30.b[1]
+ ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[3]
+ ext v1.16b, v17.16b, v18.16b, #2
+ dup v29.16b, v30.b[4]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v4.8h, v23.8b, v27.8b
+ smull2 v5.8h, v23.16b, v27.16b
+ smull v6.8h, v0.8b, v28.8b
+ smull2 v7.8h, v0.16b, v28.16b
+ smull v0.8h, v1.8b, v29.8b
+ smull2 v1.8h, v1.16b, v29.16b
+ saddl v22.4s, v2.4h, v4.4h
+ saddl2 v23.4s, v2.8h, v4.8h
+ saddl v26.4s, v3.4h, v5.4h
+ saddl2 v27.4s, v3.8h, v5.8h
+ saddl v2.4s, v0.4h, v6.4h
+ saddl2 v3.4s, v0.8h, v6.8h
+ saddl v6.4s, v1.4h, v7.4h
+ saddl2 v7.4s, v1.8h, v7.8h
+ add v4.4s, v22.4s, v2.4s
+ add v5.4s, v23.4s, v3.4s
+ add v6.4s, v26.4s, v6.4s
+ add v7.4s, v27.4s, v7.4s
+
+ // Row one line up: neighbours at -2, -1, +1, +2.
+ ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[5]
+ ext v23.16b, v19.16b, v20.16b, #15
+ dup v27.16b, v30.b[6]
+ ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[8]
+ ext v1.16b, v20.16b, v21.16b, #2
+ dup v29.16b, v30.b[9]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v22.8h, v23.8b, v27.8b
+ smull2 v23.8h, v23.16b, v27.16b
+ smull v26.8h, v0.8b, v28.8b
+ smull2 v27.8h, v0.16b, v28.16b
+ smull v28.8h, v1.8b, v29.8b
+ smull2 v29.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v22.4h
+ saddl2 v1.4s, v2.8h, v22.8h
+ saddl v2.4s, v3.4h, v23.4h
+ saddl2 v3.4s, v3.8h, v23.8h
+ saddl v22.4s, v26.4h, v28.4h
+ saddl2 v23.4s, v26.8h, v28.8h
+ saddl v26.4s, v27.4h, v29.4h
+ saddl2 v27.4s, v27.8h, v29.8h
+ add v0.4s, v0.4s, v22.4s
+ add v1.4s, v1.4s, v23.4s
+ add v2.4s, v2.4s, v26.4s
+ add v3.4s, v3.4s, v27.4s
+ // Centre taps of both rows (directly above): v30.b[2] and v30.b[7].
+ dup v26.16b, v30.b[2]
+ dup v27.16b, v30.b[7]
+ smull v22.8h, v17.8b, v26.8b
+ smull2 v23.8h, v17.16b, v26.16b
+ smull v24.8h, v20.8b, v27.8b
+ smull2 v25.8h, v20.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+
+ // Slide the window of the row two lines up by one chunk.
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ saddl v0.4s, v22.4h, v24.4h
+ saddl2 v1.4s, v22.8h, v24.8h
+ saddl v2.4s, v23.4h, v25.4h
+ saddl2 v3.4s, v23.8h, v25.8h
+ // Slide the window of the row one line up by one chunk.
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ ret
+endfunc
+
+// Emit sum_<type>_lag2_<edge>_neon.
+// For the left edge, the 16-byte chunks directly above x0 (two rows up and
+// one row up) are preloaded into v17 and v20, the "current chunk" registers
+// that sum_lag2_above_neon reads. The rest of the body is expanded from
+// sum_lag_n_body with store=1 and uv_coeff=v30.b[12]; that macro is expected
+// to pop x30 and return, since no ret is emitted here.
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v17.16b}, [x12] // load the previous block right above
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
+endfunc
+.endm
+
+// Instantiate the lag-2 helpers for each plane type and horizontal position.
+// The right-edge variants pass a reduced element count: 15 for y/uv_444,
+// 9 for uv_422/uv_420.
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+// Accumulate the contribution of the three rows above x0 for the lag-3
+// filter, for 16 consecutive positions.
+//
+// In: x0 = pointer into the current row; the rows above are addressed
+// relative to it using GRAIN_WIDTH as the row pitch.
+// v13/v14 = previous and current 16-byte chunk of the row three lines up
+// v16/v17 = previous and current 16-byte chunk of the row two lines up
+// v19/v20 = previous and current 16-byte chunk of the row one line up
+// v29.b[0-6] = taps for the row three lines up (offsets -3..+3)
+// v29.b[7-13] = taps for the row two lines up
+// v29.b[14-15], v30.b[0-4] = taps for the row one line up
+// Out: v4-v7 = sixteen 32-bit sums (v4 = lanes 0-3 ... v7 = lanes 12-15).
+// v13/v14, v16/v17 and v19/v20 are advanced by one chunk.
+// Clobbers x11-x13, v0-v3, v8-v12, v15, v18, v21-v28.
+// Note: v8-v15 are written without being saved here; the lag3 entry points in
+// this file spill d8-d15 before calling into this code.
+function sum_lag3_above_neon
+ sub x11, x0, #3*GRAIN_WIDTH - 16
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v15.16b}, [x11] // load top right
+ ld1 {v18.16b}, [x12]
+ ld1 {v21.16b}, [x13]
+
+ // Row three lines up: taps v29.b[0-6] (v25 = centre tap).
+ ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[0]
+ ext v9.16b, v13.16b, v14.16b, #14
+ dup v23.16b, v29.b[1]
+ ext v10.16b, v13.16b, v14.16b, #15
+ dup v24.16b, v29.b[2]
+ dup v25.16b, v29.b[3]
+ ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[4]
+ ext v12.16b, v14.16b, v15.16b, #2
+ dup v27.16b, v29.b[5]
+ ext v13.16b, v14.16b, v15.16b, #3
+ dup v28.16b, v29.b[6]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v14.8b, v25.8b
+ smull2 v13.8h, v14.16b, v25.16b
+ add v4.4s, v22.4s, v0.4s
+ add v5.4s, v23.4s, v1.4s
+ add v6.4s, v24.4s, v2.4s
+ add v7.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ // Row two lines up: taps v29.b[7-13] (v25 = centre tap).
+ ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[7]
+ ext v9.16b, v16.16b, v17.16b, #14
+ dup v23.16b, v29.b[8]
+ ext v10.16b, v16.16b, v17.16b, #15
+ dup v24.16b, v29.b[9]
+ dup v25.16b, v29.b[10]
+ ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[11]
+ ext v12.16b, v17.16b, v18.16b, #2
+ dup v27.16b, v29.b[12]
+ ext v13.16b, v17.16b, v18.16b, #3
+ dup v28.16b, v29.b[13]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v17.8b, v25.8b
+ smull2 v13.8h, v17.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ // Row one line up: taps v29.b[14-15], v30.b[0-4] (v25 = centre tap).
+ ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[14]
+ ext v9.16b, v19.16b, v20.16b, #14
+ dup v23.16b, v29.b[15]
+ ext v10.16b, v19.16b, v20.16b, #15
+ dup v24.16b, v30.b[0]
+ dup v25.16b, v30.b[1]
+ ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v26.16b, v30.b[2]
+ ext v12.16b, v20.16b, v21.16b, #2
+ dup v27.16b, v30.b[3]
+ ext v13.16b, v20.16b, v21.16b, #3
+ dup v28.16b, v30.b[4]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v20.8b, v25.8b
+ // v19 is used as scratch for the upper centre products; it is
+ // overwritten with v20 at the end of the function.
+ smull2 v19.8h, v20.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ // Slide the window of the row three lines up by one chunk.
+ mov v13.16b, v14.16b
+ mov v14.16b, v15.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ // Slide the window of the row two lines up by one chunk.
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v19.4h
+ saddw2 v7.4s, v7.4s, v19.8h
+
+ // Slide the window of the row one line up by one chunk.
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+// Emit sum_<type>_lag3_<edge>_neon.
+// For the left edge, the 16-byte chunks directly above x0 (three, two and one
+// row up) are preloaded into v14, v17 and v20, the "current chunk" registers
+// that sum_lag3_above_neon reads. The rest of the body is expanded from
+// sum_lag_n_body with store=1 and uv_coeff=v30.b[8]; that macro is expected
+// to pop x30 and return, since no ret is emitted here.
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v14.16b}, [x11] // load the previous block right above
+ ld1 {v17.16b}, [x12]
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
+endfunc
+.endm
+
+// Instantiate the lag-3 helpers for each plane type and horizontal position.
+// The right-edge variants pass a reduced element count: 15 for y/uv_444,
+// 9 for uv_422/uv_420.
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+// Generate and store w1 full-width grain rows.
+// In: w1 = row count. The counter is tested after the first row has been
+// produced, so at least one row is always generated.
+// get_grain_row and store_grain_row are macros defined elsewhere in this
+// file; each iteration passes v16-v21 through them.
+function generate_grain_rows_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ get_grain_row v16, v17, v18, v19, v20, v21
+ subs w1, w1, #1
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+// Generate and store w1 narrow (subsampled-width) grain rows.
+// In: w1 = row count. The counter is tested after the first row has been
+// produced, so at least one row is always generated.
+// get_grain_row_44 and store_grain_row_44 are macros defined elsewhere in
+// this file; each iteration passes v16-v18 through them.
+function generate_grain_rows_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ get_grain_row_44 v16, v17, v18
+ subs w1, w1, #1
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+// Out-of-line wrapper around the get_grain_row macro (v16-v21) so that
+// callers can reach it with bl instead of expanding the macro inline.
+function get_grain_row_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ get_grain_row v16, v17, v18, v19, v20, v21
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+// Out-of-line wrapper around the get_grain_row_44 macro (v16-v18) so that
+// callers can reach it with bl instead of expanding the macro inline.
+function get_grain_row_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ get_grain_row_44 v16, v17, v18
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+// Lag-0 chroma step for 16 values:
+// v2 = sat_s8(v1 + round_shift(v0 * v27, v28))
+// In: v0 = 16 signed bytes to scale (luma grain, already masked by caller)
+// v1 = 16 signed bytes of chroma grain
+// v27 = coefficient replicated in every byte
+// v28 = per-halfword shift for srshl; callers load the negated
+// ar_coeff_shift so that this is a rounding right shift
+// Out: v2 = 16 signed bytes. Clobbers v3.
+// add_coeff_lag0_start is also the shared tail of the 420/422 variants.
+function add_uv_444_coeff_lag0_neon
+add_coeff_lag0_start:
+ smull v2.8h, v0.8b, v27.8b
+ smull2 v3.8h, v0.16b, v27.16b
+ srshl v2.8h, v2.8h, v28.8h
+ srshl v3.8h, v3.8h, v28.8h
+ saddw v2.8h, v2.8h, v1.8b
+ saddw2 v3.8h, v3.8h, v1.16b
+ sqxtn v2.8b, v2.8h
+ sqxtn2 v2.16b, v3.8h
+ ret
+endfunc
+
+// Lag-0 chroma step for 4:2:0: build 16 luma values by averaging 2x2 blocks
+// (32 bytes from each of two rows), then continue in the shared tail.
+// In: x19 = first luma row pointer, x12 = second luma row pointer (both are
+// post-incremented by 32); v0 = byte mask applied to the averaged luma;
+// v1, v27, v28 as for add_uv_444_coeff_lag0_neon.
+// Out: v2 (from the shared tail). Clobbers v0, v3-v7.
+function add_uv_420_coeff_lag0_neon
+ ld1 {v4.16b, v5.16b}, [x19], #32
+ ld1 {v6.16b, v7.16b}, [x12], #32
+ // Signed pairwise add: horizontal sum of adjacent bytes in each row.
+ saddlp v4.8h, v4.16b
+ saddlp v5.8h, v5.16b
+ saddlp v6.8h, v6.16b
+ saddlp v7.8h, v7.16b
+ // Add the two rows, then rounding-divide the 4-sample sum by 4.
+ add v4.8h, v4.8h, v6.8h
+ add v5.8h, v5.8h, v7.8h
+ rshrn v4.8b, v4.8h, #2
+ rshrn2 v4.16b, v5.8h, #2
+ and v0.16b, v4.16b, v0.16b
+ b add_coeff_lag0_start
+endfunc
+
+// Lag-0 chroma step for 4:2:2: build 16 luma values by averaging horizontal
+// pairs of 32 bytes from one row, then continue in the shared tail.
+// In: x19 = luma row pointer (post-incremented by 32); v0 = byte mask applied
+// to the averaged luma; v1, v27, v28 as for add_uv_444_coeff_lag0_neon.
+// Out: v2 (from the shared tail). Clobbers v0, v3-v5.
+function add_uv_422_coeff_lag0_neon
+ ld1 {v4.16b, v5.16b}, [x19], #32
+ // Signed pairwise add, then rounding-divide the 2-sample sum by 2.
+ saddlp v4.8h, v4.16b
+ saddlp v5.8h, v5.16b
+ rshrn v4.8b, v4.8h, #1
+ rshrn2 v4.16b, v5.8h, #1
+ and v0.16b, v4.16b, v0.16b
+ b add_coeff_lag0_start
+endfunc
+
+// Emit generate_grain_<type>_8bpc_neon for full-width planes (type = y or
+// uv_444). Each row is produced as five 16-byte chunks plus a 2-byte tail.
+//
+// Register arguments as used by the code below:
+// type y: x0 = output grain buffer, x1 = film grain data
+// type uv_444: x0 = output grain buffer, x1 = luma grain buffer,
+// x2 = film grain data, w3 = uv plane index
+//
+// Common state set up before dispatching on ar_coeff_lag:
+// x3 = gaussian_sequence, w2 = seed, x4 = AR coefficient pointer,
+// w7 = ar_coeff_shift, w8 = 1 << (ar_coeff_shift - 1),
+// w9 = 4 + grain_scale_shift, w10 = 1 << (w9 - 1),
+// v31 = -(4 + grain_scale_shift) in every halfword (for srshl),
+// w5 = 127, w6 = -128 (presumably the clamp bounds used by the helpers),
+// x19 = luma grain pointer, starting 3 rows in (uv_444 only).
+//
+// The 96-byte frame holds x30/x19 at [sp], d8-d15 at [sp, #16..#79] and
+// x20/x21 at [sp, #80]; only the lag3 path stores and reloads the latter two.
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+ // w13 = uv * 28: byte offset of this plane's coefficient set.
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH
+ mov x1, x2
+ mul w13, w13, w14
+.endif
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ // Look up the handler for ar_coeff_lag; table entries are stored as
+ // (table address - target address), hence the subtraction.
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+.ifc \type, uv_444
+ // Seed modifier: 0x49d8 when uv != 0, 0xb524 when uv == 0.
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127
+ mov w6, #-128
+
+.ifc \type, uv_444
+ eor w2, w2, w11
+.endif
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ // No filtering for luma with lag 0: just generate every row.
+ mov w1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ dup v28.8h, w7
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ // v29: mask with the first 3 bytes cleared (applied to the first chunk).
+ // v30: mask with the last byte cleared (applied to the last chunk).
+ ext v29.16b, v0.16b, v1.16b, #13
+ ext v30.16b, v1.16b, v0.16b, #1
+ neg v28.8h, v28.8h
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ // Per row, x19 advances by 64 + 16 + 2 bytes over the luma grain.
+ ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64
+ bl get_grain_row_neon
+ and v0.16b, v22.16b, v29.16b
+ mov v1.16b, v16.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v0.16b, v23.16b
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ ld1 {v26.16b}, [x19], #16
+ mov v0.16b, v24.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ add x19, x19, #2
+ mov v0.16b, v25.16b
+ mov v1.16b, v19.16b
+ mov v18.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ and v0.16b, v26.16b, v30.16b
+ mov v1.16b, v20.16b
+ mov v19.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v20.16b, v2.16b
+ subs w1, w1, #1
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ // v16-v21 hold the previous row; v22-v25 and v20 receive the new row.
+ sum_\type\()_lag1 v22, v16, v16, v17, left
+ sum_\type\()_lag1 v23, v16, v17, v18
+ sum_\type\()_lag1 v24, v17, v18, v19
+ sum_\type\()_lag1 v25, v18, v19, v20
+ sum_\type\()_lag1 v20, v19, v20, v21, right
+ get_grain_2 v21
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ store_grain_row v22, v23, v24, v25, v20, v21
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ mov v19.16b, v25.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ // Coefficients 10 and 11 are kept sign-extended in general registers.
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ // Five 16-byte chunks per row, then the 2-byte tail.
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ st1 {v16.h}[0], [x0], #2
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ // The lag-3 helpers use v8-v15 and x20/x21, so save them in the frame.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ // Coefficients 21-23 (v30.b[5-7]) are kept sign-extended in general
+ // registers.
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ // Five 16-byte chunks per row, then the 2-byte tail.
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ st1 {v16.h}[0], [x0], #2
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+// Jump table indexed by ar_coeff_lag; entries are table-relative offsets.
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+// Emit generate_grain_y_8bpc_neon and generate_grain_uv_444_8bpc_neon.
+gen_grain_82 y
+gen_grain_82 uv_444
+
+// Load into \dst the number of rows remaining after the first three:
+// SUB_GRAIN_HEIGHT-3 for uv_420, GRAIN_HEIGHT-3 for every other type.
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+// Move the luma pointer \reg from the end of the 3*32 bytes consumed in the
+// current row to the start of the next input row: two rows further down for
+// uv_420 (net +2*GRAIN_WIDTH-96), one row further down otherwise
+// (net +GRAIN_WIDTH-96).
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
+
+// Emit generate_grain_<type>_8bpc_neon for horizontally subsampled chroma
+// planes (type = uv_420 or uv_422). Each row is produced as three 16-byte
+// chunks.
+//
+// Register arguments as used by the code below:
+// x0 = output grain buffer, x1 = luma grain buffer,
+// x2 = film grain data, w3 = uv plane index
+//
+// Common state set up before dispatching on ar_coeff_lag:
+// x3 = gaussian_sequence, w2 = seed (already XORed with the per-plane
+// constant), x4 = pointer to this plane's AR coefficients,
+// w7 = ar_coeff_shift, w8 = 1 << (ar_coeff_shift - 1),
+// w9 = 4 + grain_scale_shift, w10 = 1 << (w9 - 1),
+// v31 = -(4 + grain_scale_shift) in every halfword (for srshl),
+// w5 = 127, w6 = -128 (presumably the clamp bounds used by the helpers),
+// x19 = luma grain pointer, starting at 3*GRAIN_WIDTH-3.
+//
+// The 96-byte frame holds x30/x19 at [sp], d8-d15 at [sp, #16..#79] and
+// x20/x21 at [sp, #80]; only the lag3 path stores and reloads the latter two.
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+ // w13 = uv * 28: byte offset of this plane's coefficient set.
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH-3
+ mov x1, x2
+ mul w13, w13, w14
+
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ // Look up the handler for ar_coeff_lag; table entries are stored as
+ // (table address - target address), hence the subtraction.
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+ // Seed modifier: 0x49d8 when uv != 0, 0xb524 when uv == 0.
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127
+ mov w6, #-128
+
+ eor w2, w2, w11
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w7
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ // v29: mask with the first 3 bytes cleared (applied to the first chunk).
+ // v30: mask keeping only the first 9 bytes (applied to the last chunk).
+ ext v29.16b, v0.16b, v1.16b, #13
+ ext v30.16b, v1.16b, v0.16b, #7
+ neg v28.8h, v28.8h
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ // Second luma row for the 2x2 average.
+ add x12, x19, #GRAIN_WIDTH
+.endif
+ mov v0.16b, v29.16b
+ mov v1.16b, v16.16b
+ bl add_\type\()_coeff_lag0_neon
+ movi v0.16b, #255
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v0.16b, v30.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v18.16b, v2.16b
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2
+
+ mov w1, #3
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ // v16-v18 hold the previous row; v20, v21 and v18 receive the new row.
+ sum_\type\()_lag1 v20, v16, v16, v17, left
+ sum_\type\()_lag1 v21, v16, v17, v18
+ sum_\type\()_lag1 v18, v17, v18, v18, right
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ store_grain_row_44 v20, v21, v18
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ // Coefficients 10 and 11 are kept sign-extended in general registers.
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ // The three helper calls advanced x0 by 48 bytes; step to the next row.
+ add x0, x0, #GRAIN_WIDTH-48
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ // The lag-3 helpers use v8-v15 and x20/x21, so save them in the frame.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ // Coefficients 21-23 (v30.b[5-7]) are kept sign-extended in general
+ // registers.
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ // The three helper calls advanced x0 by 48 bytes; step to the next row.
+ add x0, x0, #GRAIN_WIDTH-48
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+// Jump table indexed by ar_coeff_lag; entries are table-relative offsets.
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+// Emit generate_grain_uv_420_8bpc_neon and generate_grain_uv_422_8bpc_neon.
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+// Byte-wise table lookup for eight lanes, using x3 as the table base.
+// For lanes 0,2,4,6 (+\off) the index is read from \src1 and the looked-up
+// byte is written to the same lane of \dst1; for lanes 8,10,12,14 (+\off) the
+// index is read from \src2 and the result goes to the same lane of \dst2.
+// The two streams are interleaved to overlap the umov/load latencies.
+// Clobbers x14-x17.
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ umov w14, \src1[0+\off]
+ umov w15, \src2[8+\off]
+ umov w16, \src1[2+\off]
+ add x14, x14, x3
+ umov w17, \src2[10+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14]
+ umov w14, \src1[4+\off]
+ add x16, x16, x3
+ ld1 {\dst2}[8+\off], [x15]
+ umov w15, \src2[12+\off]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6+\off]
+ add x14, x14, x3
+ ld1 {\dst2}[10+\off], [x17]
+ umov w17, \src2[14+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[12+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[14+\off], [x17]
+.endm
+
+// Full 32-lane lookup: \dst1[i] = table[\src1[i]] and \dst2[i] = table[\src2[i]]
+// for all 16 byte lanes of each register. The four expansions cover the
+// even/odd lanes of the low/high halves of both register pairs.
+.macro gather dst1, dst2, src1, src2
+ gather_interleaved \dst1, \dst2, \src1, \src2, 0
+ gather_interleaved \dst2, \dst1, \src2, \src1, 0
+ gather_interleaved \dst1, \dst2, \src1, \src2, 1
+ gather_interleaved \dst2, \dst1, \src2, \src1, 1
+.endm
+
+// Look up 32 bytes: v4[i] = x3[v0[i]], v5[i] = x3[v1[i]].
+// Clobbers x14-x17.
+function gather32_neon
+ gather v4.b, v5.b, v0.b, v1.b
+ ret
+endfunc
+
+// Look up 16 bytes: v4[i] = x3[v0[i]].
+// The low 8 lanes are gathered directly into v4, the high 8 lanes into v5,
+// which is then merged into the upper half of v4. Clobbers v5 and x14-x17.
+function gather16_neon
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
+ ins v4.d[1], v5.d[1]
+ ret
+endfunc
+
+// Overlap blend weights for non-subsampled axes (two overlapping samples).
+// First row: weights for the old (neighbouring) grain; second row: weights
+// for the current grain. The blend is (old*w0 + cur*w1) rounded >> 5, so the
+// 0/32 padding leaves lanes outside the overlap unchanged.
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0
+ .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+// Overlap blend weights for subsampled axes (a single overlapping sample).
+// First row: weights for the old (neighbouring) grain; second row: weights
+// for the current grain. The blend is (old*w0 + cur*w1) rounded >> 5, so the
+// 0/32 padding leaves lanes outside the overlap unchanged.
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0
+ .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
+
+// Split the random offset value \src into a y offset (low 4 bits) and an
+// x offset (remaining upper bits). Each offset is doubled when the
+// corresponding axis is not subsampled (\sy == 0 / \sx == 0).
+// \offy is written before \offx, so \offx may alias \src but \offy may not.
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+// \dst = \src + \stride * \offy + \offx
+// \offx is a 32-bit register and is zero-extended; the other operands are
+// 64-bit registers.
+.macro add_offset dst, offx, offy, src, stride
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
+//
+// Entry point: sets up registers and jumps into one of the row loops in
+// fgy_loop_neon, selected by type through L(fgy_loop_tbl). The loop pops x30
+// and returns to the caller.
+//
+// Incoming: x0 = dst, x1 = src, x2 = stride, x3 = scaling, w4 = scaling_shift,
+// x5 = grain_lut, x6 = offsets, w7 = h; clip and type are on the stack.
+//
+// State handed to the loop:
+// x5 = grain pointer for this block (offsets[0][0])
+// x4 = grain pointer for the block to the left (offsets[1][0])
+// x6 = grain pointer for the block above (offsets[0][1])
+// x8 = grain pointer for the block above-left (offsets[1][1])
+// x9 = GRAIN_WIDTH (grain row pitch)
+// v27/v28 = overlap weights for old/current grain
+// v29 = -scaling_shift, v30/v31 = min/max output pixel value
+// With y overlap: v6/v7 = weights for top/current grain on the first
+// row, w7 = 2 overlap rows, w10 = total h.
+function fgy_32x32_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w6, [x6] // offsets[0][0]
+ ldr w8, [sp, #16] // clip
+ mov x9, #GRAIN_WIDTH // grain_lut stride
+
+ neg w4, w4
+ dup v29.8h, w4 // -scaling_shift
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.16b, #16
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+
+ add x5, x5, #9 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #24] // type
+ adr x13, L(fgy_loop_tbl)
+
+ add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ // Bit 0 of type selects y overlap; the flags are consumed by b.eq below.
+ tst w11, #1
+ ldrh w11, [x13, w11, uxtw #1]
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
+
+ // Table entries are (table address - target address).
+ sub x11, x13, w11, uxtw
+
+ b.eq 1f
+ // y overlap
+ dup v6.16b, v27.b[0]
+ dup v7.16b, v27.b[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2
+1:
+ br x11
+endfunc
+
+// Row loops for fgy_32x32_8bpc_neon, entered through L(fgy_loop_tbl) with the
+// register state prepared by that function. Each iteration processes one row
+// of 32 pixels:
+// noise = round2(scaling[src] * grain, scaling_shift)
+// dst = clamp(src + noise, v30, v31)
+// The fgy macro is expanded for every combination of
+// \ox: blend the first lanes of the grain with the block to the left, and
+// \oy: blend the grain with the block above for the first rows.
+// Once the overlap rows are done, the \oy variants continue in the matching
+// non-\oy loop for the remaining rows. The exit path pops the x30 pushed by
+// the entry function and returns.
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x8], x9 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut
+
+ // v4/v5 = scaling[src] for the 32 pixels in v0/v1.
+ bl gather32_neon
+
+.if \ox
+ // Horizontal blend of the current row: old*v27 + cur*v28.
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ // Horizontal blend of the top row, then narrow both blends.
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+ // Vertical blend: cur*v7 + top*v6, using the horizontally blended
+ // values for the first 8 lanes when \ox is set.
+.if \ox
+ smull v16.8h, v20.8b, v7.8b
+.else
+ smull v16.8h, v18.8b, v7.8b
+.endif
+ smull2 v17.8h, v18.16b, v7.16b
+ smull v18.8h, v19.8b, v7.8b
+ smull2 v19.8h, v19.16b, v7.16b
+.if \ox
+ smlal v16.8h, v21.8b, v6.8b
+.else
+ smlal v16.8h, v22.8b, v6.8b
+.endif
+ smlal2 v17.8h, v22.16b, v6.16b
+ smlal v18.8h, v23.8b, v6.8b
+ smlal2 v19.8h, v23.16b, v6.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v0.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v0.16b
+ uaddw v18.8h, v18.8h, v1.8b
+ uaddw2 v19.8h, v19.8h, v1.16b
+
+ // Saturate to unsigned 8 bit, then clamp to [v30, v31].
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w7, w7, #1
+.if \oy
+ // Switch to the vertical weights for the second overlap row.
+ dup v6.16b, v28.b[0]
+ dup v7.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ // Both overlap rows done: continue without y overlap if h > 2.
+ // (sub does not touch the flags set by cmp.)
+ cmp w10, #2
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0)
+.endif
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+// Jump table indexed by type (2*ox + oy); entries are table-relative offsets.
+L(fgy_loop_tbl):
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
+//
+// The fguv macro emits the entry point fguv_32x32_<layout>_8bpc_neon with the
+// signature above; \sx and \sy are 1 when the layout is subsampled
+// horizontally / vertically. The entry point only prepares registers and then
+// jumps through L(fguv_loop_sx<sx>_tbl), indexed by type, into a row loop
+// defined in a separate function. It pushes x30 and d8 in a 32-byte frame and
+// emits no ret, so the loop code is responsible for restoring them and
+// returning.
+//
+// Incoming: x0 = dst, x1 = src, x2 = stride, x3 = scaling, x4 = data,
+// x5 = grain_lut, x6 = luma_row, x7 = luma_stride; offsets, h, uv, is_id and
+// type are on the stack.
+//
+// State handed to the loop:
+// x5 = grain pointer for this block (offsets[0][0])
+// x4 = grain pointer for the block to the left (offsets[1][0])
+// x8 = grain pointer for the block above (offsets[0][1])
+// x11 = grain pointer for the block above-left (offsets[1][1])
+// x10 = GRAIN_WIDTH (grain row pitch), w9 = rows to process,
+// x7 = luma stride (doubled when \sy is set)
+// v8.h[0] = uv_luma_mult, v8.h[1] = uv_mult, v24 = uv_offset
+// v25/v26 = vertical overlap weights for top/current grain
+// v27/v28 = horizontal overlap weights for old/current grain
+// v29 = -scaling_shift, v30/v31 = min/max output pixel value
+// With y overlap: w9 = number of overlap rows, w12 = remaining rows.
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-32]!
+ str d8, [sp, #16]
+ ldp x8, x9, [sp, #32] // offsets, h
+ ldp x10, x11, [sp, #48] // uv, is_id
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg w13, w13 // -scaling_shift
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ ld1 {v8.h}[0], [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1 {v8.h}[1], [x15] // uv_mult
+
+ dup v29.8h, w13 // -scaling_shift
+
+ cbz w12, 1f
+ // clip
+ movi v30.16b, #16
+ movi v31.16b, #240
+ cbz w11, 2f
+ // is_id
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH // grain_lut stride
+
+ add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ ldr w13, [sp, #64] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+ // Bit 0 of type selects y overlap; the flags are consumed by b.eq below.
+ tst w13, #1
+ ldrh w13, [x14, w13, uxtw #1]
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy)
+
+1:
+ // Table entries are (table address - target address).
+ sub x13, x14, w13, uxtw
+
+ // Vertical overlap weights for the first overlap row (top, current).
+.if \sy
+ movi v25.16b, #23
+ movi v26.16b, #22
+.else
+ movi v25.16b, #27
+ movi v26.16b, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13
+endfunc
+.endm
+
+// Emit the chroma entry points: 4:2:0 (subsampled in x and y), 4:2:2
+// (subsampled in x only) and 4:4:4 (no subsampling).
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon // row loops applying grain to 32 bytes of src per row; one loop variant per (csfl, ox, oy)
+.macro fguv_loop_sx0 csfl, ox, oy // csfl: use luma as-is (skip mixing with src); ox: blend with "old" grain; oy: blend with "top" grain
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1: // one iteration per row; w9 counts the rows of this pass
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b, v7.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut
+ // Without csfl: v0/v1 = sat_u8(((luma * v8.h[0] + src * v8.h[1]) >> 6) + v24)
+.if !\csfl
+ uxtl v2.8h, v0.8b
+ uxtl2 v3.8h, v0.16b
+ uxtl v4.8h, v1.8b
+ uxtl2 v5.8h, v1.16b
+ uxtl v0.8h, v6.8b
+ uxtl2 v1.8h, v6.16b
+ uxtl v16.8h, v7.8b
+ uxtl2 v17.8h, v7.16b
+ mul v2.8h, v2.8h, v8.h[0]
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v4.8h, v4.8h, v8.h[0]
+ mul v5.8h, v5.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1]
+ mul v1.8h, v1.8h, v8.h[1]
+ mul v16.8h, v16.8h, v8.h[1]
+ mul v17.8h, v17.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h
+ sqadd v3.8h, v3.8h, v1.8h
+ sqadd v4.8h, v4.8h, v16.8h
+ sqadd v5.8h, v5.8h, v17.8h
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ sshr v4.8h, v4.8h, #6
+ sshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v24.8h
+ add v3.8h, v3.8h, v24.8h
+ add v4.8h, v4.8h, v24.8h
+ add v5.8h, v5.8h, v24.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+ sqxtun v1.8b, v4.8h
+ sqxtun2 v1.16b, v5.8h
+.endif
+ // NOTE(review): gather32_neon is defined outside this chunk; presumably it maps v0/v1 to scaling bytes in v4/v5 - confirm
+ bl gather32_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b // horizontal blend of the low 8 lanes: old * v27 + current * v28
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b // same horizontal blend for the top row
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+ // Vertical blend: current * v26 + top * v25, rounded >> 5 with saturation
+.if \ox
+ smull v16.8h, v20.8b, v26.8b
+.else
+ smull v16.8h, v18.8b, v26.8b
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+ smull v18.8h, v19.8b, v26.8b
+ smull2 v19.8h, v19.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b
+.else
+ smlal v16.8h, v22.8b, v25.8b
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ smlal v18.8h, v23.8b, v25.8b
+ smlal2 v19.8h, v23.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5 // finish the horizontal blend started above
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+ uaddw v18.8h, v18.8h, v7.8b
+ uaddw2 v19.8h, v19.8h, v7.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+ // Clamp the result to the range [v30, v31]
+ umax v0.16b, v0.16b, v30.16b
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w9, w9, #1
+.if \oy
+ dup v25.16b, v28.b[0] // vertical blend weights for the next overlapped row are taken from v28.b[0] / v28.b[1]
+ dup v26.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+ // After the vertically overlapped rows, continue with the oy=0 variant (same csfl/ox) for the w12 remaining rows
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+ // Shared exit: reload d8 and x30 and pop a 32 byte frame (presumably pushed by the code that jumped here - confirm)
+9:
+ ldr d8, [sp, #16]
+ ldr x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ // Offsets (table address minus target) of the eight variants, ordered by csfl*4 + ox*2 + oy
+L(fguv_loop_sx0_tbl):
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon // row loops applying grain to 16 bytes of src per row, reading 32 luma bytes; one variant per (csfl, ox, oy)
+.macro fguv_loop_sx1 csfl, ox, oy // csfl: use averaged luma as-is (skip mixing with src); ox: blend with "old" grain; oy: blend with "top" grain
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1: // one iteration per row; w9 counts the rows of this pass
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b}, [x5], x10 // grain_lut
+ // Sum horizontally adjacent luma pairs, then take the rounded average (>> 1)
+ uaddlp v2.8h, v0.16b
+ uaddlp v3.8h, v1.16b
+.if \csfl
+ rshrn v0.8b, v2.8h, #1
+ rshrn2 v0.16b, v3.8h, #1
+.else
+ urshr v2.8h, v2.8h, #1
+ urshr v3.8h, v3.8h, #1
+ uxtl v0.8h, v6.8b
+ uxtl2 v1.8h, v6.16b
+ mul v2.8h, v2.8h, v8.h[0] // v0 = sat_u8(((luma * v8.h[0] + src * v8.h[1]) >> 6) + v24)
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1]
+ mul v1.8h, v1.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h
+ sqadd v3.8h, v3.8h, v1.8h
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v24.8h
+ add v3.8h, v3.8h, v24.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+.endif
+ // NOTE(review): gather16_neon is defined outside this chunk; presumably it maps v0 to scaling bytes in v4 - confirm
+ bl gather16_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b // horizontal blend of the low 8 lanes: old * v27 + current * v28
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b // same horizontal blend for the top row
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+ // Vertical blend: current * v26 + top * v25, rounded >> 5 with saturation
+.if \ox
+ smull v16.8h, v20.8b, v26.8b
+.else
+ smull v16.8h, v18.8b, v26.8b
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b
+.else
+ smlal v16.8h, v22.8b, v25.8b
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5 // finish the horizontal blend started above
+ sxtl2 v17.8h, v18.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ // Clamp the result to the range [v30, v31]
+ umax v0.16b, v0.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+
+.if \oy
+ mov v16.16b, v25.16b // swap the two vertical blend weights (v25 <-> v26) via v16
+.endif
+ subs w9, w9, #1
+.if \oy
+ mov v25.16b, v26.16b
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.16b}, [x0], x2 // dst
+ b.gt 1b
+ // After the vertically overlapped rows, continue with the oy=0 variant (same csfl/ox) for the w12 remaining rows
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+ // Shared exit: reload d8 and x30 and pop a 32 byte frame (presumably pushed by the code that jumped here - confirm)
+9:
+ ldr d8, [sp, #16]
+ ldr x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ // Offsets (table address minus target) of the eight variants, ordered by csfl*4 + ox*2 + oy
+L(fguv_loop_sx1_tbl):
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S
new file mode 100644
index 0000000000..75252acfb1
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain16.S
@@ -0,0 +1,1997 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr w11, w2, #3 // Generate "steps" new feedback bits for the random state in w2; clobbers w11-w13
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps // shift=1: drop the low bits so the state stays 16 bits wide
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state; shift=0: new bits go above bit 15, the state is shifted down later by the consumer
+.endif
+.endm
+
+.macro read_rand dest, bits, age // extract a "bits" wide field of x2 starting at bit (16 - bits - age); x2 is left unchanged
+ ubfx \dest, x2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits // extract a "bits" wide field of x2 starting at bit (17 - bits), then shift the state right by one
+ ubfx \dest, x2, #17 - \bits, #\bits
+ lsr w2, w2, #1
+.endm
+
+// Fetch 8 pseudo-random 16 bit entries from the gaussian table. Special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h (raw table entries; callers apply the v31 shift themselves)
+function get_gaussian_neon
+ increment_seed 4 // four new state bits, used by the next four 11 bit reads
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1 // table entries are 2 bytes each
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4 // x15 was already read from the old state; advance for lanes 4-7
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5 // store r0-r4 in full (80 bytes) plus the low halfword of r5 at x0, advancing x0 by 82 bytes
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2
+.endm
+
+function get_grain_2_neon // two scaled gaussian entries in v0.h[0..1]; same w2/x3 convention as get_gaussian_neon, clobbers x11-x15
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h // rounding shift by the per-lane amounts in v31
+ ret
+endfunc
+
+.macro get_grain_2 dst // call get_grain_2_neon (clobbers x30) and copy the low 8 bytes of v0 to dst unless dst is v0
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+function get_grain_4_neon // four scaled gaussian entries in v0.4h; same w2/x3 convention as get_gaussian_neon, clobbers x11-x15
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h // rounding shift by the per-lane amounts in v31
+ ret
+endfunc
+
+.macro get_grain_4 dst // call get_grain_4_neon (clobbers x30) and copy the low 8 bytes of v0 to dst unless dst is v0
+ bl get_grain_4_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries (for lag 2, w17 holds a coefficient instead)
+// v0 holds the vector of produced entries (rotated by one lane per entry, new entry inserted in lane 7)
+// v1 holds the input vector of sums from above (one 32 bit sum consumed per entry)
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11 // index of the noise entry for this output
+ mov w11, v1.s[0]
+ ldrsh w12, [x3, x13, lsl #1]
+ ext v0.16b, v0.16b, v0.16b, #2
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add w14, w14, w12
+ cmp w14, w5 // clamp the new entry to [w6, w5]
+ csel w14, w14, w5, le
+ cmp w14, w6
+ csel w14, w14, w6, ge
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4 // bring the next sum into v1.s[0]
+ ins v0.h[7], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon // v4/v5 = 32 bit sums of the three entries above each of 8 positions, weighted by v27/v28/v29
+ sub x12, x0, #1*GRAIN_WIDTH*2 - 16 // one row up, 8 entries to the right of x0
+ ld1 {v18.8h}, [x12] // load top right
+
+ ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid
+ ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right
+
+ smull v4.4s, v17.4h, v28.4h
+ smlal v4.4s, v0.4h, v27.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v17.8h, v28.8h
+ smlal2 v5.4s, v0.8h, v27.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+ // Slide the window of the row above one block to the right for the next call
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ ret
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff // shared body of the sum_<type>_<lag>_<edge>_neon functions; expects x30 already pushed by the enclosing function
+ bl sum_\lag\()_above_neon // v4/v5 = weighted sums over the rows above
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH*2 // uv_420: v0 = rounded average of 2x2 entries read via x19 and the row below it
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ ld1 {v24.8h, v25.8h}, [x12]
+ addp v22.8h, v22.8h, v23.8h
+ addp v23.8h, v24.8h, v25.8h
+ add v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.8h, v23.8h}, [x19], #32 // uv_422: v0 = rounded average of horizontal pairs read via x19
+ addp v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.8h}, [x19], #16 // uv_444: v0 = 8 entries read via x19 as-is
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.8b, \uv_coeff // add v0 * uv_coeff (a signed byte, sign extended here) to the sums
+ sxtl v1.8h, v1.8b
+ smlal v4.4s, v0.4h, v1.4h
+ smlal2 v5.4s, v0.8h, v1.8h
+.else
+ smlal v4.4s, v0.4h, v30.4h // no uv_coeff given: the weight is taken from v30
+ smlal2 v5.4s, v0.8h, v30.8h
+.endif
+.endif
+.if \uv_layout && \elems == 8 // the tail below is only emitted once per distinct case; other variants branch to a shared copy
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4 // left edge: lanes 5-7 get three unfiltered noise entries, which also serve as the previous outputs
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ ext v4.16b, v4.16b, v4.16b, #12 // rotate so that the sum for the 4th entry ends up in lane 0
+.ifc \lag, lag3
+ smov w17, v0.h[5]
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.h[6]
+.endif
+ smov w14, v0.h[7]
+
+ mov v1.16b, v4.16b
+ mov w15, #1 // only the 4th entry of the first half is filtered
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+.ifc \edge, right
+ mov w15, #3 // right edge: three more filtered entries, then one unfiltered noise entry as the last lane
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #2
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3 // consume the three remaining state bits at once; the reads below use the shifted state
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #14 // lane 0 = the filtered entry, lanes 1-3 = the unfiltered noise entries
+.endif
+ st1 {v0.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon // produce one 8 entry block of a lag 1 filtered row at x0 (see sum_lag_n_body)
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon // v4/v5 = 32 bit sums over 5 entries in each of the two rows above, weighted by v30.b[0..9]
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v18.8h}, [x12] // load top right
+ ld1 {v21.8h}, [x13]
+ // Row two above (v16/v17/v18): the four off-centre taps use coefficient bytes 0, 1, 3 and 4
+ dup v26.8b, v30.b[0]
+ ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid
+ dup v27.8b, v30.b[1]
+ ext v23.16b, v16.16b, v17.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[3]
+ ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.8b, v30.b[4]
+ ext v1.16b, v17.16b, v18.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smull v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+ // Row one above (v19/v20/v21): the four off-centre taps use coefficient bytes 5, 6, 8 and 9
+ dup v26.16b, v30.b[5]
+ ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid
+ dup v27.16b, v30.b[6]
+ ext v23.16b, v19.16b, v20.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.16b, v30.b[8]
+ ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.16b, v30.b[9]
+ ext v1.16b, v20.16b, v21.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smlal v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smlal2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+ // Centre taps of both rows: coefficient bytes 2 and 7
+ dup v26.16b, v30.b[2]
+ dup v27.16b, v30.b[7]
+ sxtl v26.8h, v26.8b
+ sxtl v27.8h, v27.8b
+
+ smlal v4.4s, v17.4h, v26.4h
+ smlal v4.4s, v20.4h, v27.4h
+ smlal2 v5.4s, v17.8h, v26.8h
+ smlal2 v5.4s, v20.8h, v27.8h
+ mov v16.16b, v17.16b // slide both row windows one block to the right for the next call
+ mov v17.16b, v18.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon // produce one 8 entry block of a lag 2 filtered row at x0 (see sum_lag_n_body)
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_above_neon // v4/v5 = 32 bit sums over 7 entries in each of the three rows above; weights are v29.b[0..15], v30.b[0..4]; uses v8-v15
+ sub x11, x0, #3*GRAIN_WIDTH*2 - 16
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v15.8h}, [x11] // load top right
+ ld1 {v18.8h}, [x12]
+ ld1 {v21.8h}, [x13]
+ // Row three above (v13/v14/v15): coefficient bytes v29.b[0..6], with b[3] as the centre tap
+ dup v22.8b, v29.b[0]
+ ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[1]
+ ext v9.16b, v13.16b, v14.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[2]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[3]
+ ext v10.16b, v13.16b, v14.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[4]
+ ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[5]
+ ext v12.16b, v14.16b, v15.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[6]
+ ext v13.16b, v14.16b, v15.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smull v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v14.4h, v25.4h
+ smull2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v14.8h, v25.8h
+ // Row two above (v16/v17/v18): coefficient bytes v29.b[7..13], with b[10] as the centre tap
+ dup v22.8b, v29.b[7]
+ ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[8]
+ ext v9.16b, v16.16b, v17.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[9]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[10]
+ ext v10.16b, v16.16b, v17.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[11]
+ ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[12]
+ ext v12.16b, v17.16b, v18.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[13]
+ ext v13.16b, v17.16b, v18.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v17.4h, v25.4h
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v17.8h, v25.8h
+ // Row one above (v19/v20/v21): coefficient bytes v29.b[14..15] and v30.b[0..4], with v30.b[1] as the centre tap
+ dup v22.8b, v29.b[14]
+ ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[15]
+ ext v9.16b, v19.16b, v20.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v30.b[0]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v30.b[1]
+ ext v10.16b, v19.16b, v20.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v30.b[2]
+ ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v30.b[3]
+ ext v12.16b, v20.16b, v21.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[4]
+ ext v13.16b, v20.16b, v21.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v20.4h, v25.4h
+ mov v16.16b, v17.16b // slide the row windows one block to the right for the next call
+ mov v17.16b, v18.16b
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v20.8h, v25.8h
+ // v13 was used as scratch above; it is rewritten here as part of sliding the top row window
+ mov v13.16b, v14.16b
+ mov v14.16b, v15.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon // produce one 8 entry block of a lag 3 filtered row at x0 (see sum_lag_n_body)
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH*2
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v14.8h}, [x11] // load the previous block right above
+ ld1 {v17.8h}, [x12]
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon // fill w1 rows of 82 unfiltered, scaled noise entries (16 bit each) at x0; clobbers w16
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1: // per row
+ mov w16, #80
+2: // 10 blocks of 8 entries
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_2 v0 // the last 2 entries of the row
+ subs w1, w1, #1
+ st1 {v0.s}[0], [x0], #4
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon // fill w1 rows of 44 unfiltered, scaled noise entries at x0, keeping the GRAIN_WIDTH row stride; clobbers w16
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1: // per row
+ mov w16, #40
+2: // 5 blocks of 8 entries
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_4 v0 // the last 4 entries of the row
+ subs w1, w1, #1
+ st1 {v0.4h}, [x0]
+ add x0, x0, #GRAIN_WIDTH*2-80 // 80 bytes were already stepped over; move to the start of the next row
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_444_lag0_neon // 8 entries at x0: clamp(noise + round((v4 & v1) * v27)), where v4 is read via x19
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v4.8h}, [x19], #16 // presumably co-located luma grain entries - confirm against the caller of the exported function
+gen_grain_uv_lag0_8_start: // shared entry point: v4 already holds the 8 input entries
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+gen_grain_uv_lag0_8_add: // shared entry point: v0 already holds the scaled noise
+ and v4.16b, v4.16b, v1.16b // v1 is a lane mask; masked-off lanes get plain noise
+ smull v2.4s, v4.4h, v27.4h
+ smull2 v3.4s, v4.8h, v27.8h
+ srshl v2.4s, v2.4s, v28.4s // rounding shift by the per-lane amounts in v28
+ srshl v3.4s, v3.4s, v28.4s
+ sqxtn v2.4h, v2.4s
+ sqxtn2 v2.8h, v3.4s
+ sqadd v2.8h, v2.8h, v0.8h
+ smin v2.8h, v2.8h, v25.8h // clamp to [v26, v25]
+ smax v2.8h, v2.8h, v26.8h
+ st1 {v2.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon // v4 = rounded 2x2 averages of 16 entries from two rows at x19, then continue in the shared lag 0 code
+ AARCH64_SIGN_LINK_REGISTER
+ add x12, x19, #GRAIN_WIDTH*2 // the row below
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32
+ ld1 {v18.8h, v19.8h}, [x12]
+ addp v16.8h, v16.8h, v17.8h
+ addp v17.8h, v18.8h, v19.8h
+ add v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #2
+ b gen_grain_uv_lag0_8_start // tail in gen_grain_uv_444_lag0_neon; it pops x30 and returns
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon // v4 = rounded averages of horizontal pairs of 16 entries at x19, then continue in the shared lag 0 code
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32
+ addp v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #1
+ b gen_grain_uv_lag0_8_start // tail in gen_grain_uv_444_lag0_neon; it pops x30 and returns
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon // 4 entry variant: v4.4h = rounded 2x2 averages of 8 entries from two rows at x19
+ add x12, x19, #GRAIN_WIDTH*2 // the row below
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19]
+ ld1 {v18.4h, v19.4h}, [x12]
+ add x19, x19, #32 // advance by 32 bytes although only 16 were read, like the 8 entry variant
+ addp v16.4h, v16.4h, v17.4h
+ addp v17.4h, v18.4h, v19.4h
+ add v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #2
+ get_grain_4 v0
+ b gen_grain_uv_lag0_8_add // shared tail stores a full 8 entry vector; only the low 4 lanes hold computed data here
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon // 4 entry variant: v4.4h = rounded averages of horizontal pairs of 8 entries at x19
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19]
+ add x19, x19, #32 // advance by 32 bytes although only 16 were read, like the 8 entry variant
+ addp v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #1
+ get_grain_4 v0
+ b gen_grain_uv_lag0_8_add // shared tail stores a full 8 entry vector; only the low 4 lanes hold computed data here
+endfunc
+
+.macro gen_grain_82 type // emits generate_grain_<type>_16bpc_neon for 82 entry wide grain rows (type = y or uv_444)
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]! // 96 byte frame; the lag 3 path also stores d8-d15 and x20/x21 in it
+ // NOTE(review): presumably the arguments are y: (x0 buf, x1 data, w2 bitdepth_max), uv: (x0 buf, x1 buf_y, x2 data, w3 uv, w4 bitdepth_max) - confirm against the C prototype
+.ifc \type, uv_444
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH*2 // x19 = the x1 buffer, three rows down
+ mov x1, x2
+ mul w13, w13, w14 // w13 = w3 * 28, byte offset added to the coefficient pointer below
+ clz w15, w4
+.else
+ clz w15, w2
+.endif
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1] // table entry selected by ar_coeff_lag
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw // entries are stored as (table - target)
+ neg v31.8h, v31.8h // negative amount: srshl by v31 is a rounding right shift
+
+.ifc \type, uv_444
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne // seed xor constant: 0x49d8 if w13 is nonzero, else 0xb524
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+.ifc \type, uv_444
+ eor w2, w2, w11
+.endif
+
+ br x16
+ // lag 0: y is plain noise for all rows; uv adds a term derived from the x19 buffer
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5 // upper clamp bound
+ dup v26.8h, w6 // lower clamp bound
+ ext v29.16b, v0.16b, v1.16b, #10 // mask with the first 3 lanes cleared (left edge block)
+ ext v30.16b, v1.16b, v0.16b, #2 // mask with the last lane cleared (right edge block)
+ neg v28.4s, v28.4s // -ar_coeff_shift, so srshl by v28 is a rounding right shift
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3
+ bl generate_grain_rows_neon // the first 3 rows are plain noise
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_uv_444_lag0_neon // 8
+ movi v1.16b, #255
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ mov v1.16b, v30.16b
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 v16 // the last 2 entries of the row are plain noise
+ subs w1, w1, #1
+ add x19, x19, #4 // skip the matching 2 entries in the x19 buffer
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ // lag 1: three coefficients for the row above in v27-v29, coefficient for the previous output in w4
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon // the first 3 rows are plain noise
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+.ifc \type, uv_444
+ sxtl v30.8h, v30.8b
+.endif
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 v16 // the last 2 entries of the row are plain noise
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ // lag 2: coefficients stay packed as bytes in v30; those for the two previous outputs go to w17 and w4
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ // lag 3: coefficients packed in v29/v30; v8-v15 and x20/x21 are used, so they are saved in the frame first
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ // Jump table indexed by ar_coeff_lag; entries are (table - target)
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type // dst = number of rows left after the first 3: SUB_GRAIN_HEIGHT-3 for uv_420, GRAIN_HEIGHT-3 otherwise
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type // after 6*32 bytes were consumed from a row, move reg to the next input row: two rows ahead for uv_420, one row otherwise
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #(3*GRAIN_WIDTH-3)*2
+ mov x1, x2
+ mul w13, w13, w14
+ clz w15, w4
+
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+ eor w2, w2, w11
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5
+ dup v26.8h, w6
+ ext v29.16b, v0.16b, v1.16b, #10
+ ext v30.16b, v1.16b, v0.16b, #14
+ neg v28.4s, v28.4s
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ movi v1.16b, #255
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ mov v1.16b, v30.16b
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2
+
+ mov w1, #3
+ ld1r {v30.8b}, [x4] // ar_coeffs_u4[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+ sxtl v30.8h, v30.8b
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off // table lookup: even lanes of \src1 fill even lanes of \dst1, odd lanes of \src2 fill odd lanes of \dst2; clobbers x14-x17
+ umov w14, \src1[0] // extract a lane as an unsigned index
+ umov w15, \src2[1]
+ umov w16, \src1[2]
+ add x14, x14, x3 // x3 is the table base address supplied by the caller
+ umov w17, \src2[3]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14] // load one table element into destination lane 0+\off
+ umov w14, \src1[4]
+ add x16, x16, x3
+ ld1 {\dst2}[1+\off], [x15]
+ umov w15, \src2[5]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6]
+ add x14, x14, x3
+ ld1 {\dst2}[3+\off], [x17]
+ umov w17, \src2[7]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[5+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[7+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2, src3, src4 // net effect: \dst1 lanes 0-7/8-15 = table[\src1]/table[\src2], \dst2 lanes 0-7/8-15 = table[\src3]/table[\src4]
+ gather_interleaved \dst1, \dst2, \src1, \src3, 0 // even lanes of \dst1, odd lanes of \dst2
+ gather_interleaved \dst2, \dst1, \src3, \src1, 0 // the complementary lanes of the same pair
+ gather_interleaved \dst1, \dst2, \src2, \src4, 8 // same pattern for destination lanes 8-15
+ gather_interleaved \dst2, \dst1, \src4, \src2, 8
+.endm
+
+function gather32_neon // 32 lookups at base x3: indices in v0-v3 (.h lanes), results in v6/v7 (.b lanes); clobbers x14-x17
+ gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h // v6 = table[v0], table[v1]; v7 = table[v2], table[v3]
+ ret
+endfunc
+
+function gather16_neon // 16 lookups at base x3: indices in v0/v1 (.h lanes), results in v6 (.b lanes); clobbers v7 and x14-x17
+ gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 // even lanes of v6 from v0, odd lanes of v7 from v1
+ gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 // even lanes of v7 from v1, odd lanes of v6 from v0
+ ins v6.d[1], v7.d[0] // pack: low half of v7 becomes the high half of v6
+ ret
+endfunc
+
+const overlap_coeffs_0, align=4 // loaded into v27.4h/v28.4h by fgy_32x32 and by fguv when \sx == 0
+ .short 27, 17, 0, 0 // first 4 halfwords -> v27.4h (multiplies the "old" grain in the x-overlap blend)
+ .short 17, 27, 32, 32 // next 4 halfwords -> v28.4h (multiplies the current grain); results are shifted right by 5
+endconst
+
+const overlap_coeffs_1, align=4 // loaded into v27.4h/v28.4h by fguv when \sx == 1
+ .short 23, 0, 0, 0 // first 4 halfwords -> v27.4h (multiplies the "old" grain in the x-overlap blend)
+ .short 22, 32, 32, 32 // next 4 halfwords -> v28.4h (multiplies the current grain); results are shifted right by 5
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy // split \src into \offx/\offy; each is doubled when its \sx/\sy flag is 0
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride // \dst = \src + \stride * \offy + 2 * \offx (bytes)
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw #1 // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1 // setup only: computes constants and grain_lut pointers, then jumps into fgy_loop_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]! // 80-byte frame; the matching restore and ret are in fgy_loop_neon
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ str d14, [sp, #64]
+ eor w4, w4, #15 // 15 - scaling_shift
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w10, [sp, #96] // bitdepth_max
+ ldr w6, [x6] // offsets[0][0]
+ dup v26.8h, w10 // bitdepth_max
+ clz w10, w10
+ ldr w8, [sp, #80] // clip
+ sub w10, w10, #24 // -bitdepth_min_8
+ mov x9, #GRAIN_WIDTH*2 // grain_lut stride
+ neg w10, w10 // bitdepth_min_8
+
+ dup v29.8h, w4 // 15 - scaling_shift
+ dup v27.8h, w10 // bitdepth_min_8
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #235
+ sshl v30.8h, v30.8h, v27.8h // v30 = 16 << bitdepth_min_8 (lower bound used by umax in the loop)
+ sshl v31.8h, v31.8h, v27.8h // v31 = 235 << bitdepth_min_8 (upper bound used by umin in the loop)
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v26.16b // bitdepth_max
+2:
+
+ ushr v26.8h, v26.8h, #1 // grain_max
+ not v25.16b, v26.16b // grain_min
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+
+ add x5, x5, #18 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0 // offx/offy pairs: (w11,w12), (w13,w14), (w15,w16), (w6,w10)
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9 // x12, x14, x16 and x5: grain_lut positions for the four offsets entries
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #88] // type
+ adr x13, L(fgy_loop_tbl) // table lives at the end of fgy_loop_neon
+
+ add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+
+ tst w11, #1 // flags reflect type bit 0; no instruction before the b.eq below modifies them
+ ldrh w11, [x13, w11, uxtw #1] // table entry indexed by type
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw // x11 = address of the selected L(loop_<ox><oy>) variant
+
+ b.eq 1f
+ // y overlap
+ dup v8.8h, v27.h[0] // first-row vertical blend weights come from v27 lanes 0 and 1
+ dup v9.8h, v27.h[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2 // the overlap variant runs 2 rows before falling back to the non-overlap loop
+1:
+ br x11 // continue in fgy_loop_neon
+endfunc
+
+function fgy_loop_neon // loop bodies entered via br from fgy_32x32_16bpc_neon; also holds that function's epilogue
+.macro fgy ox, oy // \ox / \oy enable the horizontal / vertical overlap code paths
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src
+.if \ox
+ ld1 {v20.4h}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v14.4h}, [x8], x9 // grain_lut top old
+.endif
+ mvni v4.8h, #0xf0, lsl #8 // 0x0fff
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v4.16b
+ and v1.16b, v1.16b, v4.16b
+ and v2.16b, v2.16b, v4.16b
+ and v3.16b, v3.16b, v4.16b
+ bl gather32_neon // v6/v7 = table at x3 indexed by the masked pixels in v0-v3
+
+.if \ox
+ smull v20.4s, v20.4h, v27.4h // old * v27 + cur * v28 on the first 4 columns
+ smlal v20.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v14.4s, v14.4h, v27.4h // same horizontal blend applied to the top row
+ smlal v14.4s, v21.4h, v28.4h
+ sqrshrn v20.4h, v20.4s, #5
+ sqrshrn v14.4h, v14.4s, #5
+ smin v20.4h, v20.4h, v26.4h // clamp to [grain_min, grain_max] (v25, v26)
+ smin v14.4h, v14.4h, v26.4h
+ smax v20.4h, v20.4h, v25.4h
+ smax v14.4h, v14.4h, v25.4h
+.endif
+
+.if \ox
+ smull v10.4s, v20.4h, v9.4h // first 4 columns use the x-blended values
+.else
+ smull v10.4s, v16.4h, v9.4h
+.endif
+ smull2 v11.4s, v16.8h, v9.8h // vertical blend: cur * v9 ...
+ smull v12.4s, v17.4h, v9.4h
+ smull2 v13.4s, v17.8h, v9.8h
+ smull v16.4s, v18.4h, v9.4h
+ smull2 v17.4s, v18.8h, v9.8h
+ smull v18.4s, v19.4h, v9.4h
+ smull2 v19.4s, v19.8h, v9.8h
+.if \ox
+ smlal v10.4s, v14.4h, v8.4h
+.else
+ smlal v10.4s, v21.4h, v8.4h
+.endif
+ smlal2 v11.4s, v21.8h, v8.8h // ... plus top * v8
+ smlal v12.4s, v22.4h, v8.4h
+ smlal2 v13.4s, v22.8h, v8.8h
+ smlal v16.4s, v23.4h, v8.4h
+ smlal2 v17.4s, v23.8h, v8.8h
+ smlal v18.4s, v24.4h, v8.4h
+ smlal2 v19.4s, v24.8h, v8.8h
+ sqrshrn v10.4h, v10.4s, #5 // rounding shift by 5 with saturating narrow
+ sqrshrn2 v10.8h, v11.4s, #5
+ sqrshrn v11.4h, v12.4s, #5
+ sqrshrn2 v11.8h, v13.4s, #5
+ sqrshrn v12.4h, v16.4s, #5
+ sqrshrn2 v12.8h, v17.4s, #5
+ sqrshrn v13.4h, v18.4s, #5
+ sqrshrn2 v13.8h, v19.4s, #5
+ smin v16.8h, v10.8h, v26.8h // clamp to [grain_min, grain_max]; blended grain ends up back in v16-v19
+ smin v17.8h, v11.8h, v26.8h
+ smin v18.8h, v12.8h, v26.8h
+ smin v19.8h, v13.8h, v26.8h
+ smax v16.8h, v16.8h, v25.8h
+ smax v17.8h, v17.8h, v25.8h
+ smax v18.8h, v18.8h, v25.8h
+ smax v19.8h, v19.8h, v25.8h
+.endif
+
+ uxtl v4.8h, v6.8b // scaling
+.if \ox && !\oy
+ sqrshrn v20.4h, v20.4s, #5
+.endif
+ uxtl2 v5.8h, v6.16b
+.if \ox && !\oy
+ smin v20.4h, v20.4h, v26.4h
+.endif
+ uxtl v6.8h, v7.8b
+.if \ox && !\oy
+ smax v20.4h, v20.4h, v25.4h
+.endif
+ uxtl2 v7.8h, v7.16b
+.if \ox && !\oy
+ ins v16.d[0], v20.d[0] // replace the first 4 grain values with the x-blended ones
+.endif
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v21.8h, v17.8h, v5.8h
+ sqrdmulh v22.8h, v18.8h, v6.8h
+ sqrdmulh v23.8h, v19.8h, v7.8h
+
+ usqadd v0.8h, v20.8h // *src + noise
+ usqadd v1.8h, v21.8h
+ usqadd v2.8h, v22.8h
+ usqadd v3.8h, v23.8h
+
+ umax v0.8h, v0.8h, v30.8h // clamp to the [v30, v31] range chosen by the clip setting
+ umax v1.8h, v1.8h, v30.8h
+ umax v2.8h, v2.8h, v30.8h
+ umax v3.8h, v3.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w7, w7, #1
+.if \oy
+ dup v8.8h, v28.h[0] // switch to the second-row vertical weights from v28 lanes 0 and 1
+ dup v9.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b // flags come from the subs above
+
+.if \oy
+ cmp w10, #2
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0) // rows remain after the 2 overlap rows: continue in the variant without y overlap
+.endif
+ ldr d14, [sp, #64] // epilogue matching the prologue of fgy_32x32_16bpc_neon
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl): // indexed by type in fgy_32x32_16bpc_neon: bit 0 selects oy, bit 1 selects ox
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy // emits fguv_32x32_\layout\()_16bpc_neon: setup code that jumps into fguv_loop_sx\sx\()_neon
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]! // 80-byte frame; the matching restore and ret are at label 9 of the loop functions
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+
+ ldp x8, x9, [sp, #80] // offsets, h
+ ldp x10, x11, [sp, #96] // uv, is_id
+ ldr w16, [sp, #120] // bitdepth_max
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ dup v23.8h, w16 // bitdepth_max
+ clz w16, w16
+ eor w13, w13, #15 // 15 - scaling_shift
+ sub w16, w16, #24 // -bitdepth_min_8
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ neg w16, w16 // bitdepth_min_8
+ ld1r {v8.8h}, [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1r {v9.8h}, [x15] // uv_mult
+
+ dup v29.8h, w13 // 15 - scaling_shift
+ dup v27.8h, w16 // bitdepth_min_8
+
+ cbz w12, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #240
+ sshl v30.8h, v30.8h, v27.8h // v30 = 16 << bitdepth_min_8
+ sshl v31.8h, v31.8h, v27.8h // v31 = 240 << bitdepth_min_8
+ cbz w11, 2f
+ // is_id
+ movi v31.8h, #235
+ sshl v31.8h, v31.8h, v27.8h // with is_id set the upper bound is 235 << bitdepth_min_8 instead
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v23.16b // bitdepth_max
+2:
+
+ ushr v15.8h, v23.8h, #1 // grain_max
+ sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8
+ not v14.16b, v15.16b // grain_min
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH*2 // grain_lut stride
+
+ add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy // offx/offy pairs: (w12,w13), (w14,w15), (w16,w17), (w8,w11)
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10 // x13, x15, x17 and x5: grain_lut positions for the four offsets entries
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+ add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
+
+ ldr w13, [sp, #112] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl) // 8-entry table at the end of fguv_loop_sx\sx\()_neon
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+ tst w13, #1 // flags reflect type bit 0; no instruction before the b.eq below modifies them
+ ldrh w13, [x14, w13, uxtw #1] // table entry indexed by type
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy) // number of rows processed with vertical overlap
+
+1:
+ sub x13, x14, w13, uxtw // x13 = address of the selected loop variant
+
+.if \sy
+ movi v25.8h, #23 // first-row vertical blend weights: v25 multiplies the top grain, v26 the current grain
+ movi v26.8h, #22
+.else
+ movi v25.8h, #27
+ movi v26.8h, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13 // continue in fguv_loop_sx\sx\()_neon
+endfunc
+.endm
+
+fguv 420, 1, 1 // layout 420: sx = 1, sy = 1
+fguv 422, 1, 0 // layout 422: sx = 1, sy = 0
+fguv 444, 0, 0 // layout 444: sx = 0, sy = 0
+
+function fguv_loop_sx0_neon // loop bodies for the fguv expansion with sx = 0 (32 pixels per row); entered via br
+.macro fguv_loop_sx0 csfl, ox, oy // \csfl picks luma-only scaling input; \ox / \oy enable horizontal / vertical overlap
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v4.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v5.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v4.4s, v4.4h, v27.4h // old * v27 + cur * v28 on the first 4 columns
+ smlal v4.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v5.4s, v5.4h, v27.4h // same horizontal blend applied to the top row
+ smlal v5.4s, v0.4h, v28.4h
+ sqrshrn v4.4h, v4.4s, #5
+ sqrshrn v5.4h, v5.4s, #5
+ smin v4.4h, v4.4h, v15.4h // clamp to [grain_min, grain_max] (v14, v15)
+ smin v5.4h, v5.4h, v15.4h
+ smax v4.4h, v4.4h, v14.4h
+ smax v5.4h, v5.4h, v14.4h
+ ins v16.d[0], v4.d[0] // put the x-blended values back into the current and top rows
+ ins v0.d[0], v5.d[0]
+.endif
+
+ smull v6.4s, v16.4h, v26.4h // vertical blend: cur * v26 ...
+ smull2 v7.4s, v16.8h, v26.8h
+ smull v10.4s, v17.4h, v26.4h
+ smull2 v11.4s, v17.8h, v26.8h
+ smull v16.4s, v18.4h, v26.4h
+ smull2 v17.4s, v18.8h, v26.8h
+ smull v18.4s, v19.4h, v26.4h
+ smull2 v19.4s, v19.8h, v26.8h
+ smlal v6.4s, v0.4h, v25.4h // ... plus top * v25
+ smlal2 v7.4s, v0.8h, v25.8h
+ smlal v10.4s, v1.4h, v25.4h
+ smlal2 v11.4s, v1.8h, v25.8h
+ smlal v16.4s, v2.4h, v25.4h
+ smlal2 v17.4s, v2.8h, v25.8h
+ smlal v18.4s, v3.4h, v25.4h
+ smlal2 v19.4s, v3.8h, v25.8h
+ sqrshrn v6.4h, v6.4s, #5 // rounding shift by 5 with saturating narrow
+ sqrshrn2 v6.8h, v7.4s, #5
+ sqrshrn v7.4h, v10.4s, #5
+ sqrshrn2 v7.8h, v11.4s, #5
+ sqrshrn v10.4h, v16.4s, #5
+ sqrshrn2 v10.8h, v17.4s, #5
+ sqrshrn v11.4h, v18.4s, #5
+ sqrshrn2 v11.8h, v19.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v4.4h, v4.4s, #5
+ smin v4.4h, v4.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v6.8h, v15.8h // clamp to [grain_min, grain_max]; blended grain ends up back in v16-v19
+ smin v17.8h, v7.8h, v15.8h
+ smin v18.8h, v10.8h, v15.8h
+ smin v19.8h, v11.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+ smax v18.8h, v18.8h, v14.8h
+ smax v19.8h, v19.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v4.4h, v4.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v4.d[0] // replace the first 4 grain values with the x-blended ones
+.endif
+
+.if !\csfl
+ smull v4.4s, v0.4h, v8.4h // luma * uv_luma_mult ...
+ smull2 v5.4s, v0.8h, v8.8h
+ smull v6.4s, v1.4h, v8.4h
+ smull2 v7.4s, v1.8h, v8.8h
+ smull v0.4s, v2.4h, v8.4h
+ smull2 v1.4s, v2.8h, v8.8h
+ smull v2.4s, v3.4h, v8.4h
+ smull2 v3.4s, v3.8h, v8.8h
+ smlal v4.4s, v10.4h, v9.4h // ... plus src * uv_mult
+ smlal2 v5.4s, v10.8h, v9.8h
+ smlal v6.4s, v11.4h, v9.4h
+ smlal2 v7.4s, v11.8h, v9.8h
+ smlal v0.4s, v12.4h, v9.4h
+ smlal2 v1.4s, v12.8h, v9.8h
+ smlal v2.4s, v13.4h, v9.4h
+ smlal2 v3.4s, v13.8h, v9.8h
+ shrn v4.4h, v4.4s, #6 // truncating shift right by 6, narrowed to 16 bits
+ shrn2 v4.8h, v5.4s, #6
+ shrn v5.4h, v6.4s, #6
+ shrn2 v5.8h, v7.4s, #6
+ shrn v6.4h, v0.4s, #6
+ shrn2 v6.8h, v1.4s, #6
+ shrn v7.4h, v2.4s, #6
+ shrn2 v7.8h, v3.4s, #6
+ add v0.8h, v4.8h, v24.8h // + (uv_offset << bitdepth_min_8)
+ add v1.8h, v5.8h, v24.8h
+ add v2.8h, v6.8h, v24.8h
+ add v3.8h, v7.8h, v24.8h
+ movi v20.8h, #0
+ smin v0.8h, v0.8h, v23.8h // clamp to [0, bitdepth_max] so the values are valid gather indices
+ smin v1.8h, v1.8h, v23.8h
+ smin v2.8h, v2.8h, v23.8h
+ smin v3.8h, v3.8h, v23.8h
+ smax v0.8h, v0.8h, v20.8h
+ smax v1.8h, v1.8h, v20.8h
+ smax v2.8h, v2.8h, v20.8h
+ smax v3.8h, v3.8h, v20.8h
+.else
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+ and v2.16b, v2.16b, v23.16b
+ and v3.16b, v3.16b, v23.16b
+.endif
+
+ bl gather32_neon // v6/v7 = table at x3 indexed by v0-v3
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+ uxtl v6.8h, v7.8b
+ uxtl2 v7.8h, v7.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+ sqrdmulh v18.8h, v18.8h, v6.8h
+ sqrdmulh v19.8h, v19.8h, v7.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+ usqadd v12.8h, v18.8h
+ usqadd v13.8h, v19.8h
+
+ umax v0.8h, v10.8h, v30.8h // clamp to the [v30, v31] range chosen by the clip setting
+ umax v1.8h, v11.8h, v30.8h
+ umax v2.8h, v12.8h, v30.8h
+ umax v3.8h, v13.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w9, w9, #1
+.if \oy
+ dup v25.8h, v28.h[0] // switch to the second-row vertical weights from v28 lanes 0 and 1
+ dup v26.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b // flags come from the subs above
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) // rows remain: continue in the variant without y overlap
+.endif
+ b 9f // shared epilogue after the macro expansions
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64] // epilogue matching the prologue emitted by the fguv macro
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl): // indexed by type: bit 0 selects oy, bit 1 selects ox, bit 2 selects csfl
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon // loop bodies for the fguv expansions with sx = 1 (16 chroma pixels per row); entered via br
+.macro fguv_loop_sx1 csfl, ox, oy // \csfl picks luma-only scaling input; \ox / \oy enable horizontal / vertical overlap
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v18.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v19.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v18.4s, v18.4h, v27.4h // old * v27 + cur * v28 on the first 4 columns
+ smlal v18.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v19.4s, v19.4h, v27.4h // same horizontal blend applied to the top row
+ smlal v19.4s, v20.4h, v28.4h
+ sqrshrn v18.4h, v18.4s, #5
+ sqrshrn v19.4h, v19.4s, #5
+ smin v18.4h, v18.4h, v15.4h // clamp to [grain_min, grain_max] (v14, v15)
+ smin v19.4h, v19.4h, v15.4h
+ smax v18.4h, v18.4h, v14.4h
+ smax v19.4h, v19.4h, v14.4h
+ ins v16.d[0], v18.d[0] // put the x-blended values back into the current and top rows
+ ins v20.d[0], v19.d[0]
+.endif
+
+ smull v0.4s, v16.4h, v26.4h // vertical blend: cur * v26 ...
+ smull2 v1.4s, v16.8h, v26.8h
+ smull v2.4s, v17.4h, v26.4h
+ smull2 v3.4s, v17.8h, v26.8h
+ smlal v0.4s, v20.4h, v25.4h // ... plus top * v25
+ smlal2 v1.4s, v20.8h, v25.8h
+ smlal v2.4s, v21.4h, v25.4h
+ smlal2 v3.4s, v21.8h, v25.8h
+ sqrshrn v16.4h, v0.4s, #5 // rounding shift by 5 with saturating narrow
+ sqrshrn2 v16.8h, v1.4s, #5
+ sqrshrn v17.4h, v2.4s, #5
+ sqrshrn2 v17.8h, v3.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v18.4h, v18.4s, #5
+ smin v18.4h, v18.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v16.8h, v15.8h // clamp the blended grain to [grain_min, grain_max]
+ smin v17.8h, v17.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v18.4h, v18.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v18.d[0] // replace the first 4 grain values with the x-blended ones
+.endif
+ addp v0.8h, v0.8h, v1.8h // sum horizontally adjacent luma pairs: 32 values -> 16
+ addp v1.8h, v2.8h, v3.8h
+ urshr v0.8h, v0.8h, #1 // rounded halving gives the average of each pair
+ urshr v1.8h, v1.8h, #1
+.if !\csfl
+ smull v2.4s, v0.4h, v8.4h // luma * uv_luma_mult ...
+ smull2 v3.4s, v0.8h, v8.8h
+ smull v0.4s, v1.4h, v8.4h
+ smull2 v1.4s, v1.8h, v8.8h
+ smlal v2.4s, v10.4h, v9.4h // ... plus src * uv_mult
+ smlal2 v3.4s, v10.8h, v9.8h
+ smlal v0.4s, v11.4h, v9.4h
+ smlal2 v1.4s, v11.8h, v9.8h
+ shrn v2.4h, v2.4s, #6 // truncating shift right by 6, narrowed to 16 bits
+ shrn2 v2.8h, v3.4s, #6
+ shrn v3.4h, v0.4s, #6
+ shrn2 v3.8h, v1.4s, #6
+ add v0.8h, v2.8h, v24.8h // + (uv_offset << bitdepth_min_8)
+ add v1.8h, v3.8h, v24.8h
+ movi v2.8h, #0
+ smin v0.8h, v0.8h, v23.8h // clamp to [0, bitdepth_max] so the values are valid gather indices
+ smin v1.8h, v1.8h, v23.8h
+ smax v0.8h, v0.8h, v2.8h
+ smax v1.8h, v1.8h, v2.8h
+.else
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+.endif
+
+ bl gather16_neon // v6 = table at x3 indexed by v0/v1
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+
+ umax v0.8h, v10.8h, v30.8h // clamp to the [v30, v31] range chosen by the clip setting
+ umax v1.8h, v11.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+
+.if \oy
+ mov v16.16b, v25.16b // v16 is free again here; used as scratch to swap v25 and v26
+.endif
+ subs w9, w9, #1
+.if \oy
+ mov v25.16b, v26.16b // swapped weights apply to the next overlap row, if any
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.8h, v1.8h}, [x0], x2 // dst
+ b.gt 1b // flags come from the subs above
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) // rows remain: continue in the variant without y overlap
+.endif
+
+ b 9f // shared epilogue after the macro expansions
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64] // epilogue matching the prologue emitted by the fguv macro
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl): // indexed by type: bit 0 selects oy, bit 1 selects ox, bit 2 selects csfl
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S
new file mode 100644
index 0000000000..709238e2f8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/ipred.S
@@ -0,0 +1,5294 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1 // fills the width x height block at dst with the constant byte 128
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25 // clz(width) - 25: 0 for width 64 up to 4 for width 4
+ ldrh w3, [x5, w3, uxtw #1] // table entry for this width
+ movi v0.16b, #128
+ sub x5, x5, w3, uxtw // x5 = address of the width-specific loop
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // stride *= 2: x0 and x6 each write every other row
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4 // 4 rows are written per iteration
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ movi v1.16b, #128 // second register needed for 32-byte rows
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ movi v1.16b, #128 // three more registers needed for 64-byte rows
+ movi v2.16b, #128
+ movi v3.16b, #128
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl): // (table - target) offsets, ordered from width 64 down to width 4
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 16b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1 // copies the row starting at topleft + 1 into every row of the block
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25 // clz(width) - 25: 0 for width 64 up to 4 for width 4
+ ldrh w3, [x5, w3, uxtw #1] // table entry for this width
+ add x2, x2, #1 // x2 = topleft + 1, the first pixel of the row above
+ sub x5, x5, w3, uxtw // x5 = address of the width-specific code
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // stride *= 2: x0 and x6 each write every other row
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2] // load the 4 top pixels once, before the store loop
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4 // 4 rows are written per iteration
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl): // (table - target) offsets, ordered from width 64 down to width 4
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1 // fills each row with one byte read from below topleft, walking downwards in memory
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25 // clz(width) - 25: 0 for width 64 up to 4 for width 4
+ ldrh w3, [x5, w3, uxtw #1] // table entry for this width
+ sub x2, x2, #4 // x2 = topleft - 4: the 4 bytes used by the first 4 rows
+ sub x5, x5, w3, uxtw // x5 = address of the width-specific loop
+ mov x7, #-4 // post-index step: move 4 bytes back after each ld4r
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // stride *= 2: x0 and x6 each write every other row
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // v3 = byte nearest topleft, v0 = byte furthest from it, each replicated
+ st1 {v3.s}[0], [x0], x1 // rows are written in the order v3, v2, v1, v0
+ st1 {v2.s}[0], [x6], x1
+ subs w4, w4, #4 // 4 rows are written per iteration
+ st1 {v1.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.8b}, [x0], x1
+ st1 {v2.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16] // bytes 16-31 of the row first; the st1 below writes bytes 0-15 and advances the pointer
+ str q2, [x6, #16]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16] // bytes 16-31
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32] // bytes 32-63
+ stp q2, q2, [x6, #32]
+ st1 {v3.16b}, [x0], x1 // bytes 0-15, then advance to the next row pair
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl): // (table - target) offsets, ordered from width 64 down to width 4
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1 // fills the block with the rounded average of the width pixels starting at topleft + 1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25 // clz(width) - 25: 0 for width 64 up to 4 for width 4
+ ldrh w3, [x5, w3, uxtw #1] // table entry for this width
+ add x2, x2, #1 // x2 = topleft + 1, the first pixel of the row above
+ sub x5, x5, w3, uxtw // x5 = address of the width-specific code
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // stride *= 2: x0 and x6 each write every other row
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2] // the 4 top pixels, replicated into both 32-bit lanes
+ uaddlv h0, v0.8b // sums all 8 bytes, i.e. twice the 4-pixel sum
+ rshrn v0.8b, v0.8h, #3 // rounding shift by 3 = (2 * sum) / 8 = sum / 4
+ dup v0.8b, v0.b[0]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4 // 4 rows are written per iteration
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3 // rounded sum / 8
+ dup v0.8b, v0.b[0]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4 // rounded sum / 16
+ dup v0.16b, v0.b[0]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v2.4h, v0.4h, v1.4h // combine the two 16-pixel partial sums
+ rshrn v2.8b, v2.8h, #5 // rounded sum / 32
+ dup v0.16b, v2.b[0]
+ dup v1.16b, v2.b[0]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v4.4h, v0.4h, v1.4h // combine the four 16-pixel partial sums
+ add v5.4h, v2.4h, v3.4h
+ add v4.4h, v4.4h, v5.4h
+ rshrn v4.8b, v4.8h, #6 // rounded sum / 64
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl): // (table - target) offsets, ordered from width 64 down to width 4
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1 // fill the block with the rounded average of the `height` pixels stored just below topleft in memory
+ sub x2, x2, w4, uxtw // x2 = topleft - height, lowest address of the left column
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25 // height index into the first five (h*) table entries
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw // x3 = store loop for this width
+ sub x5, x5, w7, uxtw // x5 = averaging code for this height
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // x1 = 2*stride
+ br x5 // first average the left column, then jump on to x3
+
+L(ipred_dc_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2] // the 4 left pixels, replicated into both 32-bit lanes
+ uaddlv h0, v0.8b // 2 * (sum of the 4 pixels)
+ rshrn v0.8b, v0.8h, #3 // rounded (2*sum)/8, i.e. rounded sum/4
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w4): // store four rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b // sum of the 8 left pixels
+ rshrn v0.8b, v0.8h, #3 // rounded sum/8
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w8): // store four rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b // sum of the 16 left pixels
+ rshrn v0.8b, v0.8h, #4 // rounded sum/16
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w16): // store four rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt L(ipred_dc_left_w16)
+ ret
+
+L(ipred_dc_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h // sum of the 32 left pixels
+ rshrn v0.8b, v0.8h, #5 // rounded sum/32
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w32):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b // second 16-byte half of each 32-pixel row
+1: // store four rows per iteration
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h // sum of the 64 left pixels
+ rshrn v0.8b, v0.8h, #6 // rounded sum/64
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w64):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b // replicate the DC value across all 64 bytes of a row
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1: // store four rows per iteration
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl): // entries 0-4: per-height averaging code; entries 5-9: per-width store loops
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1 // fill the block with the rounded average of the left column and the top row together
+ sub x2, x2, w4, uxtw // x2 = topleft - height, lowest address of the left column
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.8h, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25 // height index into the first five (h*) table entries
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw // x3 = per-width code (top sum, division, store loop)
+ sub x5, x5, w6, uxtw // x5 = per-height code (left sum)
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w7 // -ctz(width + height)
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // x1 = 2*stride
+ br x5
+
+L(ipred_dc_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], #4 // 4 left pixels; x2 now equals topleft
+ ins v0.s[1], wzr // zero the other four bytes that uaddlv sums
+ uaddlv h0, v0.8b // v0 = sum of the left column
+ add x2, x2, #1 // x2 = topleft + 1, start of the top row
+ br x3
+L(ipred_dc_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.s}[0], [x2]
+ ins v1.s[1], wzr
+ add v0.4h, v0.4h, v16.4h // left sum + rounding bias (width + height) >> 1
+ uaddlv h1, v1.8b // sum of the top row
+ cmp w4, #4
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h // negative shift count: >> ctz(width + height)
+ b.eq 1f // square block: width + height is a power of two, no further division
+ // h = 8/16
+ mov w16, #(0x3334/2) // low half: ~1/5 in 0.16 fixed point, halved for sqdmulh
+ movk w16, #(0x5556/2), lsl #16 // high half: ~1/3 in 0.16 fixed point, halved for sqdmulh
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17 // h=8: shift 16 brings 1/3 down; h=16: count 32 wraps to 0, 1/5 stays in the low half
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * v0 * multiplier) >> 16
+1:
+ dup v0.8b, v0.b[0]
+2: // store four rows per iteration
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], #8 // 8 left pixels; x2 now equals topleft
+ uaddlv h0, v0.8b
+ add x2, x2, #1 // x2 = topleft + 1, start of the top row
+ br x3
+L(ipred_dc_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h // left sum + rounding bias
+ uaddlv h1, v1.8b // sum of the top row
+ cmp w4, #8
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2) // ~1/5, used for h=32 (8+32 = 8*5)
+ mov w17, #(0x5556/2) // ~1/3, used for h=4 and h=16 (12 = 4*3, 24 = 8*3)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * v0 * multiplier) >> 16
+1:
+ dup v0.8b, v0.b[0]
+2: // store four rows per iteration
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2], #16 // 16 left pixels; x2 now equals topleft
+ uaddlv h0, v0.16b
+ add x2, x2, #1 // x2 = topleft + 1, start of the top row
+ br x3
+L(ipred_dc_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h // left sum + rounding bias
+ uaddlv h1, v1.16b // sum of the top row
+ cmp w4, #16
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #(0x3334/2) // ~1/5, used when the test is zero: h=4 or h=64 (20 = 4*5, 80 = 16*5)
+ mov w17, #(0x5556/2) // ~1/3, used for h=8 and h=32 (24 = 8*3, 48 = 16*3)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * v0 * multiplier) >> 16
+1:
+ dup v0.16b, v0.b[0]
+2: // store four rows per iteration
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2], #32 // 32 left pixels; x2 now equals topleft
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add x2, x2, #1 // x2 = topleft + 1, start of the top row
+ add v0.4h, v0.4h, v1.4h // v0 = sum of the left column
+ br x3
+L(ipred_dc_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h // left sum + rounding bias
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v2.4h
+ ushl v4.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #(0x3334/2) // ~1/5, used for h=8 (40 = 8*5)
+ mov w17, #(0x5556/2) // ~1/3, used for h=16 and h=64 (48 = 16*3, 96 = 32*3)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v4.4h, v4.4h, v16.4h // (2 * v4 * multiplier) >> 16
+1:
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+2: // store four rows per iteration
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // 64 left pixels; x2 now equals topleft
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add x2, x2, #1 // x2 = topleft + 1, start of the top row
+ add v0.4h, v0.4h, v2.4h // v0 = sum of the left column
+ br x3
+L(ipred_dc_w64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h // left sum + rounding bias
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ uaddlv h4, v4.16b
+ add v1.4h, v1.4h, v2.4h
+ add v3.4h, v3.4h, v4.4h
+ cmp w4, #64
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v4.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f
+ // h = 16/32
+ mov w16, #(0x5556/2) // low half: ~1/3
+ movk w16, #(0x3334/2), lsl #16 // high half: ~1/5
+ lsr w16, w16, w4 // h=16: shift 16 brings 1/5 down (80 = 16*5); h=32: count wraps to 0, 1/3 stays (96 = 32*3)
+ dup v16.4h, w16
+ sqdmulh v4.4h, v4.4h, v16.4h // (2 * v4 * multiplier) >> 16
+1:
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+2: // store four rows per iteration
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl): // entries 0-4: per-height left sums; entries 5-9: per-width top sum, division and store
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1 // per pixel, pick whichever of left/top/topleft is closest to base = left + top - topleft
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25 // table index: 0 for width 64 ... 4 for width 4
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x2] // v4 = topleft pixel in every lane
+ add x8, x2, #1 // x8 = start of the top row
+ sub x2, x2, #4 // x2 = topleft - 4: the left pixels of the first four rows
+ sub x5, x5, w9, uxtw
+ mov x7, #-4 // left pointer steps down 4 bytes per group of four rows
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // x1 = 2*stride
+ br x5
+40: // width == 4
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.4s}, [x8] // the 4 top pixels, replicated four times
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left pixels: v3 = row 0 ... v0 = row 3
+ zip1 v0.2s, v0.2s, v1.2s // left for rows 3 and 2
+ zip1 v2.2s, v2.2s, v3.2s // left for rows 1 and 0
+ uaddw v16.8h, v6.8h, v0.8b
+ uaddw v17.8h, v6.8h, v2.8b
+ sqxtun v16.8b, v16.8h // base
+ sqxtun2 v16.16b, v17.8h
+ zip1 v0.2d, v0.2d, v2.2d // left for rows 3, 2, 1, 0 in one vector
+ uabd v20.16b, v5.16b, v16.16b // tdiff
+ uabd v22.16b, v4.16b, v16.16b // tldiff
+ uabd v16.16b, v0.16b, v16.16b // ldiff
+ umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
+ cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff
+ cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
+ bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ...
+ st1 {v20.s}[3], [x0], x1 // row 0
+ st1 {v20.s}[2], [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v20.s}[1], [x0], x1 // row 2
+ st1 {v20.s}[0], [x6], x1 // row 3
+ b.gt 4b
+ ret
+80: // width == 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.2d}, [x8] // the 8 top pixels, replicated twice
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left pixels: v3 = row 0 ... v0 = row 3
+ uaddw v16.8h, v6.8h, v0.8b
+ uaddw v17.8h, v6.8h, v1.8b
+ uaddw v18.8h, v6.8h, v2.8b
+ uaddw v19.8h, v6.8h, v3.8b
+ sqxtun v16.8b, v16.8h // base
+ sqxtun2 v16.16b, v17.8h
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ zip1 v2.2d, v2.2d, v3.2d // left for rows 1 and 0
+ zip1 v0.2d, v0.2d, v1.2d // left for rows 3 and 2
+ uabd v21.16b, v5.16b, v18.16b // tdiff
+ uabd v20.16b, v5.16b, v16.16b
+ uabd v23.16b, v4.16b, v18.16b // tldiff
+ uabd v22.16b, v4.16b, v16.16b
+ uabd v17.16b, v2.16b, v18.16b // ldiff
+ uabd v16.16b, v0.16b, v16.16b
+ umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
+ umin v18.16b, v20.16b, v22.16b
+ cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff
+ cmhs v20.16b, v22.16b, v20.16b
+ cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
+ cmhs v16.16b, v18.16b, v16.16b
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v21.d}[1], [x0], x1 // row 0
+ st1 {v21.d}[0], [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1 // row 2
+ st1 {v20.d}[0], [x6], x1 // row 3
+ b.gt 8b
+ ret
+160: // widths 16, 32 and 64 share this code, 16 columns at a time
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v5.16b}, [x8], #16 // first 16 top pixels
+ mov w9, w3 // keep the width to reset the column counter and rewind x8
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1 // x1 is 2*stride here, so x5 = row 2
+ add x10, x6, x1 // x10 = row 3
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw // x1 = 4*stride - width: the stores already advance by width
+1:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 // left pixels: v3 = row 0 ... v0 = row 3
+2:
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+ usubl2 v7.8h, v5.16b, v4.16b
+ uaddw v24.8h, v6.8h, v0.8b
+ uaddw v25.8h, v7.8h, v0.8b
+ uaddw v26.8h, v6.8h, v1.8b
+ uaddw v27.8h, v7.8h, v1.8b
+ uaddw v28.8h, v6.8h, v2.8b
+ uaddw v29.8h, v7.8h, v2.8b
+ uaddw v30.8h, v6.8h, v3.8b
+ uaddw v31.8h, v7.8h, v3.8b
+ sqxtun v17.8b, v26.8h // base
+ sqxtun2 v17.16b, v27.8h
+ sqxtun v16.8b, v24.8h
+ sqxtun2 v16.16b, v25.8h
+ sqxtun v19.8b, v30.8h
+ sqxtun2 v19.16b, v31.8h
+ sqxtun v18.8b, v28.8h
+ sqxtun2 v18.16b, v29.8h
+ uabd v23.16b, v5.16b, v19.16b // tdiff
+ uabd v22.16b, v5.16b, v18.16b
+ uabd v21.16b, v5.16b, v17.16b
+ uabd v20.16b, v5.16b, v16.16b
+ uabd v27.16b, v4.16b, v19.16b // tldiff
+ uabd v26.16b, v4.16b, v18.16b
+ uabd v25.16b, v4.16b, v17.16b
+ uabd v24.16b, v4.16b, v16.16b
+ uabd v19.16b, v3.16b, v19.16b // ldiff
+ uabd v18.16b, v2.16b, v18.16b
+ uabd v17.16b, v1.16b, v17.16b
+ uabd v16.16b, v0.16b, v16.16b
+ umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
+ umin v30.16b, v22.16b, v26.16b
+ umin v29.16b, v21.16b, v25.16b
+ umin v28.16b, v20.16b, v24.16b
+ cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff
+ cmhs v22.16b, v26.16b, v22.16b
+ cmhs v21.16b, v25.16b, v21.16b
+ cmhs v20.16b, v24.16b, v20.16b
+ cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
+ cmhs v18.16b, v30.16b, v18.16b
+ cmhs v17.16b, v29.16b, v17.16b
+ cmhs v16.16b, v28.16b, v16.16b
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ subs w3, w3, #16 // 16 columns done
+ st1 {v23.16b}, [x0], #16 // row 0
+ st1 {v22.16b}, [x6], #16 // row 1
+ st1 {v21.16b}, [x5], #16 // row 2
+ st1 {v20.16b}, [x10], #16 // row 3
+ b.le 8f
+ ld1 {v5.16b}, [x8], #16 // next 16 top pixels
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw // rewind to the start of the top row
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.16b}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9 // reset the column counter
+ b 1b
+9:
+ ret
+
+L(ipred_paeth_tbl): // indexed by clz(width) - 25, widest first
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1 // average of a horizontal (left/right) and a vertical (top/bottom) weighted blend
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw // x11 = &sm_weights[height], read as weights_ver
+ add x10, x10, w3, uxtw // x10 = &sm_weights[width], read as weights_hor
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw // x12 = &topleft[-height]
+ sub w9, w9, #25 // table index: 0 for width 64 ... 4 for width 4
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x12] // bottom
+ add x8, x2, #1 // x8 = start of the top row
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // x1 = 2*stride
+ br x5
+40: // width == 4
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2s}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #4 // x2 = topleft - 4: the left pixels of the first four rows
+ mov x7, #-4
+ dup v5.16b, v6.b[3] // right
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ zip1 v1.2s, v1.2s, v0.2s // left, flipped
+ zip1 v0.2s, v3.2s, v2.2s
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ shll v22.8h, v4.8b, #8 // bottom*256
+ shll v23.8h, v4.8b, #8
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h
+ mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v23.8h, v6.8h, v18.8h
+ uhadd v20.8h, v20.8h, v22.8h // (horizontal + vertical) >> 1
+ uhadd v21.8h, v21.8h, v23.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn v21.8b, v21.8h, #8
+ st1 {v20.s}[0], [x0], x1 // row 0
+ st1 {v20.s}[1], [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v21.s}[0], [x0], x1 // row 2
+ st1 {v21.s}[1], [x6], x1 // row 3
+ b.gt 4b
+ ret
+80: // width == 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8b}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #4 // x2 = topleft - 4: the left pixels of the first four rows
+ mov x7, #-4
+ dup v5.16b, v6.b[7] // right
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v3.8h, v3.8b, v5.8b
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v2.8h, v7.8h // (left flipped)
+ mla v22.8h, v1.8h, v7.8h
+ mla v23.8h, v0.8h, v7.8h
+ mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v6.8h, v17.8h
+ mla v26.8h, v6.8h, v18.8h
+ mla v27.8h, v6.8h, v19.8h
+ uhadd v20.8h, v20.8h, v24.8h // (horizontal + vertical) >> 1
+ uhadd v21.8h, v21.8h, v25.8h
+ uhadd v22.8h, v22.8h, v26.8h
+ uhadd v23.8h, v23.8h, v27.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn v21.8b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v20.8b}, [x0], x1 // row 0
+ st1 {v21.8b}, [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v22.8b}, [x0], x1 // row 2
+ st1 {v23.8b}, [x6], x1 // row 3
+ b.gt 8b
+ ret
+160: // widths 16, 32 and 64 share this code: two rows at a time, 16 columns per inner iteration
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x12, x2, w3, uxtw // x12 = &topleft[width], last pixel of the top row
+ sub x2, x2, #2 // x2 = topleft - 2: the left pixels of the first two rows
+ mov x7, #-2
+ ld1r {v5.16b}, [x12] // right
+ sub x1, x1, w3, uxtw // x1 = 2*stride - width: the stores already advance by width
+ mov w9, w3 // keep the width to reset the counter and rewind x8/x10
+
+1:
+ ld2r {v0.8b, v1.8b}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v3.16b}, [x8], #16 // top
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ usubl v2.8h, v3.8b, v4.8b // top-bottom
+ usubl2 v3.8h, v3.16b, v4.16b
+ mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h // (left flipped)
+ mla v22.8h, v0.8h, v6.8h
+ mla v23.8h, v0.8h, v7.8h
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v3.8h, v16.8h
+ mla v26.8h, v2.8h, v17.8h
+ mla v27.8h, v3.8h, v17.8h
+ uhadd v20.8h, v20.8h, v24.8h // (horizontal + vertical) >> 1
+ uhadd v21.8h, v21.8h, v25.8h
+ uhadd v22.8h, v22.8h, v26.8h
+ uhadd v23.8h, v23.8h, v27.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ subs w3, w3, #16 // 16 columns done
+ st1 {v20.16b}, [x0], #16 // row 0
+ st1 {v22.16b}, [x6], #16 // row 1
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x8, w9, uxtw // rewind the top row pointer
+ sub x10, x10, w9, uxtw // rewind the weights_hor pointer
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9 // reset the column counter
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_tbl): // indexed by clz(width) - 25, widest first
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1 // vertical-only blend between the top row and the bottom pixel, weighted per row
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw // x7 = &sm_weights[height], read as weights_ver
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw // x8 = &topleft[-height]
+ sub w9, w9, #25 // table index: 0 for width 64 ... 4 for width 4
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x8] // bottom
+ add x2, x2, #1 // x2 = start of the top row
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // x1 = 2*stride
+ br x5
+40: // width == 4
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2s}, [x2] // top
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ shll v22.8h, v4.8b, #8 // bottom*256
+ shll v23.8h, v4.8b, #8
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v23.8h, v6.8h, v18.8h
+ rshrn v22.8b, v22.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn v23.8b, v23.8h, #8
+ st1 {v22.s}[0], [x0], x1 // row 0
+ st1 {v22.s}[1], [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v23.s}[0], [x0], x1 // row 2
+ st1 {v23.s}[1], [x6], x1 // row 3
+ b.gt 4b
+ ret
+80: // width == 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8b}, [x2] // top
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v6.8h, v17.8h
+ mla v26.8h, v6.8h, v18.8h
+ mla v27.8h, v6.8h, v19.8h
+ rshrn v24.8b, v24.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn v25.8b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn v27.8b, v27.8h, #8
+ st1 {v24.8b}, [x0], x1 // row 0
+ st1 {v25.8b}, [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v26.8b}, [x0], x1 // row 2
+ st1 {v27.8b}, [x6], x1 // row 3
+ b.gt 8b
+ ret
+160: // widths 16, 32 and 64 share this code: four rows at a time, 16 columns per inner iteration
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1 // x1 is 2*stride here, so x5 = row 2
+ add x8, x6, x1 // x8 = row 3 (its earlier use as the bottom pointer is finished)
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw // x1 = 4*stride - width: the stores already advance by width
+ mov w9, w3 // keep the width to reset the counter and rewind x2
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+2:
+ ld1 {v3.16b}, [x2], #16 // top
+ shll v20.8h, v4.8b, #8 // bottom*256
+ shll v21.8h, v4.8b, #8
+ shll v22.8h, v4.8b, #8
+ shll v23.8h, v4.8b, #8
+ shll v24.8h, v4.8b, #8
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ usubl v2.8h, v3.8b, v4.8b // top-bottom
+ usubl2 v3.8h, v3.16b, v4.16b
+ mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v21.8h, v3.8h, v16.8h
+ mla v22.8h, v2.8h, v17.8h
+ mla v23.8h, v3.8h, v17.8h
+ mla v24.8h, v2.8h, v18.8h
+ mla v25.8h, v3.8h, v18.8h
+ mla v26.8h, v2.8h, v19.8h
+ mla v27.8h, v3.8h, v19.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ rshrn v24.8b, v24.8h, #8
+ rshrn2 v24.16b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn2 v26.16b, v27.8h, #8
+ subs w3, w3, #16 // 16 columns done
+ st1 {v20.16b}, [x0], #16 // row 0
+ st1 {v22.16b}, [x6], #16 // row 1
+ st1 {v24.16b}, [x5], #16 // row 2
+ st1 {v26.16b}, [x8], #16 // row 3
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x2, x2, w9, uxtw // rewind the top row pointer
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9 // reset the column counter
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_v_tbl): // indexed by clz(width) - 25, widest first
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1 // horizontal-only blend between each row's left pixel and the right pixel, weighted per column
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw // x8 = &sm_weights[width], read as weights_hor
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw // x12 = &topleft[width], last pixel of the top row
+ sub w9, w9, #25 // table index: 0 for width 64 ... 4 for width 4
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.16b}, [x12] // right
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1 // x6 = dst + stride (second row)
+ lsl x1, x1, #1 // x1 = 2*stride
+ br x5
+40: // width == 4
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #4 // x2 = topleft - 4: the left pixels of the first four rows
+ mov x7, #-4
+ uxtl v7.8h, v7.8b // weights_hor
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ zip1 v1.2s, v1.2s, v0.2s // left, flipped
+ zip1 v0.2s, v3.2s, v2.2s
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn v21.8b, v21.8h, #8
+ st1 {v20.s}[0], [x0], x1 // row 0
+ st1 {v20.s}[1], [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v21.s}[0], [x0], x1 // row 2
+ st1 {v21.s}[1], [x6], x1 // row 3
+ b.gt 4b
+ ret
+80: // width == 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #4 // x2 = topleft - 4: the left pixels of the first four rows
+ mov x7, #-4
+ uxtl v7.8h, v7.8b // weights_hor
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ usubl v3.8h, v3.8b, v5.8b // left-right
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v0.8h, v0.8b, v5.8b
+ mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v2.8h, v7.8h // (left flipped)
+ mla v22.8h, v1.8h, v7.8h
+ mla v23.8h, v0.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn v21.8b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v20.8b}, [x0], x1 // row 0
+ st1 {v21.8b}, [x6], x1 // row 1
+ subs w4, w4, #4
+ st1 {v22.8b}, [x0], x1 // row 2
+ st1 {v23.8b}, [x6], x1 // row 3
+ b.gt 8b
+ ret
+160: // widths 16, 32 and 64 share this code: four rows at a time, 16 columns per inner iteration
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ sub x2, x2, #4 // x2 = topleft - 4: the left pixels of the first four rows
+ mov x7, #-4
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1 // x1 is 2*stride here, so x5 = row 2
+ add x10, x6, x1 // x10 = row 3
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw // x1 = 4*stride - width: the stores already advance by width
+ mov w9, w3 // keep the width to reset the counter and rewind x8
+
+1:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v3.8h, v3.8b, v5.8b
+2:
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ shll v24.8h, v5.8b, #8
+ shll v25.8h, v5.8b, #8
+ shll v26.8h, v5.8b, #8
+ shll v27.8h, v5.8b, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v3.8h, v7.8h // (left flipped)
+ mla v22.8h, v2.8h, v6.8h
+ mla v23.8h, v2.8h, v7.8h
+ mla v24.8h, v1.8h, v6.8h
+ mla v25.8h, v1.8h, v7.8h
+ mla v26.8h, v0.8h, v6.8h
+ mla v27.8h, v0.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8 // rounded >> 8 back to 8 bits
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ rshrn v24.8b, v24.8h, #8
+ rshrn2 v24.16b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn2 v26.16b, v27.8h, #8
+ subs w3, w3, #16 // 16 columns done
+ st1 {v20.16b}, [x0], #16 // row 0
+ st1 {v22.16b}, [x6], #16 // row 1
+ st1 {v24.16b}, [x5], #16 // row 2
+ st1 {v26.16b}, [x10], #16 // row 3
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw // rewind the weights_hor pointer
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9 // reset the column counter
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_h_tbl): // indexed by clz(width) - 25, widest first
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
+const padding_mask_buf // 32 bytes of 0x00 immediately followed by 32 bytes of 0xff
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+padding_mask: // a load from (padding_mask - n) starts with n zero bytes, then 0xff; used with `bit` to overwrite lanes >= n
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
+// const pixel *const in, const int end);
+function ipred_z1_upsample_edge_8bpc_neon, export=1 // 2x upsample of an edge: writes 32 bytes interleaving in[i+1] with a 4-tap (-1, 9, 9, -1)/16 filtered pixel; hsz (w1) is not read here
+ movrel x4, padding_mask
+ ld1 {v0.16b}, [x2] // in[]
+ add x5, x2, w3, uxtw // in[end]
+ sub x4, x4, w3, uxtw // mask load address: `end` zero bytes, then 0xff
+
+ ld1r {v1.16b}, [x5] // padding
+ ld1 {v3.16b}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v1.16b, v3.16b // padded in[]
+
+ ext v4.16b, v0.16b, v1.16b, #1 // in[i+1]
+ ext v5.16b, v0.16b, v1.16b, #2 // in[i+2]
+ ext v6.16b, v0.16b, v1.16b, #3 // in[i+3]
+
+ uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2]
+ uaddl2 v17.8h, v4.16b, v5.16b
+ uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3]
+ uaddl2 v19.8h, v0.16b, v6.16b
+ mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
+ mul v17.8h, v17.8h, v31.8h
+ sub v16.8h, v16.8h, v18.8h // minus (in[i+0] + in[i+3]); may go negative
+ sub v17.8h, v17.8h, v19.8h
+
+ sqrshrun v16.8b, v16.8h, #4 // rounded >> 4, saturated to 0..255
+ sqrshrun2 v16.16b, v17.8h, #4
+
+ zip1 v0.16b, v4.16b, v16.16b // out[2*i] = in[i+1], out[2*i+1] = filtered pixel
+ zip2 v1.16b, v4.16b, v16.16b
+
+ st1 {v0.16b, v1.16b}, [x0]
+
+ ret
+endfunc
+
+// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
+// const pixel *const in);
+function ipred_z2_upsample_edge_8bpc_neon, export=1 // 2x upsample of an edge: interleaves in[i] with a 4-tap (-1, 9, 9, -1)/16 filtered pixel
+ // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
+ movrel x4, padding_mask
+ ld1 {v0.16b}, [x2] // in[]
+ add x5, x2, w1, uxtw // in[sz]
+ sub x4, x4, w1, uxtw // mask load address: `sz` zero bytes, then 0xff
+
+ ld1r {v2.16b}, [x2] // in[0] for padding
+ ld1r {v1.16b}, [x5] // padding
+ ld1 {v3.16b}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v1.16b, v3.16b // padded in[]
+
+ ext v4.16b, v2.16b, v0.16b, #15 // in[i-1], with in[0] repeated for i = 0
+ ext v5.16b, v0.16b, v1.16b, #1 // in[i+1]
+ ext v6.16b, v0.16b, v1.16b, #2 // in[i+2]
+
+ uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1]
+ uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2]
+ mul v16.8h, v16.8h, v31.8h // 9*(in[i+0] + in[i+1])
+ sub v16.8h, v16.8h, v18.8h // minus (in[i-1] + in[i+2]); may go negative
+
+ sqrshrun v16.8b, v16.8h, #4 // rounded >> 4, saturated to 0..255
+
+ add x5, x0, #16 // x5 = &out[16]
+
+ zip1 v2.16b, v0.16b, v16.16b // out[2*i] = in[i], out[2*i+1] = filtered pixel
+
+ st1 {v1.b}[0], [x5] // out[16] = in[sz]
+ // In case sz=8, output one single pixel in out[16].
+ st1 {v2.16b}, [x0]
+
+ ret
+endfunc
+
+const edge_filter // one 4-byte row per strength; ipred_z1_filter_edge reads bytes 1-2 of a row as the outer and centre taps of a symmetric 3-tap kernel
+ .byte 0, 4, 8, 0 // strength 1: taps 4, 8, 4
+ .byte 0, 5, 6, 0 // strength 2: taps 5, 6, 5
+// Leaving out the coeffs for strength=3
+// .byte 2, 4, 4, 0
+endconst
+
+// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
+// const pixel *const in, const int end, const int strength);
+// x0 = out, w1 = sz, x2 = in, w3 = end, w4 = strength. Output is written 16 pixels at a time; input past in[end] is replaced by in[end].
+function ipred_z1_filter_edge_8bpc_neon, export=1
+ cmp w4, #3
+ b.eq L(fivetap) // if (strength == 3) goto fivetap
+
+ movrel x5, edge_filter, -3
+ add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1
+
+ ld1 {v31.h}[0], [x5] // kernel[1-2]
+
+ ld1 {v0.16b}, [x2], #16 // in[0..15]
+
+ dup v30.16b, v31.b[0] // kernel[1]: weight of in[i] and in[i+2]
+ dup v31.16b, v31.b[1] // kernel[2]: weight of in[i+1]
+1:
+ // in[end] is the last valid pixel. We produce 16 pixels out by
+ // using 18 pixels in - the last pixel used is [17] of the ones
+ // read/buffered.
+ cmp w3, #17
+ ld1 {v1.16b}, [x2], #16 // next 16 pixels; x2 is now 32 bytes past the start of v0
+ b.lt 2f // if (end < 17) pad before filtering
+ ext v2.16b, v0.16b, v1.16b, #1 // in[i+1]
+ ext v3.16b, v0.16b, v1.16b, #2 // in[i+2]
+ umull v4.8h, v0.8b, v30.8b
+ umlal v4.8h, v2.8b, v31.8b
+ umlal v4.8h, v3.8b, v30.8b
+ umull2 v5.8h, v0.16b, v30.16b
+ umlal2 v5.8h, v2.16b, v31.16b
+ umlal2 v5.8h, v3.16b, v30.16b
+ subs w1, w1, #16 // sz -= 16; flags are consumed by b.gt below
+ mov v0.16b, v1.16b // slide the input window by 16
+ rshrn v4.8b, v4.8h, #4 // (sum + 8) >> 4
+ rshrn2 v4.16b, v5.8h, #4
+ sub w3, w3, #16 // end -= 16 (does not touch the flags)
+ st1 {v4.16b}, [x0], #16
+ b.gt 1b // loop while sz > 0
+ ret
+2:
+ // Right padding
+
+ // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
+ movrel x5, padding_mask
+ sub w6, w3, #32 // offset of in[end] relative to x2
+ sub x5, x5, w3, uxtw // padding_mask - end; the table itself is defined outside this chunk
+ add x6, x2, w6, sxtw
+
+ ld1 {v2.16b}, [x5] // padding_mask
+
+ ld1r {v1.16b}, [x6] // v1 = padding pixel in all lanes
+ bit v0.16b, v1.16b, v2.16b // Pad v0-v1
+
+ // Filter one block
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ umull v4.8h, v0.8b, v30.8b
+ umlal v4.8h, v2.8b, v31.8b
+ umlal v4.8h, v3.8b, v30.8b
+ umull2 v5.8h, v0.16b, v30.16b
+ umlal2 v5.8h, v2.16b, v31.16b
+ umlal2 v5.8h, v3.16b, v30.16b
+ subs w1, w1, #16
+ rshrn v4.8b, v4.8h, #4
+ rshrn2 v4.16b, v5.8h, #4
+ st1 {v4.16b}, [x0], #16
+ b.le 9f // done if sz <= 0
+5:
+ // After one block, any remaining output would only be filtering
+ // padding - thus just store the padding.
+ subs w1, w1, #16
+ st1 {v1.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+
+L(fivetap):
+ sub x2, x2, #1 // topleft -= 1
+ movi v29.16b, #2 // five-tap kernel is {2, 4, 4, 4, 2}
+ ld1 {v0.16b}, [x2], #16
+ movi v30.16b, #4
+ movi v31.16b, #4
+ ins v0.b[0], v0.b[1] // overwrite the byte read from in[-1] with in[0]
+1:
+ // in[end+1] is the last valid pixel. We produce 16 pixels out by
+ // using 20 pixels in - the last pixel used is [19] of the ones
+ // read/buffered.
+ cmp w3, #18
+ ld1 {v1.16b}, [x2], #16
+ b.lt 2f // if (end + 1 < 19)
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v0.16b, v1.16b, #3
+ ext v5.16b, v0.16b, v1.16b, #4
+ umull v6.8h, v0.8b, v29.8b
+ umlal v6.8h, v2.8b, v30.8b
+ umlal v6.8h, v3.8b, v31.8b
+ umlal v6.8h, v4.8b, v30.8b
+ umlal v6.8h, v5.8b, v29.8b
+ umull2 v7.8h, v0.16b, v29.16b
+ umlal2 v7.8h, v2.16b, v30.16b
+ umlal2 v7.8h, v3.16b, v31.16b
+ umlal2 v7.8h, v4.16b, v30.16b
+ umlal2 v7.8h, v5.16b, v29.16b
+ subs w1, w1, #16 // sz -= 16; flags are consumed by b.gt below
+ mov v0.16b, v1.16b
+ rshrn v6.8b, v6.8h, #4 // (sum + 8) >> 4
+ rshrn2 v6.16b, v7.8h, #4
+ sub w3, w3, #16
+ st1 {v6.16b}, [x0], #16
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
+ movrel x5, padding_mask, -1
+ sub w6, w3, #31
+ sub x5, x5, w3, uxtw
+ add x6, x2, w6, sxtw
+
+ ld1 {v2.16b, v3.16b}, [x5] // padding_mask
+
+ ld1r {v28.16b}, [x6] // v28 = padding pixel in all lanes
+ bit v0.16b, v28.16b, v2.16b // Pad v0-v1
+ bit v1.16b, v28.16b, v3.16b
+4:
+ // Filter one block
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v0.16b, v1.16b, #3
+ ext v5.16b, v0.16b, v1.16b, #4
+ umull v6.8h, v0.8b, v29.8b
+ umlal v6.8h, v2.8b, v30.8b
+ umlal v6.8h, v3.8b, v31.8b
+ umlal v6.8h, v4.8b, v30.8b
+ umlal v6.8h, v5.8b, v29.8b
+ umull2 v7.8h, v0.16b, v29.16b
+ umlal2 v7.8h, v2.16b, v30.16b
+ umlal2 v7.8h, v3.16b, v31.16b
+ umlal2 v7.8h, v4.16b, v30.16b
+ umlal2 v7.8h, v5.16b, v29.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ mov v1.16b, v28.16b // everything after the padded v1 is pure padding
+ rshrn v6.8b, v6.8h, #4
+ rshrn2 v6.16b, v7.8h, #4
+ sub w3, w3, #16
+ st1 {v6.16b}, [x0], #16
+ b.le 9f
+ // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
+ // filter properly once more - aka (w3 >= 0).
+ cmp w3, #0
+ b.ge 4b
+5:
+ // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
+ // last valid pixel - thus just output that without filtering.
+ subs w1, w1, #16
+ st1 {v1.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+endfunc
+
+// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, const int n);
+// Fills out[] with px using 16-byte stores: n is effectively rounded up to a multiple of 16, and one store is issued even when n <= 0.
+function ipred_pixel_set_8bpc_neon, export=1
+ dup v0.16b, w1 // v0 = px in all 16 lanes
+1:
+ subs w2, w2, #16 // n -= 16; flags are consumed by b.gt below
+ st1 {v0.16b}, [x0], #16
+ b.gt 1b // loop while n > 0
+ ret
+endfunc
+
+// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top, const int width, const int height,
+// const int dx, const int max_base_x);
+// Row r blends top[base] and top[base+1] with xpos = (r+1)*dx, base = xpos >> 6, frac = xpos & 0x3e; once base >= max_base_x the remaining rows are filled with top[max_base_x].
+function ipred_z1_fill1_8bpc_neon, export=1
+ clz w9, w3
+ adr x8, L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25 // clz(64) = 25 -> index 0, ..., clz(4) = 29 -> index 4
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw // top[max_base_x]
+ sub x8, x8, w9, uxtw // table entries are (table - label) offsets
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5 // xpos = dx
+ mov w15, #64
+ br x8
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f // only the first row of the pair is tested
+ ldr d0, [x2, w8, uxtw] // top[base]
+ ldr d2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ ext v1.8b, v0.8b, v0.8b, #1 // top[base+1]
+ ext v3.8b, v2.8b, v2.8b, #1
+ usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
+ usubl v7.8h, v3.8b, v2.8b
+ ushll v16.8h, v0.8b, #6 // top[base]*64
+ ushll v17.8h, v2.8b, #6
+ mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac
+ mla v17.4h, v7.4h, v5.4h
+ rshrn v16.8b, v16.8h, #6 // (sum + 32) >> 6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2 // two rows done per iteration
+ st1 {v17.s}[0], [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.s}[0], [x0], x1 // remaining rows are pure padding
+ subs w4, w4, #2
+ st1 {v31.s}[0], [x0], x1
+ b.gt 49b
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8b, w9 // frac
+ dup v5.8b, w11
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8b, w9 // 64 - frac
+ dup v7.8b, w11
+ ext v1.16b, v0.16b, v0.16b, #1 // top[base+1]
+ ext v3.16b, v2.16b, v2.16b, #1
+ umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
+ umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac
+ umull v17.8h, v2.8b, v7.8b
+ umlal v17.8h, v3.8b, v5.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8b}, [x0], x1 // remaining rows are pure padding
+ subs w4, w4, #2
+ st1 {v31.8b}, [x0], x1
+ b.gt 89b
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ mov w12, w3 // keep the original width for resetting w3 per row pair
+
+ add x13, x0, x1 // x13 = second row of each pair
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw // x1 = 2*stride - width: step from row end to next pair
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 169f
+ add x8, x2, w8, uxtw
+ add x10, x2, w10, uxtw
+ dup v4.16b, w9 // frac
+ dup v5.16b, w11
+ ld1 {v0.16b, v1.16b}, [x8], #32 // top[base]
+ ld1 {v2.16b, v3.16b}, [x10], #32
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.16b, w9 // 64 - frac
+ dup v7.16b, w11
+ add w7, w7, w5 // xpos += dx
+2:
+ ext v16.16b, v0.16b, v1.16b, #1 // top[base+1]
+ ext v17.16b, v2.16b, v3.16b, #1
+ subs w3, w3, #16 // 16 columns done; flags are consumed by b.le 3f below
+ umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
+ umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac
+ umull2 v19.8h, v0.16b, v6.16b
+ umlal2 v19.8h, v16.16b, v4.16b
+ umull v20.8h, v2.8b, v7.8b
+ umlal v20.8h, v17.8b, v5.8b
+ umull2 v21.8h, v2.16b, v7.16b
+ umlal2 v21.8h, v17.16b, v5.16b
+ rshrn v16.8b, v18.8h, #6
+ rshrn2 v16.16b, v19.8h, #6
+ rshrn v17.8b, v20.8h, #6
+ rshrn2 v17.16b, v21.8h, #6
+ st1 {v16.16b}, [x0], #16
+ st1 {v17.16b}, [x13], #16
+ b.le 3f
+ mov v0.16b, v1.16b // slide the window by 16 columns
+ ld1 {v1.16b}, [x8], #16 // top[base]
+ mov v2.16b, v3.16b
+ ld1 {v3.16b}, [x10], #16
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12 // reset the column counter
+ b 1b
+9:
+ ret
+
+169:
+ st1 {v31.16b}, [x0], #16 // pure padding for this and all remaining rows
+ subs w3, w3, #16
+ st1 {v31.16b}, [x13], #16
+ b.gt 169b
+ subs w4, w4, #2
+ b.le 9b
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 169b
+
+L(ipred_z1_fill1_tbl):
+ .hword L(ipred_z1_fill1_tbl) - 640b
+ .hword L(ipred_z1_fill1_tbl) - 320b
+ .hword L(ipred_z1_fill1_tbl) - 160b
+ .hword L(ipred_z1_fill1_tbl) - 80b
+ .hword L(ipred_z1_fill1_tbl) - 40b
+endfunc
+// ipred_z1_fill2_8bpc_neon: x0 = dst, x1 = stride, x2 = top, w3 = width (8 selects the 8-wide loop, anything else the 4-wide one), w4 = height, w5 = dx, w6 = max_base_x. Output pixel i of a row blends top[base + 2*i] and top[base + 2*i + 1].
+function ipred_z1_fill2_8bpc_neon, export=1
+ cmp w3, #8 // flags are consumed by b.eq 8f below
+ add x10, x2, w6, uxtw // top[max_base_x]
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5 // xpos = dx
+ mov w15, #64
+ b.eq 8f
+
+4: // w == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f // only the first row of the pair is tested
+ ldr d0, [x2, w8, uxtw] // top[base]
+ ldr d2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8b, v0.8b, v0.8b // top[base+1]
+ uzp1 v0.8b, v0.8b, v0.8b // top[base]
+ uzp2 v3.8b, v2.8b, v2.8b
+ uzp1 v2.8b, v2.8b, v2.8b
+ usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
+ usubl v7.8h, v3.8b, v2.8b
+ ushll v16.8h, v0.8b, #6 // top[base]*64
+ ushll v17.8h, v2.8b, #6
+ mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac
+ mla v17.4h, v7.4h, v5.4h
+ rshrn v16.8b, v16.8h, #6 // (sum + 32) >> 6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2 // two rows done per iteration
+ st1 {v17.s}[0], [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.s}[0], [x0], x1 // remaining rows are pure padding
+ subs w4, w4, #2
+ st1 {v31.s}[0], [x0], x1
+ b.gt 49b
+ ret
+
+8: // w == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8b, w9 // frac
+ dup v5.8b, w11
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8b, w9 // 64 - frac
+ dup v7.8b, w11
+ uzp2 v1.16b, v0.16b, v0.16b // top[base+1]
+ uzp1 v0.16b, v0.16b, v0.16b // top[base]
+ uzp2 v3.16b, v2.16b, v2.16b
+ uzp1 v2.16b, v2.16b, v2.16b
+ umull v16.8h, v1.8b, v4.8b // top[base+1]*frac
+ umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
+ umull v17.8h, v3.8b, v5.8b
+ umlal v17.8h, v2.8b, v7.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8b}, [x0], x1 // remaining rows are pure padding
+ subs w4, w4, #2
+ st1 {v31.8b}, [x0], x1
+ b.gt 89b
+ ret
+endfunc
+
+// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, const int n);
+// Writes dst[i] = src[-1 - i] (the bytes preceding src, in reverse order), 16 per iteration; n is effectively rounded up to a multiple of 16 and one iteration always runs.
+function ipred_reverse_8bpc_neon, export=1
+ sub x1, x1, #16 // first load covers src[-16..-1]
+ add x3, x0, #8 // x3 = second 8-byte half of each output block
+ mov x4, #16 // post-increment for both store pointers
+1:
+ ld1 {v0.16b}, [x1]
+ subs w2, w2, #16 // n -= 16; flags are consumed by b.gt below
+ rev64 v0.16b, v0.16b // reverse the bytes inside each 64-bit half
+ sub x1, x1, #16
+ st1 {v0.d}[1], [x0], x4 // high half goes first, completing the 16-byte reversal
+ st1 {v0.d}[0], [x3], x4
+ b.gt 1b
+ ret
+endfunc
+
+const increments
+ .short 0, 1, 2, 3, 4, 5, 6, 7 // lane indices 0-7 as 16-bit values
+ .short 8, 9, 10, 11, 12, 13, 14, 15 // lane indices 8-15, loaded from increments + 16 bytes
+endconst
+
+// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top, const pixel *const left,
+// const int width, const int height, const int dx, const int dy);
+// Per pixel: use the top[] interpolation when its base_x >= 0, otherwise the left[] interpolation.
+// Dispatches on width through L(ipred_z2_fill1_tbl); two rows are produced per iteration.
+function ipred_z2_fill1_8bpc_neon, export=1
+ clz w10, w4
+ adr x9, L(ipred_z2_fill1_tbl)
+ sub w10, w10, #25 // clz(64) = 25 -> index 0, ..., clz(4) = 29 -> index 4
+ ldrh w10, [x9, w10, uxtw #1]
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub x9, x9, w10, uxtw // table entries are (table - label) offsets
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+
+ br x9
+40:
+ AARCH64_VALID_JUMP_TARGET
+
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.16b, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3}
+
+ // Worst case height for w=4 is 16, but we need at least h+1 elements
+ ld1 {v0.16b, v1.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1
+ add v28.8b, v29.8b, v19.8b // base_y + 2
+
+ tbl v16.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2s, v27.2s, v27.2s // frac_y
+ trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y
+
+ movi v29.8b, #2
+4:
+ asr w9, w8, #6 // base_x
+ dup v6.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f // whole row comes from left[]; 49: finishes all remaining rows
+
+ dup v7.4h, w8 // xpos
+
+ ldr d2, [x2, w9, sxtw] // top[base_x]
+ ldr d4, [x2, w11, sxtw]
+
+ trn1 v6.2d, v6.2d, v7.2d // xpos
+
+ // Cut corners here; only doing tbl over v0 here; we only
+ // seem to need the last pixel, from v1, after skipping to the
+ // left-only codepath below.
+ tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ shrn v20.8b, v6.8h, #6 // first base_x for each row
+ xtn v6.8b, v6.8h // (uint8_t)xpos
+
+ ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
+ ext v5.8b, v4.8b, v4.8b, #1
+
+ and v6.8b, v6.8b, v25.8b // frac_x
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ trn1 v2.2s, v2.2s, v4.2s // top[base_x]
+ trn1 v3.2s, v3.2s, v5.2s // top[base_x+1]
+
+ sub v7.8b, v26.8b, v6.8b // 64 - frac_x
+
+ add v20.8b, v20.8b, v31.8b // actual base_x
+
+ umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+
+ umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
+
+ cmge v20.8b, v20.8b, #0 // mask: lanes whose base_x >= 0
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v22.8b, v22.8h, #6
+
+ bit v16.8b, v22.8b, v20.8b // take the top[] result where the mask is set
+
+ st1 {v16.s}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4 // next left[base_y] = this pair's left[base_y+2]
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 4b
+
+49:
+ tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+ rshrn v18.8b, v18.8h, #6
+
+ st1 {v18.s}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v18.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 49b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+
+ dup v30.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.16b, #0x3e
+ add v30.8h, v16.8h, v30.8h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+
+ // Worst case height for w=8 is 32, but we need at least h+1 elements
+ ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ add v30.8b, v29.8b, v19.8b // base_y + 2
+ add v29.8b, v29.8b, v17.8b // base_y + 1
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
+
+ movi v24.8b, #2 // 2
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 89f // whole row comes from left[]; 89: finishes all remaining rows
+
+ dup v17.8h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ // Cut corners here; only doing tbl over v0-v1 here; we only
+ // seem to need the last pixel, from v2, after skipping to the
+ // left-only codepath below.
+ tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn2 v21.16b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn2 v16.16b, v17.8h
+
+ tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
+
+ ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #1
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v7.16b, v26.16b, v16.16b // 64 - frac_x
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v23.8h, v4.16b, v7.16b
+ umlal2 v23.8h, v5.16b, v16.16b
+
+ cmge v21.16b, v21.16b, #0 // mask: lanes whose base_x >= 0
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+ rshrn v22.8b, v22.8h, #6
+ rshrn2 v22.16b, v23.8h, #6
+
+ bit v6.16b, v22.16b, v21.16b // take the top[] result where the mask is set
+
+ st1 {v6.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b // next left[base_y] = this pair's left[base_y+2]
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 8b
+
+89:
+ tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
+ tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+
+ st1 {v6.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 89b
+
+9:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]! // v8-v15 are used below; save d8-d15 (restored at 9:)
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ add x11, x11, #16 // increments
+
+ dup v18.8h, w7 // -dy
+ movi v17.16b, #1
+ add x3, x3, #1 // Skip past left[0]
+
+ ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy
+ movi v25.16b, #0x3e
+ add v16.8h, v16.8h, v18.8h // -= dy
+ add v18.8h, v19.8h, v18.8h
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+ xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15}
+
+ // Worst case height is 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
+ ld1r {v15.16b}, [x2] // left[0] == top[0]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v16.8h // (uint8_t)ypos
+ xtn2 v27.16b, v18.8h
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ shrn2 v29.16b, v18.8h, #6
+ mov v18.16b, v15.16b // left[0]
+ and v27.16b, v27.16b, v25.16b // frac_y
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbx v18.16b, {v0.16b}, v29.16b // left[base_y]
+
+ add v30.16b, v29.16b, v19.16b // base_y + 2
+ add v29.16b, v29.16b, v17.16b // base_y + 1
+
+ sub v28.16b, v26.16b, v27.16b // 64 - frac_y
+
+ movi v24.16b, #2 // 2
+16:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 169f // whole row comes from left[]; 169: finishes all remaining rows
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw
+ add x11, x2, w11, sxtw
+
+ ld1 {v4.16b, v5.16b}, [x9] // top[base_x]
+ mov v19.16b, v15.16b // left[0]
+ ld1 {v6.16b, v7.16b}, [x11]
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+
+ mov v20.16b, v15.16b // left[0]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn v22.8b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn v17.8b, v17.8h
+
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ trn1 v21.2d, v21.2d, v21.2d // first base_x
+ trn1 v22.2d, v22.2d, v22.2d
+ trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos
+ trn1 v17.2d, v17.2d, v17.2d
+
+ ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1]
+ ext v7.16b, v6.16b, v7.16b, #1
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+
+ sub v8.16b, v26.16b, v16.16b // 64 - frac_x
+ sub v9.16b, v26.16b, v17.16b
+
+ umull2 v11.8h, v18.16b, v28.16b
+ umlal2 v11.8h, v19.16b, v27.16b
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+ add v22.16b, v22.16b, v31.16b
+
+ umull v12.8h, v19.8b, v28.8b
+ umlal v12.8h, v20.8b, v27.8b
+ umull2 v13.8h, v19.16b, v28.16b
+ umlal2 v13.8h, v20.16b, v27.16b
+
+ rshrn v10.8b, v10.8h, #6
+ rshrn2 v10.16b, v11.8h, #6
+ rshrn v11.8b, v12.8h, #6
+ rshrn2 v11.16b, v13.8h, #6
+
+ umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
+ umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v13.8h, v4.16b, v8.16b
+ umlal2 v13.8h, v5.16b, v16.16b
+ umull v14.8h, v6.8b, v9.8b
+ umlal v14.8h, v7.8b, v17.8b
+ umull2 v18.8h, v6.16b, v9.16b
+ umlal2 v18.8h, v7.16b, v17.16b
+
+ cmge v21.16b, v21.16b, #0 // mask: lanes whose base_x >= 0
+ cmge v22.16b, v22.16b, #0
+
+ rshrn v12.8b, v12.8h, #6
+ rshrn2 v12.16b, v13.8h, #6
+ rshrn v13.8b, v14.8h, #6
+ rshrn2 v13.16b, v18.8h, #6
+
+ bit v10.16b, v12.16b, v21.16b // take the top[] result where the mask is set
+ bit v11.16b, v13.16b, v22.16b
+
+ st1 {v10.16b}, [x0], x1
+ subs w5, w5, #2
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.16b}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b // next left[base_y] = this pair's left[base_y+2]
+ add v29.16b, v29.16b, v24.16b // base_y += 2
+ add v30.16b, v30.16b, v24.16b // base_y += 2
+ b 16b
+
+169:
+ mov v19.16b, v15.16b // tbx keeps this left[0] default for out-of-range indices
+ mov v20.16b, v15.16b
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v5.8h, v18.16b, v28.16b
+ umlal2 v5.8h, v19.16b, v27.16b
+ umull v6.8h, v19.8b, v28.8b
+ umlal v6.8h, v20.8b, v27.8b
+ umull2 v7.8h, v19.16b, v28.16b
+ umlal2 v7.8h, v20.16b, v27.16b
+
+ rshrn v4.8b, v4.8h, #6
+ rshrn2 v4.16b, v5.8h, #6
+ rshrn v5.8b, v6.8h, #6
+ rshrn2 v5.16b, v7.8h, #6
+
+ st1 {v4.16b}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.16b}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2
+ add v30.16b, v30.16b, v24.16b // base_y += 2
+ b 169b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]! // v8-v15 are used below; save d8-d15 (restored at 9:)
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ add x11, x11, #16 // increments
+
+ dup v25.8h, w7 // -dy
+ add x3, x3, #1 // Skip past left[0]
+
+ ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}
+
+ add x13, x0, x1 // alternating row
+ lsl x1, x1, #1 // stride *= 2
+ sub x1, x1, w4, uxtw // stride -= width
+
+ movi v11.8h, #8
+ mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
+ add v26.8h, v26.8h, v25.8h // -= dy
+ mul v25.8h, v25.8h, v11.8h // -8*dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+ xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15}
+
+ // Worst case height is 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
+ ld1r {v15.16b}, [x2] // left[0] == top[0]
+
+ mov w12, w4 // orig w
+ neg w14, w4 // -w
+
+1:
+ mov v23.16b, v26.16b // reset ypos
+
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, w14 // base_x <= -w
+ asr w11, w8, #6 // base_x
+ b.le 329f
+
+ dup v17.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+
+ add x9, x2, w9, sxtw
+ add x11, x2, w11, sxtw
+
+ sqshrn v21.8b, v16.8h, #6 // first base_x (saturating, unlike the shrn in the narrower paths)
+ sqshrn v22.8b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn v17.8b, v17.8h
+
+ ld1 {v4.16b}, [x9], #16 // top[base_x]
+ ld1 {v6.16b}, [x11], #16
+
+ trn1 v21.2d, v21.2d, v21.2d // first base_x
+ trn1 v22.2d, v22.2d, v22.2d
+ trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos
+ trn1 v17.2d, v17.2d, v17.2d
+
+ movi v10.16b, #0x3e
+ movi v11.16b, #64
+
+ and v16.16b, v16.16b, v10.16b // frac_x
+ and v17.16b, v17.16b, v10.16b
+
+ sub v8.16b, v11.16b, v16.16b // 64 - frac_x
+ sub v9.16b, v11.16b, v17.16b
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+ add v22.16b, v22.16b, v31.16b
+
+2:
+ add v13.8h, v23.8h, v25.8h // ypos -= 8*dy
+ movi v12.16b, #64
+ movi v20.16b, #2
+ movi v10.16b, #0x3e
+
+ smov w10, v22.b[0] // base_x of the second row's first pixel in this 16-wide chunk
+
+ xtn v27.8b, v23.8h // (uint8_t)ypos
+ xtn2 v27.16b, v13.8h
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ shrn2 v29.16b, v13.8h, #6
+ cmp w10, #0 // base_x (bottom left) >= 0
+ and v27.16b, v27.16b, v10.16b // frac_y
+
+ mov v18.16b, v15.16b // left[0]
+
+ b.ge 4f
+
+ add v23.8h, v13.8h, v25.8h // ypos -= 8*dy
+ movi v13.16b, #1
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+ add v29.16b, v29.16b, v13.16b // base_y + 1
+ mov v19.16b, v15.16b // left[0]
+
+ sub v28.16b, v12.16b, v27.16b // 64 - frac_y
+
+ ld1 {v5.16b}, [x9], #16 // top[base_x]
+ ld1 {v7.16b}, [x11], #16
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ add v29.16b, v29.16b, v13.16b // base_y + 2
+
+ mov v20.16b, v15.16b // left[0]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+
+ umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v11.8h, v18.16b, v28.16b
+ umlal2 v11.8h, v19.16b, v27.16b
+ umull v12.8h, v19.8b, v28.8b
+ umlal v12.8h, v20.8b, v27.8b
+ umull2 v13.8h, v19.16b, v28.16b
+ umlal2 v13.8h, v20.16b, v27.16b
+
+ ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #1
+
+ rshrn v10.8b, v10.8h, #6
+ rshrn2 v10.16b, v11.8h, #6
+ rshrn v11.8b, v12.8h, #6
+ rshrn2 v11.16b, v13.8h, #6
+
+ umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
+ umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v13.8h, v4.16b, v8.16b
+ umlal2 v13.8h, v18.16b, v16.16b
+ umull v14.8h, v6.8b, v9.8b
+ umlal v14.8h, v19.8b, v17.8b
+ umull2 v20.8h, v6.16b, v9.16b
+ umlal2 v20.8h, v19.16b, v17.16b
+
+ cmge v18.16b, v21.16b, #0 // mask: lanes whose base_x >= 0
+ cmge v19.16b, v22.16b, #0
+
+ rshrn v12.8b, v12.8h, #6
+ rshrn2 v12.16b, v13.8h, #6
+ rshrn v13.8b, v14.8h, #6
+ rshrn2 v13.16b, v20.8h, #6
+
+ bit v10.16b, v12.16b, v18.16b // take the top[] result where the mask is set
+ bit v11.16b, v13.16b, v19.16b
+
+ st1 {v10.16b}, [x0], #16
+ subs w4, w4, #16
+ st1 {v11.16b}, [x13], #16
+ b.le 3f
+
+ movi v10.16b, #16
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ add v21.16b, v21.16b, v10.16b // base_x += 16
+ add v22.16b, v22.16b, v10.16b
+ b 2b
+
+3:
+ subs w5, w5, #2
+ b.le 9f
+ movi v10.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w4, w12 // reset w
+ add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
+ b 1b
+
+4: // The rest of the row only predicted from top[]
+ ld1 {v5.16b}, [x9], #16 // top[base_x]
+ ld1 {v7.16b}, [x11], #16
+
+ ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #1
+
+ umull v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
+ umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v13.8h, v4.16b, v8.16b
+ umlal2 v13.8h, v18.16b, v16.16b
+ umull v14.8h, v6.8b, v9.8b
+ umlal v14.8h, v19.8b, v17.8b
+ umull2 v20.8h, v6.16b, v9.16b
+ umlal2 v20.8h, v19.16b, v17.16b
+
+ rshrn v12.8b, v12.8h, #6
+ rshrn2 v12.16b, v13.8h, #6
+ rshrn v13.8b, v14.8h, #6
+ rshrn2 v13.16b, v20.8h, #6
+
+ st1 {v12.16b}, [x0], #16
+ subs w4, w4, #16
+ st1 {v13.16b}, [x13], #16
+ b.le 3b
+
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ b 4b
+
+329: // The rest of the block only predicted from left[]
+ add x1, x1, w4, uxtw // restore stride
+ mov w12, w5 // orig remaining h
+1:
+ add v13.8h, v23.8h, v25.8h // ypos -= 8*dy
+ movi v12.16b, #64
+ movi v10.16b, #0x3e
+
+ xtn v27.8b, v23.8h // (uint8_t)ypos
+ xtn2 v27.16b, v13.8h
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ shrn2 v29.16b, v13.8h, #6
+ and v27.16b, v27.16b, v10.16b // frac_y
+
+ mov v18.16b, v15.16b // left[0]
+ add v23.8h, v13.8h, v25.8h // ypos -= 8*dy
+ movi v21.16b, #1
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+ add v29.16b, v29.16b, v21.16b // base_y + 1
+
+ sub v28.16b, v12.16b, v27.16b // 64 - frac_y
+2:
+ mov v19.16b, v15.16b // left[0]
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ add v29.16b, v29.16b, v21.16b // base_y + 2
+ mov v20.16b, v15.16b // left[0]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+ add v29.16b, v29.16b, v21.16b // next base_y
+
+ umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v11.8h, v18.16b, v28.16b
+ umlal2 v11.8h, v19.16b, v27.16b
+ umull v12.8h, v19.8b, v28.8b
+ umlal v12.8h, v20.8b, v27.8b
+ umull2 v13.8h, v19.16b, v28.16b
+ umlal2 v13.8h, v20.16b, v27.16b
+
+ rshrn v10.8b, v10.8h, #6
+ rshrn2 v10.16b, v11.8h, #6
+ rshrn v11.8b, v12.8h, #6
+ rshrn2 v11.16b, v13.8h, #6
+
+ st1 {v10.16b}, [x0], x1
+ subs w5, w5, #2
+ st1 {v11.16b}, [x13], x1
+ b.le 3f
+ mov v18.16b, v20.16b
+ b 2b
+
+3:
+ subs w4, w4, #16 // one 16-wide column strip finished
+ b.le 9f
+
+ lsr x1, x1, #1 // back to the single-row stride for the rewind
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w5, w12 // reset h
+ b 1b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+L(ipred_z2_fill1_tbl):
+ .hword L(ipred_z2_fill1_tbl) - 640b
+ .hword L(ipred_z2_fill1_tbl) - 320b
+ .hword L(ipred_z2_fill1_tbl) - 160b
+ .hword L(ipred_z2_fill1_tbl) - 80b
+ .hword L(ipred_z2_fill1_tbl) - 40b
+endfunc
+// ipred_z2_fill2_8bpc_neon: x0 = dst, x1 = stride, x2 = top, x3 = left, w4 = width (8 selects the 8-wide path, anything else the 4-wide one), w5 = height, w6 = dx, w7 = dy. Output pixel i takes its top[] samples from top[base_x + 2*i] and top[base_x + 2*i + 1]; pixels whose base_x is negative use left[] instead.
+function ipred_z2_fill2_8bpc_neon, export=1
+ cmp w4, #8 // flags are consumed by b.eq 80f below
+ mov w8, #(2 << 6) // xpos = 2 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40:
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.16b, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3}
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1
+ add v28.8b, v29.8b, v19.8b // base_y + 2
+
+ tbl v16.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2s, v27.2s, v27.2s // frac_y
+ trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y
+
+ movi v29.8b, #2
+ add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6}
+4:
+ asr w9, w8, #6 // base_x
+ dup v6.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 49f // whole row comes from left[]; 49: finishes all remaining rows
+
+ dup v7.4h, w8 // xpos
+
+ ldr d2, [x2, w9, sxtw] // top[base_x]
+ ldr d4, [x2, w11, sxtw]
+
+ trn1 v6.2d, v6.2d, v7.2d // xpos
+
+ tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ shrn v20.8b, v6.8h, #6 // first base_x for each row
+ xtn v6.8b, v6.8h // (uint8_t)xpos
+
+ uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1]
+ uzp1 v2.8b, v2.8b, v4.8b // top[base_x]
+
+ and v6.8b, v6.8b, v25.8b // frac_x
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ sub v7.8b, v26.8b, v6.8b // 64 - frac_x
+
+ add v20.8b, v20.8b, v31.8b // actual base_x
+
+ umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+
+ umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
+
+ cmge v20.8b, v20.8b, #0 // mask: lanes whose base_x >= 0
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v22.8b, v22.8h, #6
+
+ bit v16.8b, v22.8b, v20.8b // take the top[] result where the mask is set
+
+ st1 {v16.s}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4 // next left[base_y] = this pair's left[base_y+2]
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 4b
+
+49:
+ tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
+
+ trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]
+
+ umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+ rshrn v18.8b, v18.8h, #6
+
+ st1 {v18.s}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v18.s}[1], [x0], x1
+ b.le 9f
+
+ ext v16.8b, v17.8b, v17.8b, #4
+ add v30.8b, v30.8b, v29.8b // base_y += 2
+ b 49b
+
+9:
+ ret
+
+80:
+ dup v30.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.16b, #0x3e
+ add v30.8h, v16.8h, v30.8h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ add v30.8b, v29.8b, v19.8b // base_y + 2
+ add v29.8b, v29.8b, v17.8b // base_y + 1
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
+
+ movi v24.8b, #2 // 2
+ add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f // whole row comes from left[]; 89: finishes all remaining rows
+
+ dup v17.8h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn2 v21.16b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn2 v16.16b, v17.8h
+
+ tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2]
+
+ uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1]
+ uzp1 v4.16b, v4.16b, v6.16b // top[base_x]
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ sub v7.16b, v26.16b, v16.16b // 64 - frac_x
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v23.8h, v4.16b, v7.16b
+ umlal2 v23.8h, v5.16b, v16.16b
+
+ cmge v21.16b, v21.16b, #0 // mask: lanes whose base_x >= 0
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+ rshrn v22.8b, v22.8h, #6
+ rshrn2 v22.16b, v23.8h, #6
+
+ bit v6.16b, v22.16b, v21.16b // take the top[] result where the mask is set
+
+ st1 {v6.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b // next left[base_y] = this pair's left[base_y+2]
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 8b
+
+89:
+ tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1]
+ tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2]
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull v17.8h, v19.8b, v28.8b
+ umlal v17.8h, v20.8b, v27.8b
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+
+ st1 {v6.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ mov v18.8b, v20.8b
+ add v29.8b, v29.8b, v24.8b // base_y += 2
+ add v30.8b, v30.8b, v24.8b // base_y += 2
+ b 89b
+
+9:
+ ret
+endfunc
+
+// Z2 directional prediction, variant used when left[] is upsampled (see the
+// "upsample_left" notes below): every output row advances base_y by two
+// entries of left[]. Only the w == 4 (40:) and w == 8 (80:) paths exist.
+// Register usage as read from the body:
+//   x0 = dst, x1 = stride, x2 = top, x3 = left,
+//   w4 = width, w5 = height, w6 = dx, w7 = dy
+// Two rows are produced per loop iteration; lanes whose base_x is negative
+// keep the left[] interpolation, the others take the top[] interpolation.
+function ipred_z2_fill3_8bpc_neon, export=1
+ cmp w4, #8 // width == 8? (flags are consumed by the b.eq below)
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40:
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.16b, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3}
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.16b, v1.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1
+ add v28.8b, v29.8b, v19.8b // base_y + 2
+
+ trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}
+
+ add v24.8b, v30.8b, v19.8b // base_y + 3
+
+ // Pack the indices for two rows into one 8 byte vector: the low half
+ // addresses the first row, the high half the second row.
+ trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2
+ trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ trn1 v27.2s, v27.2s, v27.2s // frac_y
+ trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y
+
+ movi v24.8b, #4
+4:
+ asr w9, w8, #6 // base_x
+ dup v6.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f // no lane of this row pair uses top[]
+
+ dup v7.4h, w8 // xpos
+
+ ldr d2, [x2, w9, sxtw] // top[base_x]
+ ldr d4, [x2, w11, sxtw]
+
+ trn1 v6.2d, v6.2d, v7.2d // xpos
+
+ tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
+ tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
+
+ shrn v20.8b, v6.8h, #6 // first base_x for each row
+ xtn v6.8b, v6.8h // (uint8_t)xpos
+
+ ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
+ ext v5.8b, v4.8b, v4.8b, #1
+
+ and v6.8b, v6.8b, v25.8b // frac_x
+
+ trn1 v2.2s, v2.2s, v4.2s // top[base_x]
+ trn1 v3.2s, v3.2s, v5.2s // top[base_x+1]
+
+ sub v7.8b, v26.8b, v6.8b // 64 - frac_x
+
+ add v20.8b, v20.8b, v31.8b // actual base_x
+
+ umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+
+ umull v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
+
+ cmge v20.8b, v20.8b, #0 // mask: lanes with base_x >= 0
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v22.8b, v22.8h, #6
+
+ bit v16.8b, v22.8b, v20.8b // take the top[] result where base_x >= 0
+
+ st1 {v16.s}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ add v29.8b, v29.8b, v24.8b // base_y += 4
+ add v30.8b, v30.8b, v24.8b // base_y += 4
+ b 4b
+
+49:
+ // Left-only loop: entered once base_x <= -4; it never branches back
+ // to 4b, so the remaining rows are produced from left[] alone.
+ tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
+ tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
+
+ umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
+ rshrn v18.8b, v18.8h, #6
+
+ st1 {v18.s}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v18.s}[1], [x0], x1
+ b.le 9f
+
+ add v29.8b, v29.8b, v24.8b // base_y += 4
+ add v30.8b, v30.8b, v24.8b // base_y += 4
+ b 49b
+
+9:
+ ret
+
+80:
+ dup v30.8h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.16b, #0x3e
+ add v30.8h, v16.8h, v30.8h // -= dy
+
+ xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[]
+
+ movi v26.16b, #64
+ movi v19.16b, #2
+
+ xtn v27.8b, v30.8h // (uint8_t)ypos
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v27.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
+
+ add v28.8b, v29.8b, v17.8b // base_y + 1
+ add v30.8b, v29.8b, v19.8b // base_y + 2
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
+ add v24.8b, v28.8b, v19.8b // base_y + 3
+
+ // Pack the indices for two rows into one 16 byte vector: the low half
+ // addresses the first row, the high half the second row.
+ trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2
+ trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3
+
+ sub v28.8b, v26.8b, v27.8b // 64 - frac_y
+
+ movi v24.16b, #4
+
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 89f // no lane of this row pair uses top[]
+
+ dup v17.8h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ shrn v21.8b, v16.8h, #6 // first base_x
+ shrn2 v21.16b, v17.8h, #6
+ xtn v16.8b, v16.8h // (uint8_t)xpos
+ xtn2 v16.16b, v17.8h
+
+ ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #1
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v7.16b, v26.16b, v16.16b // 64 - frac_x
+
+ add v21.16b, v21.16b, v31.16b // actual base_x
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v17.8h, v18.16b, v28.16b
+ umlal2 v17.8h, v19.16b, v27.16b
+
+ umull v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
+ umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
+ umull2 v23.8h, v4.16b, v7.16b
+ umlal2 v23.8h, v5.16b, v16.16b
+
+ cmge v21.16b, v21.16b, #0 // mask: lanes with base_x >= 0
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+ rshrn v22.8b, v22.8h, #6
+ rshrn2 v22.16b, v23.8h, #6
+
+ bit v6.16b, v22.16b, v21.16b // take the top[] result where base_x >= 0
+
+ st1 {v6.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4
+ add v30.16b, v30.16b, v24.16b // base_y += 4
+ b 8b
+
+89:
+ // Left-only loop: entered once base_x <= -8; it never branches back
+ // to 8b, so the remaining rows are produced from left[] alone.
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
+ umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
+ umull2 v17.8h, v18.16b, v28.16b
+ umlal2 v17.8h, v19.16b, v27.16b
+
+ rshrn v6.8b, v6.8h, #6
+ rshrn2 v6.16b, v17.8h, #6
+
+ st1 {v6.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v6.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4
+ add v30.16b, v30.16b, v24.16b // base_y += 4
+ b 89b
+
+9:
+ ret
+endfunc
+
+
+// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const left,
+// const int width, const int height,
+// const int dy, const int max_base_y);
+//
+// Dispatch: when max_base_y <= 64 the width selects one of the
+// 40:/80:/160:/320:/640: paths through the jump table at the end of the
+// function (index = clz(width) - 25). Those paths keep left[] in v0-v3 and
+// index it with tbx. Larger max_base_y values take
+// L(ipred_z3_fill1_large_h16) instead.
+// v31 holds left[max_base_y] in every byte; it is copied into the tbx
+// destination first, so indices outside the loaded table yield the padding
+// pixel.
+function ipred_z3_fill1_8bpc_neon, export=1
+ cmp w6, #64 // max_base_y > 64? (flags consumed by b.gt below)
+ clz w9, w3
+ adr x8, L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25 // table index: 0 for w=64 ... 4 for w=4
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw // left[max_base_y]
+ sub x8, x8, w9, uxtw // table entries are (table - target) offsets
+ movrel x11, increments
+ ld1r {v31.16b}, [x10] // padding
+ ld1 {v30.8h}, [x11] // increments
+ mov w7, w5 // ypos = dy (scalar copy used by the large_h16 path)
+ b.gt L(ipred_z3_fill1_large_h16)
+ br x8
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ dup v29.4h, w5 // dy
+
+ mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.4h, v29.4h, v30.4h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ mov v4.8b, v31.8b
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+
+ tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]
+
+ trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2
+ trn1 v24.2s, v24.2s, v24.2s // frac
+ trn1 v25.2s, v25.2s, v25.2s // 64 - frac
+1:
+ // Two rows per iteration: low 4 bytes are row y, high 4 bytes row y+1.
+ mov v5.8b, v31.8b
+ tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]
+
+ trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ rshrn v16.8b, v16.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v4.8b, v5.8b, v5.8b, #4 // left[base+2] becomes the next left[base]
+ uqadd v27.8b, v27.8b, v21.8b // base += 2
+ b 1b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ dup v29.8h, w5 // dy
+
+ mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
+ ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[]
+ add v30.8h, v29.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ mov v4.8b, v31.8b
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+
+ tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
+1:
+ mov v5.8b, v31.8b
+ mov v6.8b, v31.8b
+ tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
+ tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull v17.8h, v5.8b, v25.8b
+ umlal v17.8h, v6.8b, v24.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.le 9f
+
+ mov v4.8b, v6.8b // left[base+2] becomes the next left[base]
+ uqadd v27.8b, v27.8b, v21.8b // base += 2
+ uqadd v28.8b, v28.8b, v21.8b // base += 2
+ b 1b
+
+9:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w5 // dy
+
+ shl v29.8h, v28.8h, #3 // 8*dy
+ mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // This is only executed if we've checked that max_base_y <= 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
+ add v28.8h, v28.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ add v29.8h, v28.8h, v29.8h // ypos + 8*dy
+
+ xtn v24.8b, v28.8h // (uint8_t)ypos
+ xtn2 v24.16b, v29.8h
+ uqshrn v26.8b, v28.8h, #6 // base
+ uqshrn2 v26.16b, v29.8h, #6
+ and v24.16b, v24.16b, v23.16b // frac
+
+ mov v4.16b, v31.16b
+ uqadd v27.16b, v26.16b, v20.16b // base + 1
+ uqadd v28.16b, v26.16b, v21.16b // base + 2
+ sub v25.16b, v22.16b, v24.16b // 64 - frac
+
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
+1:
+ mov v5.16b, v31.16b
+ mov v6.16b, v31.16b
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
+ tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ umull v18.8h, v5.8b, v25.8b
+ umlal v18.8h, v6.8b, v24.8b
+ umull2 v19.8h, v5.16b, v25.16b
+ umlal2 v19.8h, v6.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn2 v16.16b, v17.8h, #6
+ rshrn v17.8b, v18.8h, #6
+ rshrn2 v17.16b, v19.8h, #6
+ st1 {v16.16b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.16b}, [x0], x1
+ b.le 9f
+
+ mov v4.16b, v6.16b // left[base+2] becomes the next left[base]
+ uqadd v27.16b, v27.16b, v21.16b // base += 2
+ uqadd v28.16b, v28.16b, v21.16b // base += 2
+ b 1b
+
+9:
+ ret
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w5 // dy
+ mov w12, w3 // keep the full width for resetting w3 per row pair
+
+ add x13, x0, x1 // x13 = second row of each row pair
+
+ shl v29.8h, v28.8h, #3 // 8*dy
+ mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw // x1 = 2*stride - width: end of row -> next row pair
+ add v30.8h, v28.8h, v30.8h // ypos
+
+ // This is only executed if we've checked that max_base_y <= 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+1:
+ mov v26.16b, v30.16b // reset ypos
+
+2:
+ // Inner loop: 16 columns of two rows per iteration.
+ add v27.8h, v26.8h, v29.8h // ypos + 8*dy
+ uqshrn v16.8b, v26.8h, #6 // base
+ uqshrn2 v16.16b, v27.8h, #6
+ xtn v24.8b, v26.8h // (uint8_t)ypos
+ xtn2 v24.16b, v27.8h
+ umov w14, v16.b[0] // base of the first column in this group
+ and v24.16b, v24.16b, v23.16b // frac
+
+ uqadd v17.16b, v16.16b, v20.16b // base + 1
+ cmp w14, w6 // base >= max_base_y
+ uqadd v18.16b, v16.16b, v21.16b // base + 2
+ sub v25.16b, v22.16b, v24.16b // 64 - frac
+
+ b.ge 4f // rest of this row pair is pure padding
+
+ mov v4.16b, v31.16b
+ mov v5.16b, v31.16b
+ mov v6.16b, v31.16b
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
+ tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]
+
+ subs w3, w3, #16
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ umull v18.8h, v5.8b, v25.8b
+ umlal v18.8h, v6.8b, v24.8b
+ umull2 v19.8h, v5.16b, v25.16b
+ umlal2 v19.8h, v6.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn2 v16.16b, v17.8h, #6
+ rshrn v17.8b, v18.8h, #6
+ rshrn2 v17.16b, v19.8h, #6
+ st1 {v16.16b}, [x0], #16
+ st1 {v17.16b}, [x13], #16
+ b.le 3f
+ add v26.8h, v27.8h, v29.8h // ypos += 16*dy
+ b 2b
+
+3:
+ // End of a row pair: advance to the next two rows.
+ subs w4, w4, #2
+ b.le 9f
+ movi v16.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2
+ mov w3, w12
+ b 1b
+
+4:
+ // Fill the remaining columns of both rows with the padding pixel.
+ subs w3, w3, #16
+ st1 {v31.16b}, [x0], #16
+ st1 {v31.16b}, [x13], #16
+ b.gt 4b
+ b 3b
+
+9:
+ ret
+
+L(ipred_z3_fill1_large_h16):
+ // Fallback case for max_base_y > 64; similar to the z1
+ // implementation. This does the filtering vertically, filling out
+ // a 2x pixel column at a time.
+ mov w15, #64
+ add x13, x0, x1 // x13 = second row
+ lsl x1, x1, #1 // step two rows per store
+
+ mov w12, w4 // keep the height for resetting w4 per column pair
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ // Tail call: the remaining columns only need padding; the callee
+ // returns to our caller.
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw
+ add x10, x2, w10, uxtw
+ dup v4.16b, w9 // frac
+ dup v5.16b, w11
+ ld1 {v0.16b, v1.16b}, [x8], #32 // left[base]
+ ld1 {v2.16b, v3.16b}, [x10], #32
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.16b, w9 // 64 - frac
+ dup v7.16b, w11
+ add w7, w7, w5 // ypos += dy
+2:
+ // 16 rows of the two current columns per iteration.
+ ext v16.16b, v0.16b, v1.16b, #1 // left[base+1]
+ ext v17.16b, v2.16b, v3.16b, #1
+ subs w4, w4, #16
+ umull v18.8h, v16.8b, v4.8b // left[base+1]*frac
+ umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac)
+ umull2 v19.8h, v16.16b, v4.16b
+ umlal2 v19.8h, v0.16b, v6.16b
+ umull v20.8h, v17.8b, v5.8b
+ umlal v20.8h, v2.8b, v7.8b
+ umull2 v21.8h, v17.16b, v5.16b
+ umlal2 v21.8h, v2.16b, v7.16b
+ rshrn v16.8b, v18.8h, #6
+ rshrn2 v16.16b, v19.8h, #6
+ rshrn v17.8b, v20.8h, #6
+ rshrn2 v17.16b, v21.8h, #6
+ // Interleave the two columns so each .h lane is one 2 pixel row.
+ zip1 v18.16b, v16.16b, v17.16b
+ zip2 v19.16b, v16.16b, v17.16b
+ st1 {v18.h}[0], [x0], x1
+ st1 {v18.h}[1], [x13], x1
+ st1 {v18.h}[2], [x0], x1
+ st1 {v18.h}[3], [x13], x1
+ st1 {v18.h}[4], [x0], x1
+ st1 {v18.h}[5], [x13], x1
+ st1 {v18.h}[6], [x0], x1
+ st1 {v18.h}[7], [x13], x1
+ st1 {v19.h}[0], [x0], x1
+ st1 {v19.h}[1], [x13], x1
+ st1 {v19.h}[2], [x0], x1
+ st1 {v19.h}[3], [x13], x1
+ st1 {v19.h}[4], [x0], x1
+ st1 {v19.h}[5], [x13], x1
+ st1 {v19.h}[6], [x0], x1
+ st1 {v19.h}[7], [x13], x1
+ b.le 3f
+ mov v0.16b, v1.16b
+ ld1 {v1.16b}, [x8], #16 // left[base]
+ mov v2.16b, v3.16b
+ ld1 {v3.16b}, [x10], #16
+ b 2b
+
+3:
+ // Column pair done: rewind to the top row and step two pixels right.
+ subs w3, w3, #2
+ b.le 9f
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #2
+ add x13, x13, #2
+ mov w4, w12
+ b 1b
+9:
+ ret
+
+// Jump table, ordered from the widest (w=64) to the narrowest (w=4) path.
+L(ipred_z3_fill1_tbl):
+ .hword L(ipred_z3_fill1_tbl) - 640b
+ .hword L(ipred_z3_fill1_tbl) - 320b
+ .hword L(ipred_z3_fill1_tbl) - 160b
+ .hword L(ipred_z3_fill1_tbl) - 80b
+ .hword L(ipred_z3_fill1_tbl) - 40b
+endfunc
+
+// Fill the remaining w3 x w4 rectangle of the destination with the padding
+// pixel that is replicated in every byte of v31.
+//
+// This is branched to (not called) from the large-height fallback of
+// ipred_z3_fill1_8bpc_neon, so it inherits that code's register state:
+//   x0  = pointer to the first row, x13 = pointer to the second row,
+//   x1  = 2*stride, w3 = remaining width, w4 = height,
+//   v31 = padding pixel.
+// For w3 <= 16 the rectangle is filled in vertical strips whose width is the
+// largest power of two not exceeding the remaining width (jump table index
+// = clz(w3) - 25; assumes w3 >= 2 - the table has no 1 pixel entry). Each
+// strip loop writes four rows per iteration (two via x0, two via x13).
+// For w3 > 16 the rectangle is filled row by row with 16 byte stores.
+function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #16
+ adr x8, L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+
+1:
+ // Fill a WxH rectangle with padding. W can be any number;
+ // this fills the exact width by filling in the largest
+ // power of two in the remaining width, and repeating.
+ clz w9, w3
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ sub x9, x8, w9, uxtw
+ br x9
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.h}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.h}[0], [x13], x1
+ st1 {v31.h}[0], [x0], x1
+ st1 {v31.h}[0], [x13], x1
+ b.gt 2b
+ subs w3, w3, #2
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #2
+ add x13, x13, #2
+ mov w4, w12
+ b 1b
+
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.s}[0], [x13], x1
+ st1 {v31.s}[0], [x0], x1
+ st1 {v31.s}[0], [x13], x1
+ b.gt 4b
+ subs w3, w3, #4
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8b}, [x13], x1
+ st1 {v31.8b}, [x0], x1
+ st1 {v31.8b}, [x13], x1
+ // Loop back to this strip's own 8 byte store loop. Branching to the
+ // 4 byte loop instead would finish the strip only 4 pixels wide and
+ // force extra passes over the same rows.
+ b.gt 8b
+ subs w3, w3, #8
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #8
+ add x13, x13, #8
+ mov w4, w12
+ b 1b
+
+16:
+32:
+64:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.16b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.16b}, [x13], x1
+ st1 {v31.16b}, [x0], x1
+ st1 {v31.16b}, [x13], x1
+ // Loop back to this strip's own 16 byte store loop (see the note in
+ // the 8: case above).
+ b.gt 16b
+ subs w3, w3, #16
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w4, w12
+ b 1b
+
+9:
+ ret
+
+// Jump table, ordered from the widest (64) to the narrowest (2) strip.
+L(ipred_z3_fill_padding_tbl):
+ .hword L(ipred_z3_fill_padding_tbl) - 64b
+ .hword L(ipred_z3_fill_padding_tbl) - 32b
+ .hword L(ipred_z3_fill_padding_tbl) - 16b
+ .hword L(ipred_z3_fill_padding_tbl) - 8b
+ .hword L(ipred_z3_fill_padding_tbl) - 4b
+ .hword L(ipred_z3_fill_padding_tbl) - 2b
+
+L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 16.
+ // Only x0 is used here, one row at a time; x1 becomes
+ // stride - width so that adding it at the end of a row moves to the
+ // start of the next one.
+ lsr x1, x1, #1
+ mov w12, w3
+ sub x1, x1, w3, uxtw
+1:
+ ands w5, w3, #15
+ b.eq 2f
+ // If the width isn't aligned to 16, first do one 16 byte write
+ // and align the start pointer.
+ sub w3, w3, w5
+ st1 {v31.16b}, [x0]
+ add x0, x0, w5, uxtw
+2:
+ // Fill the rest of the line with aligned 16 byte writes.
+ subs w3, w3, #16
+ st1 {v31.16b}, [x0], #16
+ b.gt 2b
+ subs w4, w4, #1
+ add x0, x0, x1
+ b.le 9f
+ mov w3, w12
+ b 1b
+9:
+ ret
+endfunc
+
+// Z3 directional prediction for the w == 4 (40:) and w == 8 (80:) cases in
+// which left[] is stepped by two entries per output row (base + 0/+1 for
+// the first row of a pair, base + 2/+3 for the second, base += 4 per pair).
+// Register usage as read from the body:
+//   x0 = dst, x1 = stride, x2 = left, w3 = width, w4 = height,
+//   w5 = dy, w6 = max_base_y
+// v31 holds left[max_base_y] in every byte. It is copied into the tbx
+// destinations before each lookup, so any index outside the 32 byte table
+// in v0-v1 yields the padding pixel.
+function ipred_z3_fill2_8bpc_neon, export=1
+ cmp w3, #8 // width == 8? (flags are consumed by the b.eq below)
+ add x10, x2, w6, uxtw // left[max_base_y]
+ movrel x11, increments
+ ld1r {v31.16b}, [x10] // padding
+ ld1 {v30.8h}, [x11] // increments
+ b.eq 80f
+
+40: // w == 4
+ dup v29.4h, w5 // dy
+
+ mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
+ // so max_base_y <= 32.
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.4h, v29.4h, v30.4h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+ uqadd v29.8b, v27.8b, v21.8b // base + 3
+
+ // Pack two rows per vector: low half = first row, high half = second.
+ trn1 v24.2s, v24.2s, v24.2s // frac
+ trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2
+ trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3
+ trn1 v25.2s, v25.2s, v25.2s // 64 - frac
+
+ movi v21.16b, #4
+1:
+ mov v4.8b, v31.8b
+ mov v5.8b, v31.8b
+ tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
+ tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ rshrn v16.8b, v16.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ uqadd v26.8b, v26.8b, v21.8b // base += 4
+ uqadd v27.8b, v27.8b, v21.8b // base += 4
+ b 1b
+
+9:
+ ret
+
+80: // w == 8
+ dup v29.8h, w5 // dy
+
+ mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
+ // so max_base_y <= 32.
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.8h, v29.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+ uqadd v29.8b, v27.8b, v21.8b // base + 3
+
+ // Pack two rows per vector: low half = first row, high half = second.
+ trn1 v24.2d, v24.2d, v24.2d // frac
+ trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2
+ trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3
+ trn1 v25.2d, v25.2d, v25.2d // 64 - frac
+
+ movi v21.16b, #4
+1:
+ mov v4.16b, v31.16b
+ mov v5.16b, v31.16b
+ // Only v0-v1 were loaded above, so the table must be exactly those
+ // two registers: indices >= 32 then keep the padding in v4/v5 rather
+ // than picking up whatever happens to be in v2/v3.
+ tbx v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
+ tbx v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.le 9f
+
+ uqadd v26.16b, v26.16b, v21.16b // base += 4
+ uqadd v27.16b, v27.16b, v21.16b // base += 4
+ b 1b
+
+9:
+ ret
+endfunc
+
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
+//
+// Filter intra prediction. The picture is produced in 4x2 blocks; each block
+// is a weighted sum of seven neighbours (p0 = topleft, p1-p4 = top[0-3],
+// p5-p6 = left[0-1]) using the seven 8-lane tap vectors held in v16-v22,
+// followed by a rounding shift by 4 with unsigned saturation.
+// Blocks to the right and below use already-predicted pixels as their
+// neighbours, hence the serial dependency chains below.
+// The width selects a path through the jump table at the end of the
+// function (index = clz(width) - 26: 0 for w=32 ... 3 for w=4).
+function ipred_filter_8bpc_neon, export=1
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6 // 64 bytes of taps per filter index
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 // taps for p0-p3
+ clz w9, w3
+ adr x5, L(ipred_filter_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6] // taps for p4-p6
+ sub w9, w9, #26
+ ldrh w9, [x5, w9, uxtw #1]
+ // Sign extend the 8 bit taps to 16 bit for the mul/mla below.
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1 // x6 = second row of each row pair
+ lsl x1, x1, #1 // step two rows per store
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ldur s0, [x2, #1] // top (0-3)
+ sub x2, x2, #2
+ mov x7, #-2 // walk the left edge two pixels at a time
+ uxtl v0.8h, v0.8b // top (0-3)
+4:
+ ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.8b, v2.8h, #4
+ subs w4, w4, #2
+ st1 {v2.s}[0], [x0], x1
+ uxtl v0.8h, v2.8b
+ st1 {v2.s}[1], [x6], x1
+ ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3]
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ldur d0, [x2, #1] // top (0-7)
+ sub x2, x2, #2
+ mov x7, #-2 // walk the left edge two pixels at a time
+ uxtl v0.8h, v0.8b // top (0-7)
+8:
+ ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.8b, v2.8h, #4
+ uxtl v1.8h, v2.8b // first block, in 16 bit
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.8b, v3.8h, #4
+ subs w4, w4, #2
+ // st2 on .s lanes interleaves the two 4 pixel blocks into one 8 pixel row.
+ st2 {v2.s, v3.s}[0], [x0], x1
+ zip2 v0.2s, v2.2s, v3.2s // second output row becomes the next top
+ st2 {v2.s, v3.s}[1], [x6], x1
+ uxtl v0.8h, v0.8b
+ b.gt 8b
+ ret
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x2, #1 // x8 = top row pointer
+ sub x2, x2, #2
+ mov x7, #-2 // walk the left edge two pixels at a time
+ sub x1, x1, w3, uxtw // x1 = 2*stride - width: end of row -> next row pair
+ mov w9, w3 // keep the full width for resetting w3 per row pair
+
+1:
+ ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ uxtl v0.8h, v0.8b // left (0-1) + topleft (2)
+2:
+ // 16 pixels (four 4x2 blocks) per iteration.
+ ld1 {v2.16b}, [x8], #16 // top(0-15)
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ uxtl v1.8h, v2.8b // top(0-7)
+ uxtl2 v2.8h, v2.16b // top(8-15)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.8b, v3.8h, #4
+ uxtl v0.8h, v3.8b // first block, in 16 bit
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.8b, v4.8h, #4
+ uxtl v0.8h, v4.8b // second block, in 16 bit
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.8b, v5.8h, #4
+ uxtl v0.8h, v5.8b // third block, in 16 bit
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.8b, v6.8h, #4
+
+ // st4 on .s lanes interleaves the four 4 pixel blocks into a 16 pixel row.
+ st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
+ st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
+ b.le 8f
+ // Rebuild the left/topleft vector for the next 16 pixel group from the
+ // last top pixel and the last column of the fourth block.
+ ins v0.h[2], v2.h[7]
+ ins v0.b[0], v6.b[7]
+ ins v0.b[2], v6.b[3]
+ b 2b
+8:
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw // next top = start of the second row just written
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+// Jump table, ordered from the widest (w=32) to the narrowest (w=4) path.
+L(ipred_filter_tbl):
+ .hword L(ipred_filter_tbl) - 320b
+ .hword L(ipred_filter_tbl) - 160b
+ .hword L(ipred_filter_tbl) - 80b
+ .hword L(ipred_filter_tbl) - 40b
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
+//
+// Palette prediction: every output pixel is pal[index]. The 8 byte palette
+// is kept in v0 and looked up with tbl. Each idx byte carries two indices:
+// the first pixel in bits 0-2 (and with 7) and the second pixel in the high
+// nibble (ushr by 4); zip1/zip2 put them back into pixel order.
+// The width selects a path through the jump table at the end of the
+// function (index = clz(w) - 25: 0 for w=64 ... 4 for w=4).
+function pal_pred_8bpc_neon, export=1
+ ld1 {v0.8b}, [x2] // palette
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25
+ movi v31.16b, #7 // mask for the low index of each idx byte
+ ldrh w9, [x6, w9, uxtw #1]
+ sub x6, x6, w9, uxtw
+ add x2, x0, x1 // x2 is reused as the pointer to the second row
+ lsl x1, x1, #1 // step two rows per store
+ br x6
+4:
+ AARCH64_VALID_JUMP_TARGET
+ // 8 idx bytes = 16 pixels = four rows of 4.
+ ld1 {v1.8b}, [x3], #8
+ subs w5, w5, #4
+ ushr v3.8b, v1.8b, #4
+ and v2.8b, v1.8b, v31.8b
+ zip1 v1.16b, v2.16b, v3.16b
+ tbl v1.16b, {v0.16b}, v1.16b
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x2], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[3], [x2], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ // 16 idx bytes = 32 pixels = four rows of 8.
+ ld1 {v1.16b}, [x3], #16
+ subs w5, w5, #4
+ ushr v4.16b, v1.16b, #4
+ and v3.16b, v1.16b, v31.16b
+ zip1 v1.16b, v3.16b, v4.16b
+ zip2 v2.16b, v3.16b, v4.16b
+ tbl v1.16b, {v0.16b}, v1.16b
+ st1 {v1.d}[0], [x0], x1
+ tbl v2.16b, {v0.16b}, v2.16b
+ st1 {v1.d}[1], [x2], x1
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x2], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ // 32 idx bytes = 64 pixels = four rows of 16.
+ ld1 {v1.16b, v2.16b}, [x3], #32
+ subs w5, w5, #4
+ ushr v5.16b, v1.16b, #4
+ and v4.16b, v1.16b, v31.16b
+ ushr v7.16b, v2.16b, #4
+ and v6.16b, v2.16b, v31.16b
+ zip1 v1.16b, v4.16b, v5.16b
+ zip2 v2.16b, v4.16b, v5.16b
+ zip1 v3.16b, v6.16b, v7.16b
+ tbl v1.16b, {v0.16b}, v1.16b
+ zip2 v4.16b, v6.16b, v7.16b
+ tbl v2.16b, {v0.16b}, v2.16b
+ st1 {v1.16b}, [x0], x1
+ tbl v3.16b, {v0.16b}, v3.16b
+ st1 {v2.16b}, [x2], x1
+ tbl v4.16b, {v0.16b}, v4.16b
+ st1 {v3.16b}, [x0], x1
+ st1 {v4.16b}, [x2], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ // 64 idx bytes = 128 pixels = four rows of 32.
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+ subs w5, w5, #4
+ ushr v21.16b, v16.16b, #4
+ and v20.16b, v16.16b, v31.16b
+ ushr v23.16b, v17.16b, #4
+ and v22.16b, v17.16b, v31.16b
+ ushr v25.16b, v18.16b, #4
+ and v24.16b, v18.16b, v31.16b
+ ushr v27.16b, v19.16b, #4
+ and v26.16b, v19.16b, v31.16b
+ zip1 v16.16b, v20.16b, v21.16b
+ zip2 v17.16b, v20.16b, v21.16b
+ zip1 v18.16b, v22.16b, v23.16b
+ zip2 v19.16b, v22.16b, v23.16b
+ zip1 v20.16b, v24.16b, v25.16b
+ zip2 v21.16b, v24.16b, v25.16b
+ tbl v16.16b, {v0.16b}, v16.16b
+ zip1 v22.16b, v26.16b, v27.16b
+ tbl v17.16b, {v0.16b}, v17.16b
+ zip2 v23.16b, v26.16b, v27.16b
+ tbl v18.16b, {v0.16b}, v18.16b
+ tbl v19.16b, {v0.16b}, v19.16b
+ tbl v20.16b, {v0.16b}, v20.16b
+ st1 {v16.16b, v17.16b}, [x0], x1
+ tbl v21.16b, {v0.16b}, v21.16b
+ st1 {v18.16b, v19.16b}, [x2], x1
+ tbl v22.16b, {v0.16b}, v22.16b
+ st1 {v20.16b, v21.16b}, [x0], x1
+ tbl v23.16b, {v0.16b}, v23.16b
+ st1 {v22.16b, v23.16b}, [x2], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ // 64 idx bytes = 128 pixels = two rows of 64, hence h -= 2 here.
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+ subs w5, w5, #2
+ ushr v21.16b, v16.16b, #4
+ and v20.16b, v16.16b, v31.16b
+ ushr v23.16b, v17.16b, #4
+ and v22.16b, v17.16b, v31.16b
+ ushr v25.16b, v18.16b, #4
+ and v24.16b, v18.16b, v31.16b
+ ushr v27.16b, v19.16b, #4
+ and v26.16b, v19.16b, v31.16b
+ zip1 v16.16b, v20.16b, v21.16b
+ zip2 v17.16b, v20.16b, v21.16b
+ zip1 v18.16b, v22.16b, v23.16b
+ zip2 v19.16b, v22.16b, v23.16b
+ zip1 v20.16b, v24.16b, v25.16b
+ zip2 v21.16b, v24.16b, v25.16b
+ tbl v16.16b, {v0.16b}, v16.16b
+ zip1 v22.16b, v26.16b, v27.16b
+ tbl v17.16b, {v0.16b}, v17.16b
+ zip2 v23.16b, v26.16b, v27.16b
+ tbl v18.16b, {v0.16b}, v18.16b
+ tbl v19.16b, {v0.16b}, v19.16b
+ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+ tbl v20.16b, {v0.16b}, v20.16b
+ tbl v21.16b, {v0.16b}, v21.16b
+ tbl v22.16b, {v0.16b}, v22.16b
+ tbl v23.16b, {v0.16b}, v23.16b
+ st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
+ b.gt 64b
+ ret
+
+// Jump table, ordered from the widest (w=64) to the narrowest (w=4) path.
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 64b
+ .hword L(pal_pred_tbl) - 32b
+ .hword L(pal_pred_tbl) - 16b
+ .hword L(pal_pred_tbl) - 8b
+ .hword L(pal_pred_tbl) - 4b
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1 // CfL prediction with dc fixed at 128: dst = iclip_pixel(128 + apply_sign(ac * alpha))
+ clz w9, w3 // clz(width)
+ adr x7, L(ipred_cfl_128_tbl) // x7 = jump table base
+ sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4
+ ldrh w9, [x7, w9, uxtw #1] // 16-bit entry = table base - handler address
+ movi v0.8h, #128 // dc
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw // x7 = handler address
+ add x6, x0, x1 // x6 = dst + stride (odd rows)
+ lsl x1, x1, #1 // stride *= 2: x0 walks even rows, x6 odd rows
+ br x7
+L(ipred_cfl_splat_w4): // shared splat loop, also entered from the other ipred_cfl_* functions; expects v0 = dc, v1 = alpha, x5 = ac, x0/x6 = row pointers, x1 = 2*stride, w4 = height
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x5], #32 // 16 ac values = 4 rows of 4
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ cmlt v4.8h, v2.8h, #0 // sign
+ cmlt v5.8h, v3.8h, #0
+ add v2.8h, v2.8h, v4.8h // diff + sign
+ add v3.8h, v3.8h, v5.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [x0], x1 // row 0
+ st1 {v2.s}[1], [x6], x1 // row 1
+ subs w4, w4, #4 // 4 rows per iteration
+ st1 {v3.s}[0], [x0], x1 // row 2
+ st1 {v3.s}[1], [x6], x1 // row 3
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8): // shared splat loop for width 8, same register contract as the w4 loop
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 // 32 ac values = 4 rows of 8
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ mul v4.8h, v4.8h, v1.8h
+ mul v5.8h, v5.8h, v1.8h
+ cmlt v16.8h, v2.8h, #0 // sign
+ cmlt v17.8h, v3.8h, #0
+ cmlt v18.8h, v4.8h, #0
+ cmlt v19.8h, v5.8h, #0
+ add v2.8h, v2.8h, v16.8h // diff + sign
+ add v3.8h, v3.8h, v17.8h
+ add v4.8h, v4.8h, v18.8h
+ add v5.8h, v5.8h, v19.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ srshr v4.8h, v4.8h, #6
+ srshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ add v4.8h, v4.8h, v0.8h
+ add v5.8h, v5.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ st1 {v2.8b}, [x0], x1 // row 0
+ st1 {v3.8b}, [x6], x1 // row 1
+ subs w4, w4, #4 // 4 rows per iteration
+ st1 {v4.8b}, [x0], x1 // row 2
+ st1 {v5.8b}, [x6], x1 // row 3
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16): // shared splat loop for widths that are multiples of 16 (the table maps both 16 and 32 here)
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x5, w3, uxtw #1 // x7 = ac + width * sizeof(int16_t): second ac row
+ sub x1, x1, w3, uxtw // x1 = 2*stride - width, because the inner loop already advances x0/x6 by width
+ mov w9, w3 // keep width to restart the column counter
+1: // 16 pixels of two rows per iteration (also re-entered for the next row pair)
+ ld1 {v2.8h, v3.8h}, [x5], #32 // 16 ac values of the even row
+ ld1 {v4.8h, v5.8h}, [x7], #32 // 16 ac values of the odd row
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ mul v4.8h, v4.8h, v1.8h
+ mul v5.8h, v5.8h, v1.8h
+ cmlt v16.8h, v2.8h, #0 // sign
+ cmlt v17.8h, v3.8h, #0
+ cmlt v18.8h, v4.8h, #0
+ cmlt v19.8h, v5.8h, #0
+ add v2.8h, v2.8h, v16.8h // diff + sign
+ add v3.8h, v3.8h, v17.8h
+ add v4.8h, v4.8h, v18.8h
+ add v5.8h, v5.8h, v19.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ srshr v4.8h, v4.8h, #6
+ srshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ add v4.8h, v4.8h, v0.8h
+ add v5.8h, v5.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ subs w3, w3, #16 // 16 columns done
+ st1 {v2.8b, v3.8b}, [x0], #16 // even row
+ st1 {v4.8b, v5.8b}, [x6], #16 // odd row
+ b.gt 1b // more columns in this row pair
+ subs w4, w4, #2 // two rows done; these flags feed the b.gt below
+ add x5, x5, w9, uxtw #1 // skip the ac row that was consumed through x7
+ add x7, x7, w9, uxtw #1 // skip the ac row that was consumed through x5
+ add x0, x0, x1 // next even row
+ add x6, x6, x1 // next odd row
+ mov w3, w9 // reload the column counter
+ b.gt 1b
+ ret
+
+L(ipred_cfl_128_tbl): // entries are (table address - handler address), indexed by clz(width) - 26
+L(ipred_cfl_splat_tbl): // second name for the same table, used by ipred_cfl_left_8bpc_neon
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) // width 32
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) // width 16
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) // width 8
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) // width 4
+endfunc
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1 // CfL prediction with dc = rounded average of the width pixels starting at topleft + 1
+ clz w9, w3 // clz(width)
+ adr x7, L(ipred_cfl_top_tbl) // x7 = jump table base
+ sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4
+ ldrh w9, [x7, w9, uxtw #1] // 16-bit entry = table base - handler address
+ dup v1.8h, w6 // alpha
+ add x2, x2, #1 // x2 = topleft + 1, first pixel that gets averaged
+ sub x7, x7, w9, uxtw // x7 = handler address
+ add x6, x0, x1 // x6 = dst + stride (odd rows)
+ lsl x1, x1, #1 // stride *= 2
+ br x7
+4: // width 4
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2] // 4 pixels, replicated into both 32-bit lanes
+ uaddlv h0, v0.8b // sum of 8 bytes = 2 * sum of the 4 pixels
+ urshr v0.4h, v0.4h, #3 // rounding >> 3 of the doubled sum = rounded sum / 4
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w4) // label lives in ipred_cfl_128_8bpc_neon
+8: // width 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2] // 8 pixels
+ uaddlv h0, v0.8b // sum
+ urshr v0.4h, v0.4h, #3 // rounded sum / 8
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w8)
+16: // width 16
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2] // 16 pixels
+ uaddlv h0, v0.16b // sum
+ urshr v0.4h, v0.4h, #4 // rounded sum / 16
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w16)
+32: // width 32
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2] // 32 pixels
+ uaddlv h2, v2.16b // sum of the first 16
+ uaddlv h3, v3.16b // sum of the last 16
+ add v2.4h, v2.4h, v3.4h // total sum
+ urshr v2.4h, v2.4h, #5 // rounded sum / 32
+ dup v0.8h, v2.h[0] // broadcast dc into v0, where the splat loop expects it
+ b L(ipred_cfl_splat_w16) // the w16 splat loop iterates over the full width
+
+L(ipred_cfl_top_tbl): // entries are (table address - handler address), indexed by clz(width) - 26
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1 // CfL prediction with dc = rounded average of the height pixels starting at topleft - height
+ sub x2, x2, w4, uxtw // x2 = topleft - height, first pixel that gets averaged
+ clz w9, w3 // clz(width)
+ clz w8, w4 // clz(height)
+ adr x10, L(ipred_cfl_splat_tbl) // splat loop table (defined in ipred_cfl_128_8bpc_neon), indexed by width
+ adr x7, L(ipred_cfl_left_tbl) // dc handler table, indexed by height
+ sub w9, w9, #26 // width index: 0 for 32 ... 3 for 4
+ sub w8, w8, #26 // height index: 0 for 32 ... 3 for 4
+ ldrh w9, [x10, w9, uxtw #1] // offset of the splat loop
+ ldrh w8, [x7, w8, uxtw #1] // offset of the dc handler
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw // x9 = splat loop for this width, reached via br x9 below
+ sub x7, x7, w8, uxtw // x7 = dc handler for this height
+ add x6, x0, x1 // x6 = dst + stride (odd rows)
+ lsl x1, x1, #1 // stride *= 2
+ br x7
+
+L(ipred_cfl_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2] // 4 pixels, replicated into both 32-bit lanes
+ uaddlv h0, v0.8b // sum of 8 bytes = 2 * sum of the 4 pixels
+ urshr v0.4h, v0.4h, #3 // rounding >> 3 of the doubled sum = rounded sum / 4
+ dup v0.8h, v0.h[0] // broadcast dc
+ br x9 // continue in the splat loop
+
+L(ipred_cfl_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2] // 8 pixels
+ uaddlv h0, v0.8b // sum
+ urshr v0.4h, v0.4h, #3 // rounded sum / 8
+ dup v0.8h, v0.h[0] // broadcast dc
+ br x9
+
+L(ipred_cfl_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2] // 16 pixels
+ uaddlv h0, v0.16b // sum
+ urshr v0.4h, v0.4h, #4 // rounded sum / 16
+ dup v0.8h, v0.h[0] // broadcast dc
+ br x9
+
+L(ipred_cfl_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2] // 32 pixels
+ uaddlv h2, v2.16b // sum of the first 16
+ uaddlv h3, v3.16b // sum of the last 16
+ add v2.4h, v2.4h, v3.4h // total sum
+ urshr v2.4h, v2.4h, #5 // rounded sum / 32
+ dup v0.8h, v2.h[0] // broadcast dc into v0, where the splat loop expects it
+ br x9
+
+L(ipred_cfl_left_tbl): // entries are (table address - handler address), indexed by clz(height) - 26
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1 // CfL prediction with dc = rounded average over height pixels at topleft - height plus width pixels at topleft + 1
+ sub x2, x2, w4, uxtw // x2 = topleft - height
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3 // clz(width)
+ clz w6, w4 // clz(height); w6 is free again since alpha now lives in v1
+ dup v16.8h, w8 // width + height
+ adr x7, L(ipred_cfl_tbl) // x7 = jump table base (height handlers first, then width handlers)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
+ sub w6, w6, #26 // height index: 0 for 32 ... 3 for 4
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1] // offset of the width handler
+ ldrh w6, [x7, w6, uxtw #1] // offset of the height handler
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw // x9 = L(ipred_cfl_wN), entered second via br x9
+ sub x7, x7, w6, uxtw // x7 = L(ipred_cfl_hN), entered first
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w8 // -ctz(width + height)
+ add x6, x0, x1 // x6 = dst + stride (odd rows)
+ lsl x1, x1, #1 // stride *= 2
+ br x7
+
+L(ipred_cfl_h4): // height 4: sum the 4 pixels below x2
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], #4 // 4 pixels, x2 += 4
+ ins v0.s[1], wzr // clear lane 1 so the 8-byte sum only counts those 4 pixels
+ add x2, x2, #1 // step over topleft itself: x2 = topleft + 1
+ uaddlv h0, v0.8b // v0.h[0] = sum of the first edge
+ br x9 // continue in the width handler
+L(ipred_cfl_w4): // width 4: add the second edge and finish dc
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.s}[0], [x2] // 4 pixels at topleft + 1
+ ins v2.s[1], wzr // clear lane 1, as above
+ add v0.4h, v0.4h, v16.4h // + (width + height) >> 1 for rounding
+ uaddlv h2, v2.8b // sum of the second edge
+ cmp w4, #4 // height == width?
+ add v0.4h, v0.4h, v2.4h // total sum
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height); a negative count shifts right
+ b.eq 1f // square block: width + height is a power of two, so dc is final
+ // h = 8/16
+ mov w16, #(0x3334/2) // low half: about 1/5 in the form sqdmulh expects
+ movk w16, #(0x5556/2), lsl #16 // high half: about 1/3
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17 // h=8: shift by 16 brings 1/3 down (w+h = 12); h=16: the register shift count wraps mod 32, so 1/5 stays (w+h = 20)
+ dup v16.4h, w16 // low 16 bits of w16 = selected multiplier
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * x * m) >> 16: divides out the remaining factor 3 or 5
+1:
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w4) // label lives in ipred_cfl_128_8bpc_neon
+
+L(ipred_cfl_h8): // height 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], #8 // 8 pixels, x2 += 8
+ uaddlv h0, v0.8b // sum of the first edge
+ add x2, x2, #1 // step over topleft itself: x2 = topleft + 1
+ br x9
+L(ipred_cfl_w8): // width 8
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8b}, [x2] // 8 pixels at topleft + 1
+ add v0.4h, v0.4h, v16.4h // + (width + height) >> 1 for rounding
+ uaddlv h2, v2.8b // sum of the second edge
+ cmp w4, #8 // height == width?
+ add v0.4h, v0.4h, v2.4h // total sum
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f // square block: done
+ // h = 4/16/32
+ cmp w4, #32 // h=32 gives w+h = 40 (factor 5); h=4/16 give 12/24 (factor 3)
+ mov w16, #(0x3334/2) // about 1/5
+ mov w17, #(0x5556/2) // about 1/3
+ csel w16, w16, w17, eq // 1/5 when h == 32, else 1/3
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * x * m) >> 16
+1:
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16): // height 16
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2], #16 // 16 pixels, x2 += 16
+ uaddlv h0, v0.16b // sum of the first edge
+ add x2, x2, #1 // step over topleft itself: x2 = topleft + 1
+ br x9
+L(ipred_cfl_w16): // width 16
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b}, [x2] // 16 pixels at topleft + 1
+ add v0.4h, v0.4h, v16.4h // + (width + height) >> 1 for rounding
+ uaddlv h2, v2.16b // sum of the second edge
+ cmp w4, #16 // height == width?
+ add v0.4h, v0.4h, v2.4h // total sum
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f // square block: done
+ // h = 4/8/32
+ cmp w4, #4 // h=4 gives w+h = 20 (factor 5); h=8/32 give 24/48 (factor 3)
+ mov w16, #(0x3334/2) // about 1/5
+ mov w17, #(0x5556/2) // about 1/3
+ csel w16, w16, w17, eq // 1/5 when h == 4, else 1/3
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * x * m) >> 16
+1:
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32): // height 32
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2], #32 // 32 pixels, x2 += 32
+ uaddlv h2, v2.16b // sum of the first 16
+ uaddlv h3, v3.16b // sum of the last 16
+ add x2, x2, #1 // step over topleft itself: x2 = topleft + 1
+ add v0.4h, v2.4h, v3.4h // v0.h[0] = sum of the first edge
+ br x9
+L(ipred_cfl_w32): // width 32
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2] // 32 pixels at topleft + 1
+ add v0.4h, v0.4h, v16.4h // + (width + height) >> 1 for rounding
+ uaddlv h2, v2.16b // sum of the first 16
+ uaddlv h3, v3.16b // sum of the last 16
+ cmp w4, #32 // height == width?
+ add v0.4h, v0.4h, v2.4h
+ add v0.4h, v0.4h, v3.4h // total sum
+ ushl v0.4h, v0.4h, v17.4h // >> ctz(width + height)
+ b.eq 1f // square block: done
+ // h = 8/16
+ mov w16, #(0x5556/2) // low half: about 1/3
+ movk w16, #(0x3334/2), lsl #16 // high half: about 1/5
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17 // h=8: shift by 16 brings 1/5 down (w+h = 40); h=16: count wraps mod 32, so 1/3 stays (w+h = 48)
+ dup v16.4h, w16 // low 16 bits of w16 = selected multiplier
+ sqdmulh v0.4h, v0.4h, v16.4h // (2 * x * m) >> 16
+1:
+ dup v0.8h, v0.h[0] // broadcast dc
+ b L(ipred_cfl_splat_w16) // the w16 splat loop iterates over the full width
+
+L(ipred_cfl_tbl): // entries 0-3 indexed by clz(height) - 26, entries 4-7 by clz(width) - 22
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1 // Builds the ac buffer from 2x2-subsampled input: ac = 2 * (2x2 pixel sum), padded right/bottom, then the rounded mean is subtracted
+ clz w8, w5 // clz(width)
+ lsl w4, w4, #2 // h_pad *= 4 (presumably passed in units of 4 rows - confirm against the caller)
+ adr x7, L(ipred_cfl_ac_420_tbl) // x7 = jump table base
+ sub w8, w8, #27 // table index: 0 for width 16, 1 for 8, 2 for 4
+ ldrh w8, [x7, w8, uxtw #1] // 16-bit entry = table base - handler address
+ movi v16.8h, #0 // v16-v19 accumulate the sum of all stored values
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw // x7 = handler address
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2 // x10 = ypx + stride (odd input rows)
+ dup v31.4s, w9
+ lsl x2, x2, #1 // stride *= 2: x1 walks even input rows, x10 odd ones
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8b}, [x1], x2 // input row 0, 8 pixels
+ ld1 {v1.8b}, [x10], x2 // input row 1
+ ld1 {v0.d}[1], [x1], x2 // input row 2
+ ld1 {v1.d}[1], [x10], x2 // input row 3
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ uaddlp v1.8h, v1.16b
+ add v0.8h, v0.8h, v1.8h // + vertical neighbour = 2x2 sums (two output rows of 4)
+ shl v0.8h, v0.8h, #1 // 2 * (2x2 sum)
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h}, [x0], #16
+ add v16.8h, v16.8h, v0.8h // accumulate for the sum
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d // v1 = last output row in both halves
+ trn2 v0.2d, v0.2d, v0.2d // v0 = same, ready for vertical padding
+L(ipred_cfl_ac_420_w4_hpad): // also entered from the 422 and 444 width-4 paths with v0/v1 = last row twice
+ cbz w4, 3f // no vertical padding requested
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4 // four padded rows of 4 per iteration
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ b.gt 2b
+3:
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ uaddlv s0, v0.8h // sum
+ sub x0, x0, w6, uxtw #3 // rewind ac by height * 4 values * 2 bytes
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0] // broadcast dc
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0] // 16 values = 4 rows of 4
+ subs w6, w6, #4 // four rows per iteration
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad) // w_pad != 0: use the padded variant
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b}, [x1], x2 // input row 0, 16 pixels
+ ld1 {v1.16b}, [x10], x2 // input row 1
+ ld1 {v2.16b}, [x1], x2 // input row 2
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ ld1 {v3.16b}, [x10], x2 // input row 3
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ add v0.8h, v0.8h, v1.8h // 2x2 sums, output row 0
+ add v2.8h, v2.8h, v3.8h // 2x2 sums, output row 1
+ shl v0.8h, v0.8h, #1 // 2 * (2x2 sum)
+ shl v1.8h, v2.8h, #1 // output row 1 goes to v1 so v0/v1 can be stored as a pair
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b // v0 = v1 = last output row, for vertical padding
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8b}, [x1], x2 // input row 0, only 8 pixels are read
+ ld1 {v1.8b}, [x10], x2 // input row 1
+ ld1 {v0.d}[1], [x1], x2 // input row 2
+ ld1 {v1.d}[1], [x10], x2 // input row 3
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ uaddlp v1.8h, v1.16b
+ add v0.8h, v0.8h, v1.8h // 2x2 sums: low half = output row 0, high half = output row 1
+ shl v0.8h, v0.8h, #1 // 2 * (2x2 sum)
+ dup v1.4h, v0.h[3] // right padding of row 0 = its last computed value
+ dup v3.4h, v0.h[7] // right padding of row 1
+ trn2 v2.2d, v0.2d, v0.2d // v2 low half = output row 1
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 // row 0, its padding, row 1, its padding
+ add v16.4h, v16.4h, v0.4h
+ add v17.4h, v17.4h, v1.4h
+ add v18.4h, v18.4h, v2.4h
+ add v19.4h, v19.4h, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d // v0 = last output row including its padding
+ trn1 v1.2d, v2.2d, v3.2d // v1 = same row again
+
+L(ipred_cfl_ac_420_w8_hpad): // also entered from the 422 and 444 width-8 paths with v0 = v1 = last row
+ cbz w4, 3f // no vertical padding requested
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4 // four padded rows of 8 per iteration
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v18.8h, v18.8h, v0.8h
+ add v19.8h, v19.8h, v1.8h
+ b.gt 2b
+3:
+
+L(ipred_cfl_ac_420_w8_calc_subtract_dc): // the width-16 path jumps here after doubling w6
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ add v2.8h, v18.8h, v19.8h
+ uaddlp v0.4s, v0.8h // widen to 32 bit before the final reduction
+ uaddlp v2.4s, v2.8h
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4 // rewind ac by w6 * 8 values * 2 bytes
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0] // broadcast dc
+L(ipred_cfl_ac_420_w8_subtract_dc): // the 444 width-32 path jumps here with v4 = dc, x0 rewound and w6 scaled
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // 32 values
+ subs w6, w6, #4 // w6 counts rows of 8 values here
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v4.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_420_w16_tbl) // second-level table, indexed by w_pad (0-3)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw // x7 = handler for this w_pad
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b, v1.16b}, [x1], x2 // input row 0, 32 pixels
+ ld1 {v2.16b, v3.16b}, [x10], x2 // input row 1
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ ld1 {v4.16b, v5.16b}, [x1], x2 // input row 2
+ uaddlp v1.8h, v1.16b
+ ld1 {v6.16b, v7.16b}, [x10], x2 // input row 3
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v5.8h, v5.16b
+ uaddlp v6.8h, v6.16b
+ uaddlp v7.8h, v7.16b
+ add v0.8h, v0.8h, v2.8h // 2x2 sums, output row 0
+ add v1.8h, v1.8h, v3.8h
+ add v4.8h, v4.8h, v6.8h // 2x2 sums, output row 1
+ add v5.8h, v5.8h, v7.8h
+ shl v0.8h, v0.8h, #1 // 2 * (2x2 sum)
+ shl v1.8h, v1.8h, #1
+ shl v2.8h, v4.8h, #1 // output row 1 goes to v2/v3 so v0-v3 can be stored together
+ shl v3.8h, v5.8h, #1
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last output row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr d1, [x1, #16] // input row 0, pixels 16-23 (read before x1 is post-incremented)
+ ld1 {v0.16b}, [x1], x2 // input row 0, pixels 0-15
+ ldr d3, [x10, #16] // input row 1, pixels 16-23
+ ld1 {v2.16b}, [x10], x2 // input row 1, pixels 0-15
+ uaddlp v1.4h, v1.8b // horizontal pair sums
+ ldr d5, [x1, #16] // input row 2, pixels 16-23
+ uaddlp v0.8h, v0.16b
+ ld1 {v4.16b}, [x1], x2 // input row 2, pixels 0-15
+ uaddlp v3.4h, v3.8b
+ ldr d7, [x10, #16] // input row 3, pixels 16-23
+ uaddlp v2.8h, v2.16b
+ ld1 {v6.16b}, [x10], x2 // input row 3, pixels 0-15
+ uaddlp v5.4h, v5.8b
+ uaddlp v4.8h, v4.16b
+ uaddlp v7.4h, v7.8b
+ uaddlp v6.8h, v6.16b
+ add v1.4h, v1.4h, v3.4h // 2x2 sums, output row 0, values 8-11
+ add v0.8h, v0.8h, v2.8h // output row 0, values 0-7
+ add v5.4h, v5.4h, v7.4h // output row 1, values 8-11
+ add v4.8h, v4.8h, v6.8h // output row 1, values 0-7
+ shl v1.4h, v1.4h, #1 // 2 * (2x2 sum)
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v5.4h, #1
+ shl v2.8h, v4.8h, #1
+ dup v4.4h, v1.h[3] // right padding of row 0 = its last computed value
+ dup v5.4h, v3.h[3] // right padding of row 1
+ trn1 v1.2d, v1.2d, v4.2d // v1 = values 8-11 followed by 4 padded values
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last output row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.16b}, [x1], x2 // input row 0, only 16 pixels are read
+ ld1 {v2.16b}, [x10], x2 // input row 1
+ ld1 {v4.16b}, [x1], x2 // input row 2
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ ld1 {v6.16b}, [x10], x2 // input row 3
+ uaddlp v2.8h, v2.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v6.8h, v6.16b
+ add v0.8h, v0.8h, v2.8h // 2x2 sums, output row 0
+ add v4.8h, v4.8h, v6.8h // 2x2 sums, output row 1
+ shl v0.8h, v0.8h, #1 // 2 * (2x2 sum)
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7] // right padding of row 0: 8 copies of its last value
+ dup v3.8h, v2.h[7] // right padding of row 1
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last output row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8b}, [x1], x2 // input row 0, only 8 pixels are read
+ ld1 {v2.8b}, [x10], x2 // input row 1
+ ld1 {v4.8b}, [x1], x2 // input row 2
+ uaddlp v0.4h, v0.8b // horizontal pair sums
+ ld1 {v6.8b}, [x10], x2 // input row 3
+ uaddlp v2.4h, v2.8b
+ uaddlp v4.4h, v4.8b
+ uaddlp v6.4h, v6.8b
+ add v0.4h, v0.4h, v2.4h // 2x2 sums, output row 0 (4 values)
+ add v4.4h, v4.4h, v6.4h // 2x2 sums, output row 1
+ shl v0.4h, v0.4h, #1 // 2 * (2x2 sum)
+ shl v2.4h, v4.4h, #1
+ dup v1.8h, v0.h[3] // right padding of row 0: copies of its last value
+ dup v3.8h, v2.h[3] // right padding of row 1
+ trn1 v0.2d, v0.2d, v1.2d // v0 = 4 computed values followed by 4 padded values
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last output row, for vertical padding
+ mov v1.16b, v3.16b
+
+L(ipred_cfl_ac_420_w16_hpad): // also entered from the 422 and 444 width-16 paths with v0-v3 = last row twice
+ cbz w4, 3f // no vertical padding requested
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4 // four padded rows of 16 per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the w8 summing/subtracting
+ lsl w6, w6, #1
+ b L(ipred_cfl_ac_420_w8_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl): // entries are (table address - handler address), indexed by clz(width) - 27
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0 // filler entry; no index computed above selects it for widths 4-16
+
+L(ipred_cfl_ac_420_w16_tbl): // indexed by w_pad
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1 // Builds the ac buffer from horizontally subsampled input: ac = 4 * (pixel pair sum); padding and dc subtraction reuse the 420 code
+ clz w8, w5 // clz(width)
+ lsl w4, w4, #2 // h_pad *= 4 (presumably passed in units of 4 rows - confirm against the caller)
+ adr x7, L(ipred_cfl_ac_422_tbl) // x7 = jump table base
+ sub w8, w8, #27 // table index: 0 for width 16, 1 for 8, 2 for 4
+ ldrh w8, [x7, w8, uxtw #1] // 16-bit entry = table base - handler address
+ movi v16.8h, #0 // v16-v19 accumulate the sum of all stored values
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw // x7 = handler address
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2 // x10 = ypx + stride (odd input rows)
+ dup v31.4s, w9
+ lsl x2, x2, #1 // stride *= 2: x1 walks even input rows, x10 odd ones
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8b}, [x1], x2 // row 0, 8 pixels
+ ld1 {v0.d}[1], [x10], x2 // row 1
+ ld1 {v1.8b}, [x1], x2 // row 2
+ ld1 {v1.d}[1], [x10], x2 // row 3
+ uaddlp v0.8h, v0.16b // horizontal pair sums: rows 0 and 1
+ uaddlp v1.8h, v1.16b // rows 2 and 3
+ shl v0.8h, v0.8h, #2 // 4 * (pair sum)
+ shl v1.8h, v1.8h, #2
+ subs w8, w8, #4 // four rows per iteration
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d // v0 = last row in both halves
+ trn2 v1.2d, v1.2d, v1.2d // v1 = same
+ b L(ipred_cfl_ac_420_w4_hpad) // label lives in ipred_cfl_ac_420_8bpc_neon
+
+L(ipred_cfl_ac_422_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad) // w_pad != 0: use the padded variant
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b}, [x1], x2 // row 0, 16 pixels
+ ld1 {v1.16b}, [x10], x2 // row 1
+ ld1 {v2.16b}, [x1], x2 // row 2
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ ld1 {v3.16b}, [x10], x2 // row 3
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ shl v0.8h, v0.8h, #2 // 4 * (pair sum)
+ shl v1.8h, v1.8h, #2
+ shl v2.8h, v2.8h, #2
+ shl v3.8h, v3.8h, #2
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b // v0 = v1 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8b}, [x1], x2 // row 0, only 8 pixels are read
+ ld1 {v0.d}[1], [x10], x2 // row 1
+ ld1 {v2.8b}, [x1], x2 // row 2
+ ld1 {v2.d}[1], [x10], x2 // row 3
+ uaddlp v0.8h, v0.16b // horizontal pair sums: rows 0 and 1, 4 values each
+ uaddlp v2.8h, v2.16b // rows 2 and 3
+ shl v0.8h, v0.8h, #2 // 4 * (pair sum)
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3] // right padding of row 0 = its last computed value
+ dup v5.8h, v0.h[7] // right padding of row 1
+ dup v6.4h, v2.h[3] // right padding of row 2
+ dup v7.8h, v2.h[7] // right padding of row 3
+ trn2 v1.2d, v0.2d, v5.2d // v1 = row 1 followed by its padding (must read v0 before it is rewritten)
+ trn1 v0.2d, v0.2d, v4.2d // v0 = row 0 followed by its padding
+ trn2 v3.2d, v2.2d, v7.2d // v3 = row 3 followed by its padding
+ trn1 v2.2d, v2.2d, v6.2d // v2 = row 2 followed by its padding
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b // v0 = v1 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_422_w16_tbl) // second-level table, indexed by w_pad (0-3)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw // x7 = handler for this w_pad
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b, v1.16b}, [x1], x2 // row 0, 32 pixels
+ ld1 {v2.16b, v3.16b}, [x10], x2 // row 1
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ shl v0.8h, v0.8h, #2 // 4 * (pair sum)
+ shl v1.8h, v1.8h, #2
+ shl v2.8h, v2.8h, #2
+ shl v3.8h, v3.8h, #2
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad) // label lives in ipred_cfl_ac_420_8bpc_neon
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr d1, [x1, #16] // row 0, pixels 16-23 (read before x1 is post-incremented)
+ ld1 {v0.16b}, [x1], x2 // row 0, pixels 0-15
+ ldr d3, [x10, #16] // row 1, pixels 16-23
+ ld1 {v2.16b}, [x10], x2 // row 1, pixels 0-15
+ uaddlp v1.4h, v1.8b // horizontal pair sums
+ uaddlp v0.8h, v0.16b
+ uaddlp v3.4h, v3.8b
+ uaddlp v2.8h, v2.16b
+ shl v1.4h, v1.4h, #2 // 4 * (pair sum)
+ shl v0.8h, v0.8h, #2
+ shl v3.4h, v3.4h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v1.h[3] // right padding of row 0 = its last computed value
+ dup v5.4h, v3.h[3] // right padding of row 1
+ trn1 v1.2d, v1.2d, v4.2d // v1 = values 8-11 followed by 4 padded values
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.16b}, [x1], x2 // row 0, only 16 pixels are read
+ ld1 {v2.16b}, [x10], x2 // row 1
+ uaddlp v0.8h, v0.16b // horizontal pair sums
+ uaddlp v2.8h, v2.16b
+ shl v0.8h, v0.8h, #2 // 4 * (pair sum)
+ shl v2.8h, v2.8h, #2
+ dup v1.8h, v0.h[7] // right padding of row 0: 8 copies of its last value
+ dup v3.8h, v2.h[7] // right padding of row 1
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8b}, [x1], x2 // row 0, only 8 pixels are read
+ ld1 {v2.8b}, [x10], x2 // row 1
+ uaddlp v0.4h, v0.8b // horizontal pair sums (4 values)
+ uaddlp v2.4h, v2.8b
+ shl v0.4h, v0.4h, #2 // 4 * (pair sum)
+ shl v2.4h, v2.4h, #2
+ dup v1.8h, v0.h[3] // right padding of row 0: copies of its last value
+ dup v3.8h, v2.h[3] // right padding of row 1
+ trn1 v0.2d, v0.2d, v1.2d // v0 = 4 computed values followed by 4 padded values
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0/v1 = v2/v3 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl): // entries are (table address - handler address), indexed by clz(width) - 27
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0 // filler entry; no index computed above selects it for widths 4-16
+
+L(ipred_cfl_ac_422_w16_tbl): // indexed by w_pad
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void ipred_cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1 // Builds the ac buffer without subsampling: ac = pixel << 3, padded right/bottom, then the rounded mean is subtracted
+ clz w8, w5 // clz(width)
+ lsl w4, w4, #2 // h_pad *= 4 (presumably passed in units of 4 rows - confirm against the caller)
+ adr x7, L(ipred_cfl_ac_444_tbl) // x7 = jump table base
+ sub w8, w8, #26 // table index: 0 for width 32 ... 3 for width 4
+ ldrh w8, [x7, w8, uxtw #1] // 16-bit entry = table base - handler address
+ movi v16.8h, #0 // v16-v19 accumulate the sum of all stored values
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw // x7 = handler address
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2 // x10 = ypx + stride (odd input rows)
+ dup v31.4s, w9
+ lsl x2, x2, #1 // stride *= 2: x1 walks even input rows, x10 odd ones
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.s}[0], [x1], x2 // row 0, 4 pixels
+ ld1 {v0.s}[1], [x10], x2 // row 1
+ ld1 {v1.s}[0], [x1], x2 // row 2
+ ld1 {v1.s}[1], [x10], x2 // row 3
+ ushll v0.8h, v0.8b, #3 // widen to 16 bit and << 3
+ ushll v1.8h, v1.8b, #3
+ subs w8, w8, #4 // four rows per iteration
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d // v0 = last row in both halves
+ trn2 v1.2d, v1.2d, v1.2d // v1 = same
+ b L(ipred_cfl_ac_420_w4_hpad) // label lives in ipred_cfl_ac_420_8bpc_neon
+
+L(ipred_cfl_ac_444_w8):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.8b}, [x1], x2 // row 0, 8 pixels
+ ld1 {v1.8b}, [x10], x2 // row 1
+ ld1 {v2.8b}, [x1], x2 // row 2
+ ushll v0.8h, v0.8b, #3 // widen to 16 bit and << 3
+ ld1 {v3.8b}, [x10], x2 // row 3
+ ushll v1.8h, v1.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v3.8h, v3.8b, #3
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b // v0 = v1 = last row, for vertical padding
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad) // w_pad != 0: use the padded variant
+1: // Copy and expand input, without padding
+ ld1 {v0.16b}, [x1], x2 // row 0, 16 pixels
+ ld1 {v2.16b}, [x10], x2 // row 1
+ ld1 {v4.16b}, [x1], x2 // row 2
+ ushll2 v1.8h, v0.16b, #3 // upper 8 pixels first, since the next line overwrites v0
+ ushll v0.8h, v0.8b, #3 // lower 8 pixels
+ ld1 {v6.16b}, [x10], x2 // row 3
+ ushll2 v3.8h, v2.16b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll2 v5.8h, v4.16b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll2 v7.8h, v6.16b, #3
+ ushll v6.8h, v6.8b, #3
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // rows 0 and 1
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // rows 2 and 3
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b // v0/v1 = v2/v3 = last row, for vertical padding
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad) // label lives in ipred_cfl_ac_420_8bpc_neon
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8b}, [x1], x2 // row 0, only 8 pixels are read
+ ld1 {v2.8b}, [x10], x2 // row 1
+ ld1 {v4.8b}, [x1], x2 // row 2
+ ld1 {v6.8b}, [x10], x2 // row 3
+ ushll v0.8h, v0.8b, #3 // widen to 16 bit and << 3
+ ushll v2.8h, v2.8b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll v6.8h, v6.8b, #3
+ dup v1.8h, v0.h[7] // right padding of each row: 8 copies of its last value
+ dup v3.8h, v2.h[7]
+ dup v5.8h, v4.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // rows 0 and 1
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // rows 2 and 3
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b // v0/v1 = v2/v3 = last row, for vertical padding
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_444_w32_tbl) // second-level table with handlers for w_pad 0/2/4/6
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ sub x7, x7, w3, uxtw // x7 = handler for this w_pad
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, without padding
+ ld1 {v2.16b, v3.16b}, [x1], x2 // row 0, 32 pixels
+ ld1 {v6.16b, v7.16b}, [x10], x2 // row 1
+ ushll v0.8h, v2.8b, #3 // widen to 16 bit and << 3; v2/v3 are expanded in place last
+ ushll2 v1.8h, v2.16b, #3
+ ushll v2.8h, v3.8b, #3
+ ushll2 v3.8h, v3.16b, #3
+ ushll v4.8h, v6.8b, #3
+ ushll2 v5.8h, v6.16b, #3
+ ushll v6.8h, v7.8b, #3
+ ushll2 v7.8h, v7.16b, #3
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // row 0
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // row 1
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad) // v4-v7 hold the last row for vertical padding
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 8
+ ldr d2, [x1, #16] // row 0, pixels 16-23 (read before x1 is post-incremented)
+ ld1 {v1.16b}, [x1], x2 // row 0, pixels 0-15
+ ldr d6, [x10, #16] // row 1, pixels 16-23
+ ld1 {v5.16b}, [x10], x2 // row 1, pixels 0-15
+ ushll v2.8h, v2.8b, #3 // widen to 16 bit and << 3
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v6.8h, v6.8b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v3.8h, v2.h[7] // right padding of row 0: 8 copies of its last value
+ dup v7.8h, v6.h[7] // right padding of row 1
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // row 0
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // row 1
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad) // v4-v7 hold the last row for vertical padding
+
+L(ipred_cfl_ac_444_w32_wpad4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 16
+ ld1 {v1.16b}, [x1], x2 // row 0, only 16 pixels are read
+ ld1 {v5.16b}, [x10], x2 // row 1
+ ushll v0.8h, v1.8b, #3 // widen to 16 bit and << 3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v2.8h, v1.h[7] // right padding of row 0: 16 copies of its last value
+ dup v3.8h, v1.h[7]
+ dup v6.8h, v5.h[7] // right padding of row 1
+ dup v7.8h, v5.h[7]
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // row 0
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // row 1
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad) // v4-v7 hold the last row for vertical padding
+
+L(ipred_cfl_ac_444_w32_wpad6):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 24
+ ld1 {v0.8b}, [x1], x2 // row 0, only 8 pixels are read
+ ld1 {v4.8b}, [x10], x2 // row 1
+ ushll v0.8h, v0.8b, #3 // widen to 16 bit and << 3
+ ushll v4.8h, v4.8b, #3
+ dup v1.8h, v0.h[7] // right padding of row 0: 24 copies of its last value
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ dup v5.8h, v4.h[7] // right padding of row 1
+ dup v6.8h, v4.h[7]
+ dup v7.8h, v4.h[7]
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // row 0
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // row 1
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad): // expects v4-v7 = last row written by the loops above
+ cbz w4, 3f // no vertical padding requested
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2 // two padded rows of 32 per iteration
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w8 subtracting
+ lsl w6, w6, #2
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w8_calc_subtract_dc.
+ uaddlp v0.4s, v16.8h // widen each accumulator to 32 bit before adding them together
+ uaddlp v1.4s, v17.8h
+ uaddlp v2.4s, v18.8h
+ uaddlp v3.4s, v19.8h
+ add v0.4s, v0.4s, v1.4s
+ add v2.4s, v2.4s, v3.4s
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4 // rewind ac by (4 * height) * 8 values * 2 bytes
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0] // broadcast dc
+ b L(ipred_cfl_ac_420_w8_subtract_dc) // label lives in ipred_cfl_ac_420_8bpc_neon
+
+L(ipred_cfl_ac_444_tbl): // entries are (table address - handler address), indexed by clz(width) - 26
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl): // read with w_pad as the byte offset, so w_pad 0/2/4/6 select entries 0-3
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S
new file mode 100644
index 0000000000..3f8cff9869
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/ipred16.S
@@ -0,0 +1,5674 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+//
+// Fills the width x height block at dst with the constant
+// (bitdepth_max + 1) >> 1. topleft is not read by this function.
+// Register use: x0 = dst, x1 = stride, w3 = width, w4 = height.
+// bitdepth_max is the ninth argument and is read from [sp].
+function ipred_dc_128_16bpc_neon, export=1
+ ldr w8, [sp]
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ dup v0.8h, w8
+ // Table entries hold (table - label), so subtract to get the target.
+ sub x5, x5, w3, uxtw
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ urshr v0.8h, v0.8h, #1 // (bitdepth_max + 1) >> 1
+ br x5
+ // Every loop below stores four rows per iteration.
+ // NOTE(review): AARCH64_VALID_JUMP_TARGET comes from asm.S; presumably a
+ // landing pad for the indirect branch above - confirm there.
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ // Each row is written as two 64-byte stores; the first one advances the
+ // pointer by 64, so the second only needs to add 2*stride - 64.
+ sub x1, x1, #64
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 160b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Vertical prediction: loads the width pixels that follow topleft once and
+// stores that same row into every one of the height output rows.
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_v_16bpc_neon, export=1
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2 // skip the topleft pixel; x2 = start of the top row
+ sub x5, x5, w3, uxtw
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+ // Each width case loads the top row once, then loops storing four rows
+ // per iteration.
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // The 64-pixel top row occupies v0-v3 (first half) and v4-v7 (second half).
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ // The first store of each row advances by 64 bytes, so the second store
+ // only needs to add 2*stride - 64.
+ sub x1, x1, #64
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Horizontal prediction: every output row is filled with one pixel read
+// from below topleft in memory; row y uses the pixel at topleft[-(1 + y)].
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_h_16bpc_neon, export=1
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ // Point x2 at topleft[-4]; each ld4r reads four pixels and then steps
+ // back by 8 bytes (x7) to the next group of four.
+ sub x2, x2, #8
+ sub x5, x5, w3, uxtw
+ mov x7, #-8
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+ // In every loop, ld4r replicates four left pixels into v0-v3. v3 holds the
+ // pixel at the highest address, so it is stored first (v3, v2, v1, v0 for
+ // the four rows of the iteration).
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.4h}, [x0], x1
+ st1 {v2.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ // For widths above 8, the bytes past the first 16 of a row are written
+ // with offset stores first; the final st1 writes the first 16 bytes and
+ // advances the row pointer.
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ stp q3, q3, [x0, #64]
+ stp q2, q2, [x6, #64]
+ stp q3, q3, [x0, #96]
+ stp q2, q2, [x6, #96]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ stp q1, q1, [x0, #64]
+ stp q0, q0, [x6, #64]
+ stp q1, q1, [x0, #96]
+ stp q0, q0, [x6, #96]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Fills the block with the rounded average of the width pixels that follow
+// topleft: (sum + width/2) >> log2(width).
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_dc_top_16bpc_neon, export=1
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2 // skip the topleft pixel; x2 = start of the top row
+ sub x5, x5, w3, uxtw
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+ // Each width case computes the average once, broadcasts it, then loops
+ // storing four rows per iteration.
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h // sum of 4 pixels
+ urshr v0.4h, v0.4h, #2 // rounding shift by log2(4)
+ dup v0.4h, v0.h[0]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h // sum of 8 pixels
+ urshr v0.4h, v0.4h, #3 // rounding shift by log2(8)
+ dup v0.8h, v0.h[0]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h // pairwise add: 16 pixels -> 8 partial sums
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4 // rounding shift by log2(16)
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlv s0, v0.8h // final reduction widens to a 32-bit sum
+ rshrn v4.4h, v0.4s, #5 // rounding shift by log2(32), narrow to 16 bit
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h // final reduction widens to a 32-bit sum
+ rshrn v4.4h, v0.4s, #6 // rounding shift by log2(64), narrow to 16 bit
+ // The first store of each row advances by 64 bytes, so the second store
+ // only needs to add 2*stride - 64.
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Fills the block with the rounded average of the height pixels stored
+// immediately below topleft in memory: (sum + height/2) >> log2(height).
+// Dispatch is two-stage: x5 jumps to a per-height handler (h4..h64) that
+// computes the average into v0, which then jumps through x3 to a per-width
+// handler (w4..w64) that stores it.
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_dc_left_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1 // x2 = topleft - height pixels (start of left column)
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw // x3 = width (store) handler
+ sub x5, x5, w7, uxtw // x5 = height (sum) handler
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2 // rounding shift by log2(4)
+ // Broadcast to a full 128-bit register since the width handler reached
+ // through x3 may store 8 lanes.
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w4):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3 // rounding shift by log2(8)
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w8):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4 // rounding shift by log2(16)
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+ br x3
+L(ipred_dc_left_w16):
+ AARCH64_VALID_JUMP_TARGET
+ // Width handlers only rely on v0; replicate it into the extra registers.
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlp v0.4s, v0.8h // widen to 32-bit partial sums
+ addv s0, v0.4s
+ rshrn v4.4h, v0.4s, #5 // rounding shift by log2(32), narrow to 16 bit
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w32):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h // final reduction widens to a 32-bit sum
+ rshrn v4.4h, v0.4s, #6 // rounding shift by log2(64), narrow to 16 bit
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w64):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ // The first store of each row advances by 64 bytes, so the second store
+ // only needs to add 2*stride - 64.
+ sub x1, x1, #64
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+// Entries 0-4: height handlers, indexed by clz(height) - 25.
+// Entries 5-9: width handlers, indexed by clz(width) - 20.
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Fills the block with the average of the height pixels below topleft in
+// memory (left column) and the width pixels above it (top row):
+// (sum_left + sum_top + ((width + height) >> 1)) / (width + height).
+// The division is a right shift by ctz(width + height); for non-square
+// blocks that is followed by a fixed-point multiply by 1/3 or 1/5.
+// Dispatch is two-stage: x5 jumps to a per-height handler that sums the
+// left column into s0, which then jumps through x3 to a per-width handler
+// that adds the top row, divides and stores.
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_dc_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1 // x2 = topleft - height pixels (start of left column)
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.4s, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw // x3 = width handler
+ sub x5, x5, w6, uxtw // x5 = height handler
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w7 // -ctz(width + height)
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+ // Height handlers: s0 = sum of the left column; x2 is then advanced past
+ // the topleft pixel so it points at the top row.
+ // Width handlers: add the rounding term (v16) and the top-row sum, then
+ // ushl by the negative count in v17, i.e. shift right by
+ // ctz(width + height). When width != height the remaining divisor is 3 or
+ // 5 (for the heights listed in each handler), handled by multiplying with
+ // 0xAAAB (~2^17/3) or 0x6667 (~2^17/5) and shifting right by 17.
+L(ipred_dc_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ add x2, x2, #2
+ br x3
+L(ipred_dc_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ // h == 16: 4 + 16 = 4 * 5 -> 0x6667; h == 8: 4 + 8 = 4 * 3 -> 0xAAAB
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.4h, v0.h[0]
+2:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ add x2, x2, #2
+ br x3
+L(ipred_dc_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ // h == 32: 8 + 32 = 8 * 5 -> 0x6667; h == 4 or 16: divisor 3 -> 0xAAAB
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+2:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2], #32
+ addp v0.8h, v0.8h, v1.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ uaddlv s1, v1.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32/64
+ // The test is zero (eq) for h == 4 or 64, where the divisor is 5 -> 0x6667;
+ // it is non-zero for h == 8 or 32, where the divisor is 3 -> 0xAAAB.
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ addp v3.8h, v3.8h, v4.8h
+ addp v1.8h, v1.8h, v3.8h
+ uaddlv s1, v1.8h
+ cmp w4, #32
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16/64
+ // h == 8: 32 + 8 = 8 * 5 -> 0x6667; h == 16 or 64: divisor 3 -> 0xAAAB
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
+ addp v3.8h, v3.8h, v4.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ addp v1.8h, v1.8h, v3.8h
+ addp v20.8h, v20.8h, v22.8h
+ addp v1.8h, v1.8h, v20.8h
+ uaddlv s1, v1.8h
+ cmp w4, #64
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 16/32
+ // h == 16: 64 + 16 = 16 * 5 -> 0x6667; h == 32: 64 + 32 = 32 * 3 -> 0xAAAB
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ // The first store of each row advances by 64 bytes, so the second store
+ // only needs to add 2*stride - 64.
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+// Entries 0-4: height handlers, indexed by clz(height) - 25.
+// Entries 5-9: width handlers, indexed by clz(width) - 20.
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Paeth prediction: for each output pixel, base = left + top - topleft, and
+// the output is whichever of left, top and topleft is closest to base (left
+// wins ties, then top).
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+// v4 = topleft (broadcast), v5 = top pixels, v0-v3 = left pixels (broadcast).
+function ipred_paeth_16bpc_neon, export=1
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x2]
+ add x8, x2, #2 // x8 = start of the top row
+ // x2 = topleft - 4 pixels; each ld4r reads four left pixels and then
+ // steps back by 8 bytes (x7).
+ sub x2, x2, #8
+ sub x5, x5, w9, uxtw
+ mov x7, #-8
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ // The four top pixels are replicated into both 64-bit halves so that one
+ // 128-bit register processes two rows at once.
+ ld1r {v5.2d}, [x8]
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
+ // Pack two rows of left pixels per register; the high half holds the
+ // pixel from the higher address, i.e. the earlier row.
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v2.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v4.8h, v16.8h // tldiff
+ sabd v23.8h, v4.8h, v17.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v2.8h, v17.8h
+ umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
+ umin v19.8h, v21.8h, v23.8h
+ cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v23.8h, v21.8h
+ cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v19.8h, v17.8h
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ // Rows are emitted high half first, matching the packing above.
+ st1 {v21.d}[1], [x0], x1
+ st1 {v21.d}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1
+ st1 {v20.d}[0], [x6], x1
+ b.gt 4b
+ ret
+ // Widths 8 and up share one path: the inner loop (2:) walks a group of
+ // four rows in 8-pixel columns, the outer loop (1:) moves to the next
+ // four rows.
+80:
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v5.8h}, [x8], #16
+ mov w9, w3 // keep the full width for restarting each row group
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ // x1 = 4*stride minus the bytes already advanced across one row group.
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+2:
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v1.8h
+ add v18.8h, v6.8h, v2.8h
+ add v19.8h, v6.8h, v3.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v5.8h, v18.8h
+ sabd v23.8h, v5.8h, v19.8h
+ sabd v24.8h, v4.8h, v16.8h // tldiff
+ sabd v25.8h, v4.8h, v17.8h
+ sabd v26.8h, v4.8h, v18.8h
+ sabd v27.8h, v4.8h, v19.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v1.8h, v17.8h
+ sabd v18.8h, v2.8h, v18.8h
+ sabd v19.8h, v3.8h, v19.8h
+ umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
+ umin v29.8h, v21.8h, v25.8h
+ umin v30.8h, v22.8h, v26.8h
+ umin v31.8h, v23.8h, v27.8h
+ cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v25.8h, v21.8h
+ cmge v22.8h, v26.8h, v22.8h
+ cmge v23.8h, v27.8h, v23.8h
+ cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v29.8h, v17.8h
+ cmge v18.8h, v30.8h, v18.8h
+ cmge v19.8h, v31.8h, v19.8h
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ // v3 holds the left pixel from the highest address, so its result (v23)
+ // goes to the first row of the group.
+ st1 {v23.8h}, [x0], #16
+ st1 {v22.8h}, [x6], #16
+ subs w3, w3, #8
+ st1 {v21.8h}, [x5], #16
+ st1 {v20.8h}, [x10], #16
+ b.le 8f
+ ld1 {v5.8h}, [x8], #16
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.8h}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_paeth_tbl):
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Smooth prediction: each output pixel is
+// ((bottom + right) * 256 + (left - right) * weights_hor[x]
+// + (top - bottom) * weights_ver[y] + 256) >> 9
+// where bottom is the pixel at topleft[-height], right is the last pixel of
+// the top row, and the byte weights are read from sm_weights + width
+// (horizontal) and sm_weights + height (vertical).
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_smooth_16bpc_neon, export=1
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw // x11 = weights_ver
+ add x10, x10, w3, uxtw // x10 = weights_hor
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw #1 // x12 = &topleft[-height]
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x12] // bottom
+ add x8, x2, #2 // x8 = start of the top row
+ sub x5, x5, w9, uxtw
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ // Top pixels and horizontal weights are replicated into both halves so
+ // that one 128-bit register processes two 4-pixel rows.
+ ld1r {v6.2d}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ // x2 = topleft - 4 pixels; each ld4r reads four left pixels and then
+ // steps back by 8 bytes (x7).
+ sub x2, x2, #8
+ mov x7, #-8
+ dup v5.8h, v6.h[3] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v0.8h, v7.8h
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v18.4h
+ smlal2 v23.4s, v6.8h, v18.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn v21.4h, v21.4s, #9
+ rshrn v22.4h, v22.4s, #9
+ rshrn v23.4h, v23.4s, #9
+ st1 {v20.4h}, [x0], x1
+ st1 {v21.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.4h}, [x0], x1
+ st1 {v23.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8h}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ dup v5.8h, v6.h[7] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
+ smlal v22.4s, v2.4h, v7.4h
+ smlal2 v23.4s, v2.8h, v7.8h
+ smlal v24.4s, v1.4h, v7.4h
+ smlal2 v25.4s, v1.8h, v7.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v17.4h
+ smlal2 v23.4s, v6.8h, v17.8h
+ smlal v24.4s, v6.4h, v18.4h
+ smlal2 v25.4s, v6.8h, v18.8h
+ smlal v26.4s, v6.4h, v19.4h
+ smlal2 v27.4s, v6.8h, v19.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+ // Widths 16 and up share one path: the inner loop (2:) walks a pair of
+ // rows in 16-pixel columns, the outer loop (1:) moves to the next pair.
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x12, x2, w3, uxtw #1 // x12 = &topleft[width], last pixel of the top row
+ // x1 = 2*stride minus the bytes already advanced across one row pair.
+ sub x1, x1, w3, uxtw #1
+ ld1r {v5.8h}, [x12] // right
+ // x2 = topleft - 2 pixels; each ld2r reads two left pixels and then
+ // steps back by 4 bytes (x7).
+ sub x2, x2, #4
+ mov x7, #-4
+ mov w9, w3 // keep the full width for restarting each row pair
+ add v31.4h, v4.4h, v5.4h // bottom+right
+
+1:
+ ld2r {v0.8h, v1.8h}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v2.8h, v3.8h}, [x8], #32 // top
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v24.4s, v0.4h, v6.4h
+ smlal2 v25.4s, v0.8h, v6.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v2.8h, v16.8h
+ smlal v22.4s, v3.4h, v16.4h
+ smlal2 v23.4s, v3.8h, v16.8h
+ smlal v24.4s, v2.4h, v17.4h
+ smlal2 v25.4s, v2.8h, v17.8h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal2 v27.4s, v3.8h, v17.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ // End of a row pair: rewind the top and weights_hor pointers by one full
+ // row and advance the destination pointers to the next two rows.
+ sub x8, x8, w9, uxtw #1
+ sub x10, x10, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_smooth_tbl):
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Vertical smooth prediction: each output pixel is
+// bottom + (((top - bottom) * weights_ver[y] + 128) >> 8)
+// where bottom is the pixel at topleft[-height] and the byte weights are
+// read from sm_weights + height. The multiply/round/shift is done with
+// sqrdmulh against the weights pre-shifted left by 7.
+// Register use: x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
+function ipred_smooth_v_16bpc_neon, export=1
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw // x7 = weights_ver
+ // clz(width) - 25 maps width 64/32/16/8/4 to table index 0..4.
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw #1 // x8 = &topleft[-height]
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x8] // bottom
+ add x2, x2, #2 // x2 = start of the top row
+ sub x5, x5, w9, uxtw
+ // x0 and x6 address alternating rows; x1 becomes 2*stride.
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ // The four top pixels are replicated into both 64-bit halves so that one
+ // 128-bit register processes two rows at once.
+ ld1r {v6.2d}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v18.8h, v18.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v18.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8h}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v17.8h
+ sqrdmulh v22.8h, v6.8h, v18.8h
+ sqrdmulh v23.8h, v6.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+ // Widths 16 and up share one path: the inner loop (2:) walks a group of
+ // four rows in 16-pixel columns, the outer loop (1:) moves to the next
+ // four rows.
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1
+ add x8, x6, x1
+ // x1 = 4*stride minus the bytes already advanced across one row group.
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3 // keep the full width for restarting each row group
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+2:
+ ld1 {v2.8h, v3.8h}, [x2], #32 // top
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v16.8h
+ sqrdmulh v22.8h, v2.8h, v17.8h
+ sqrdmulh v23.8h, v3.8h, v17.8h
+ sqrdmulh v24.8h, v2.8h, v18.8h
+ sqrdmulh v25.8h, v3.8h, v18.8h
+ sqrdmulh v26.8h, v2.8h, v19.8h
+ sqrdmulh v27.8h, v3.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v4.8h
+ add v26.8h, v26.8h, v4.8h
+ add v27.8h, v27.8h, v4.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x8], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ // End of a row group: rewind the top pointer by one full row and advance
+ // the four destination pointers to the next four rows.
+ sub x2, x2, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+// Jump table indexed by clz(width) - 25: widths 64, 32, 16, 8, 4.
+L(ipred_smooth_v_tbl):
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+//
+// Horizontal smooth prediction: every output pixel is
+// right + (((left[y] - right)*weights_hor[x] + 128) >> 8), where right is
+// topleft[width] and the left pixels are read backwards from topleft[-1].
+// Only dst, stride, topleft, width and height (x0-x4) are read.
+function ipred_smooth_h_16bpc_neon, export=1
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw // weights_hor = sm_weights + width
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw #1 // &topleft[width]
+ sub w9, w9, #25 // table index: 0 for width 64, ..., 4 for width 4
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.8h}, [x12] // right
+ sub x5, x5, w9, uxtw // table entries hold (table - label)
+ add x6, x0, x1 // x6 = dst + stride; x0/x6 write alternating rows
+ lsl x1, x1, #1 // stride *= 2
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ // w == 4: the four weights are duplicated into both vector halves,
+ // so one vector holds two output rows.
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #8 // &topleft[-4]
+ mov x7, #-8 // step back four left pixels per iteration
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+4:
+ // v3 holds topleft[-1] (first row), v0 holds topleft[-4] (fourth row).
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v1.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ // w == 8: one vector per row, four rows per iteration.
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #8 // &topleft[-4]
+ mov x7, #-8 // step back four left pixels per iteration
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v3.8h, v3.8h, v5.8h // left-right
+ sub v2.8h, v2.8h, v5.8h
+ sub v1.8h, v1.8h, v5.8h
+ sub v0.8h, v0.8h, v5.8h
+ sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v1.8h, v7.8h
+ sqrdmulh v23.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ sub x2, x2, #8 // &topleft[-4]
+ mov x7, #-8 // step back four left pixels per outer iteration
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1 // 4*stride minus the row bytes written per pass
+ mov w9, w3 // keep the original width for resetting w3
+
+1:
+ // Outer loop: load the left pixels for the next four rows.
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+2:
+ // Inner loop: 16 pixels of each of the four rows per iteration.
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ ushll v6.8h, v7.8b, #7 // weights_hor << 7
+ ushll2 v7.8h, v7.16b, #7
+ sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v2.8h, v6.8h
+ sqrdmulh v23.8h, v2.8h, v7.8h
+ sqrdmulh v24.8h, v1.8h, v6.8h
+ sqrdmulh v25.8h, v1.8h, v7.8h
+ sqrdmulh v26.8h, v0.8h, v6.8h
+ sqrdmulh v27.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ add v24.8h, v24.8h, v5.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v5.8h
+ add v27.8h, v27.8h, v5.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x10], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw // rewind the weights pointer by width bytes
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9 // reset the column counter
+ b 1b
+9:
+ ret
+
+// Jump table indexed by clz(width) - 25; each entry is the distance from
+// the table back to the code label for that width.
+L(ipred_smooth_h_tbl):
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
+// 48 zero bytes followed by 48 0xff bytes. Callers load vectors starting at
+// (padding_mask - 2*n), which yields a mask whose first n 16 bit lanes are
+// zero and whose remaining lanes are all ones; a following "bit" then
+// replaces only the lanes from index n onwards with the padding pixel.
+const padding_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+padding_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
+// const pixel *const in, const int end,
+// const int bitdepth_max);
+//
+// Doubles the resolution of an edge: out[2*i] = in[i+1] and
+// out[2*i+1] = clip((9*(in[i+1] + in[i+2]) - (in[i] + in[i+3]) + 8) >> 4),
+// clipped to [0, bitdepth_max]. Input lanes at index >= end are replaced by
+// in[end] before filtering. hsz (w1) is not read; 16 input pixels are loaded
+// and 32 output pixels are always stored.
+function ipred_z1_upsample_edge_16bpc_neon, export=1
+ dup v30.8h, w4 // bitdepth_max
+ movrel x4, padding_mask
+ ld1 {v0.8h, v1.8h}, [x2] // in[]
+ add x5, x2, w3, uxtw #1 // in[end]
+ sub x4, x4, w3, uxtw #1 // mask with the first "end" lanes cleared
+
+ ld1r {v2.8h}, [x5] // padding
+ ld1 {v3.8h, v4.8h}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v2.16b, v3.16b // padded in[]
+ bit v1.16b, v2.16b, v4.16b
+
+ ext v4.16b, v0.16b, v1.16b, #2 // in[i+1]
+ ext v5.16b, v1.16b, v2.16b, #2
+ ext v6.16b, v0.16b, v1.16b, #4 // in[i+2]
+ ext v7.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6 // in[i+3]
+ ext v17.16b, v1.16b, v2.16b, #6
+
+ add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2]
+ add v19.8h, v5.8h, v7.8h
+ add v20.8h, v0.8h, v16.8h // in[i+0] + in[i+3]
+ add v21.8h, v1.8h, v17.8h
+ umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2])
+ umull2 v23.4s, v18.8h, v31.8h
+ umull v24.4s, v19.4h, v31.4h
+ umull2 v25.4s, v19.8h, v31.8h
+ usubw v22.4s, v22.4s, v20.4h // - (in[i+0] + in[i+3])
+ usubw2 v23.4s, v23.4s, v20.8h
+ usubw v24.4s, v24.4s, v21.4h
+ usubw2 v25.4s, v25.4s, v21.8h
+
+ // Rounding shift by 4; the signed-to-unsigned saturation clamps
+ // negative sums to 0.
+ sqrshrun v16.4h, v22.4s, #4
+ sqrshrun2 v16.8h, v23.4s, #4
+ sqrshrun v17.4h, v24.4s, #4
+ sqrshrun2 v17.8h, v25.4s, #4
+
+ smin v16.8h, v16.8h, v30.8h // clamp to bitdepth_max
+ smin v17.8h, v17.8h, v30.8h
+
+ // Interleave the original in[i+1] pixels with the filtered ones.
+ zip1 v0.8h, v4.8h, v16.8h
+ zip2 v1.8h, v4.8h, v16.8h
+ zip1 v2.8h, v5.8h, v17.8h
+ zip2 v3.8h, v5.8h, v17.8h
+
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+
+ ret
+endfunc
+
+// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
+// const pixel *const in,
+// const int bitdepth_max);
+//
+// Doubles the resolution of an edge: out[2*i] = in[i] and
+// out[2*i+1] = clip((9*(in[i] + in[i+1]) - (in[i-1] + in[i+2]) + 8) >> 4),
+// clipped to [0, bitdepth_max]. in[-1] is taken as in[0], and input lanes at
+// index >= sz are replaced by in[sz] before filtering.
+function ipred_z2_upsample_edge_16bpc_neon, export=1
+ dup v30.8h, w3 // bitdepth_max
+ // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
+ movrel x4, padding_mask
+ ld1 {v0.8h, v1.8h}, [x2] // in[]
+ add x5, x2, w1, uxtw #1 // in[sz]
+ sub x4, x4, w1, uxtw #1 // mask with the first sz lanes cleared
+
+ ld1r {v3.8h}, [x2] // in[0] for padding
+ ld1r {v2.8h}, [x5] // padding
+ ld1 {v4.8h, v5.8h}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v2.16b, v4.16b // padded in[]
+ bit v1.16b, v2.16b, v5.16b
+
+ ext v4.16b, v3.16b, v0.16b, #14 // in[i-1], with in[-1] = in[0]
+ ext v5.16b, v0.16b, v1.16b, #2 // in[i+1]
+ ext v6.16b, v0.16b, v1.16b, #4 // in[i+2]
+
+ add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1]
+ add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2]
+ umull v18.4s, v16.4h, v31.4h // 9*(in[i+0] + in[i+1])
+ umull2 v19.4s, v16.8h, v31.8h
+ usubw v18.4s, v18.4s, v17.4h // - (in[i-1] + in[i+2])
+ usubw2 v19.4s, v19.4s, v17.8h
+
+ // Rounding shift by 4; the signed-to-unsigned saturation clamps
+ // negative sums to 0.
+ sqrshrun v16.4h, v18.4s, #4
+ sqrshrun2 v16.8h, v19.4s, #4
+
+ add x5, x0, #2*16 // &out[16]
+
+ smin v16.8h, v16.8h, v30.8h // clamp to bitdepth_max
+
+ // Interleave the original in[i] pixels with the filtered ones.
+ zip1 v4.8h, v0.8h, v16.8h
+ zip2 v5.8h, v0.8h, v16.8h
+
+ // In case sz=8, output one single pixel in out[16] (the store is
+ // done unconditionally, writing in[sz]).
+ st1 {v2.h}[0], [x5]
+ st1 {v4.8h, v5.8h}, [x0]
+
+ ret
+endfunc
+
+// Three-tap edge filter kernels, one row of four 16 bit entries per
+// strength (1 and 2). ipred_z1_filter_edge_16bpc_neon loads entries [1] and
+// [2] of the selected row and uses entry [1] for both outer taps.
+const edge_filter
+ .short 0, 4, 8, 0
+ .short 0, 5, 6, 0
+// Leaving out the coeffs for strength=3; that case is handled by the
+// five-tap code path, which builds its coefficients with immediates.
+// .short 2, 4, 4, 0
+endconst
+
+// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
+// const pixel *const in, const int end,
+// const int strength);
+//
+// Low-pass filters an edge of sz pixels. For strength 1 and 2 a three-tap
+// kernel from edge_filter is used:
+// out[i] = (k0*in[i] + k1*in[i+1] + k0*in[i+2] + 8) >> 4
+// For strength 3 a five-tap kernel {2, 4, 4, 4, 2} is applied to
+// in[i-1..i+3], with in[-1] taken as in[0]. Input past the last valid
+// pixel is replaced by that pixel. Output is written in whole vectors, so
+// up to 15 pixels past out[sz-1] may be written.
+function ipred_z1_filter_edge_16bpc_neon, export=1
+ cmp w4, #3
+ b.eq L(fivetap) // if (strength == 3) goto fivetap
+
+ movrel x5, edge_filter, -6
+ add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1)
+
+ ld1 {v31.s}[0], [x5] // kernel[1-2]
+
+ ld1 {v0.8h}, [x2], #16
+
+ dup v30.8h, v31.h[0] // outer taps
+ dup v31.8h, v31.h[1] // centre tap
+1:
+ // in[end], is the last valid pixel. We produce 16 pixels out by
+ // using 18 pixels in - the last pixel used is [17] of the ones
+ // read/buffered.
+ cmp w3, #17
+ ld1 {v1.8h, v2.8h}, [x2], #32
+ b.lt 2f
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ mul v16.8h, v0.8h, v30.8h
+ mla v16.8h, v3.8h, v31.8h
+ mla v16.8h, v5.8h, v30.8h
+ mul v17.8h, v1.8h, v30.8h
+ mla v17.8h, v4.8h, v31.8h
+ mla v17.8h, v6.8h, v30.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ urshr v16.8h, v16.8h, #4
+ urshr v17.8h, v17.8h, #4
+ sub w3, w3, #16
+ st1 {v16.8h, v17.8h}, [x0], #32
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
+ movrel x5, padding_mask
+ sub w6, w3, #24
+ sub x5, x5, w3, uxtw #1
+ add x6, x2, w6, sxtw #1
+
+ ld1 {v3.8h, v4.8h}, [x5] // padding_mask
+
+ ld1r {v2.8h}, [x6]
+ bit v0.16b, v2.16b, v3.16b // Pad v0-v1
+ bit v1.16b, v2.16b, v4.16b
+
+ // Filter one block
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ mul v16.8h, v0.8h, v30.8h
+ mla v16.8h, v3.8h, v31.8h
+ mla v16.8h, v5.8h, v30.8h
+ mul v17.8h, v1.8h, v30.8h
+ mla v17.8h, v4.8h, v31.8h
+ mla v17.8h, v6.8h, v30.8h
+ subs w1, w1, #16
+ urshr v16.8h, v16.8h, #4
+ urshr v17.8h, v17.8h, #4
+ st1 {v16.8h, v17.8h}, [x0], #32
+ b.le 9f
+5:
+ // After one block, any remaining output would only be filtering
+ // padding - thus just store the padding.
+ // w1 counts pixels and each store below writes 8 of them (16 bytes),
+ // so step the counter by 8, as the five-tap tail does.
+ subs w1, w1, #8
+ st1 {v2.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+
+L(fivetap):
+ sub x2, x2, #2 // topleft -= 1 pixel
+ movi v29.8h, #2
+ ld1 {v0.8h}, [x2], #16
+ movi v30.8h, #4
+ movi v31.8h, #4
+ ins v0.h[0], v0.h[1] // in[-1] = in[0]
+1:
+ // in[end+1], is the last valid pixel. We produce 16 pixels out by
+ // using 20 pixels in - the last pixel used is [19] of the ones
+ // read/buffered.
+ cmp w3, #18
+ ld1 {v1.8h, v2.8h}, [x2], #32
+ b.lt 2f // if (end + 1 < 19)
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+ ext v18.16b, v0.16b, v1.16b, #8
+ ext v19.16b, v1.16b, v2.16b, #8
+ mul v20.8h, v0.8h, v29.8h
+ mla v20.8h, v3.8h, v30.8h
+ mla v20.8h, v5.8h, v31.8h
+ mla v20.8h, v16.8h, v30.8h
+ mla v20.8h, v18.8h, v29.8h
+ mul v21.8h, v1.8h, v29.8h
+ mla v21.8h, v4.8h, v30.8h
+ mla v21.8h, v6.8h, v31.8h
+ mla v21.8h, v17.8h, v30.8h
+ mla v21.8h, v19.8h, v29.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ urshr v20.8h, v20.8h, #4
+ urshr v21.8h, v21.8h, #4
+ sub w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
+ movrel x5, padding_mask, -2
+ sub w6, w3, #23
+ sub x5, x5, w3, uxtw #1
+ add x6, x2, w6, sxtw #1
+
+ ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask
+
+ ld1r {v28.8h}, [x6]
+ bit v0.16b, v28.16b, v3.16b // Pad v0-v2
+ bit v1.16b, v28.16b, v4.16b
+ bit v2.16b, v28.16b, v5.16b
+4:
+ // Filter one block
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+ ext v18.16b, v0.16b, v1.16b, #8
+ ext v19.16b, v1.16b, v2.16b, #8
+ mul v20.8h, v0.8h, v29.8h
+ mla v20.8h, v3.8h, v30.8h
+ mla v20.8h, v5.8h, v31.8h
+ mla v20.8h, v16.8h, v30.8h
+ mla v20.8h, v18.8h, v29.8h
+ mul v21.8h, v1.8h, v29.8h
+ mla v21.8h, v4.8h, v30.8h
+ mla v21.8h, v6.8h, v31.8h
+ mla v21.8h, v17.8h, v30.8h
+ mla v21.8h, v19.8h, v29.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ mov v1.16b, v28.16b
+ mov v2.16b, v28.16b
+ urshr v20.8h, v20.8h, #4
+ urshr v21.8h, v21.8h, #4
+ sub w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ b.le 9f
+ // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
+ // filter properly once more - aka (w3 >= 0).
+ cmp w3, #0
+ b.ge 4b
+5:
+ // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
+ // last valid pixel - thus just output that without filtering.
+ subs w1, w1, #8
+ st1 {v28.8h}, [x0], #16
+ b.gt 5b
+9:
+ ret
+endfunc
+
+// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
+// const int n);
+//
+// Fills out[] with px, eight pixels per store, until at least n pixels
+// have been written (at least one store is always done).
+function ipred_pixel_set_16bpc_neon, export=1
+ dup v0.8h, w1 // px in every lane
+2:
+ st1 {v0.8h}, [x0], #16 // the store leaves the flags untouched
+ subs w2, w2, #8 // n -= 8
+ b.gt 2b
+ ret
+endfunc
+
+// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const int width, const int height,
+// const int dx, const int max_base_x);
+//
+// Directional prediction from the top edge. xpos starts at dx and grows by
+// dx per row; with base = xpos >> 6 and frac = xpos & 0x3e each pixel is
+// dst[x] = (top[base+x]*(64-frac) + top[base+x+1]*frac + 32) >> 6
+// Two rows are produced per iteration. As soon as the first row of a pair
+// has base >= max_base_x, that row and all remaining rows are filled with
+// top[max_base_x].
+function ipred_z1_fill1_16bpc_neon, export=1
+ clz w9, w3
+ adr x8, L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25 // table index: 0 for width 64, ..., 4 for width 4
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw #1 // top[max_base_x]
+ sub x8, x8, w9, uxtw // table entries hold (table - label)
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5 // xpos = dx
+ mov w15, #64
+ br x8
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ lsl w8, w8, #1 // byte offsets
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ ext v1.16b, v0.16b, v0.16b, #2 // top[base+1]
+ ext v3.16b, v2.16b, v2.16b, #2
+ sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // top[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + (top[base+1]-top[base])*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ st1 {v16.4h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.4h}, [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ // Remaining rows are all padding.
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.4h}, [x0], x1
+ b.gt 49b
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h}, [x8] // top[base]
+ ld1 {v2.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ ldr h1, [x8, #16] // ninth pixel, needed for top[base+1]
+ ldr h3, [x10, #16]
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ ext v1.16b, v0.16b, v1.16b, #2 // top[base+1]
+ ext v3.16b, v2.16b, v3.16b, #2
+ umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
+ umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v1.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v3.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v3.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ st1 {v16.8h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8h}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ // Remaining rows are all padding.
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8h}, [x0], x1
+ b.gt 89b
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ mov w12, w3 // keep the original width for resetting w3
+
+ add x13, x0, x1 // second row of each pair
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1 // 2*stride minus the row bytes written per pass
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 169f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v6.8h, w9 // frac
+ dup v7.8h, w11
+ ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base]
+ ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v16.8h, w9 // 64 - frac
+ dup v17.8h, w11
+ add w7, w7, w5 // xpos += dx
+2:
+ // 16 pixels of both rows per iteration.
+ ext v18.16b, v0.16b, v1.16b, #2 // top[base+1]
+ ext v19.16b, v1.16b, v2.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #2
+ ext v21.16b, v4.16b, v5.16b, #2
+ subs w3, w3, #16
+ umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac)
+ umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac
+ umull2 v23.4s, v0.8h, v16.8h
+ umlal2 v23.4s, v18.8h, v6.8h
+ umull v24.4s, v1.4h, v16.4h
+ umlal v24.4s, v19.4h, v6.4h
+ umull2 v25.4s, v1.8h, v16.8h
+ umlal2 v25.4s, v19.8h, v6.8h
+ umull v26.4s, v3.4h, v17.4h
+ umlal v26.4s, v20.4h, v7.4h
+ umull2 v27.4s, v3.8h, v17.8h
+ umlal2 v27.4s, v20.8h, v7.8h
+ umull v28.4s, v4.4h, v17.4h
+ umlal v28.4s, v21.4h, v7.4h
+ umull2 v29.4s, v4.8h, v17.8h
+ umlal2 v29.4s, v21.8h, v7.8h
+ rshrn v22.4h, v22.4s, #6
+ rshrn2 v22.8h, v23.4s, #6
+ rshrn v23.4h, v24.4s, #6
+ rshrn2 v23.8h, v25.4s, #6
+ rshrn v24.4h, v26.4s, #6
+ rshrn2 v24.8h, v27.4s, #6
+ rshrn v25.4h, v28.4s, #6
+ rshrn2 v25.8h, v29.4s, #6
+ st1 {v22.8h, v23.8h}, [x0], #32
+ st1 {v24.8h, v25.8h}, [x13], #32
+ b.le 3f
+ // Carry the last loaded vector over and fetch 16 more pixels.
+ mov v0.16b, v2.16b
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top[base]
+ mov v3.16b, v5.16b
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12 // reset the column counter
+ b 1b
+9:
+ ret
+
+169:
+ // Remaining rows are all padding; 8 pixels of both rows per iteration.
+ st1 {v31.8h}, [x0], #16
+ subs w3, w3, #8
+ st1 {v31.8h}, [x13], #16
+ b.gt 169b
+ subs w4, w4, #2
+ b.le 9b
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12 // reset the column counter
+ b 169b
+
+// Jump table indexed by clz(width) - 25; each entry is the distance from
+// the table back to the code label for that width.
+L(ipred_z1_fill1_tbl):
+ .hword L(ipred_z1_fill1_tbl) - 640b
+ .hword L(ipred_z1_fill1_tbl) - 320b
+ .hword L(ipred_z1_fill1_tbl) - 160b
+ .hword L(ipred_z1_fill1_tbl) - 80b
+ .hword L(ipred_z1_fill1_tbl) - 40b
+endfunc
+
+// Same register arguments as ipred_z1_fill1_16bpc_neon (x0 = dst,
+// x1 = stride, x2 = top, w3 = width, w4 = height, w5 = dx,
+// w6 = max_base_x), but stepping two pixels in top[] per output pixel:
+// dst[x] = (top[base+2*x]*(64-frac) + top[base+2*x+1]*frac + 32) >> 6
+// Only widths 4 and 8 are handled. As soon as the first row of a pair has
+// base >= max_base_x, that row and all remaining rows are filled with
+// top[max_base_x].
+function ipred_z1_fill2_16bpc_neon, export=1
+ cmp w3, #8
+ // Pixels are 16 bit: scale the index by 2 and replicate a halfword.
+ add x10, x2, w6, uxtw #1 // top[max_base_x]
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5 // xpos = dx
+ mov w15, #64
+ b.eq 8f
+
+4: // w == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ lsl w8, w8, #1 // byte offsets
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8h, v0.8h, v0.8h // top[base+1]
+ uzp1 v0.8h, v0.8h, v0.8h // top[base]
+ uzp2 v3.8h, v2.8h, v2.8h
+ uzp1 v2.8h, v2.8h, v2.8h
+ sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // top[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + (top[base+1]-top[base])*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ st1 {v16.4h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.4h}, [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ // Remaining rows are all padding.
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.4h}, [x0], x1
+ b.gt 49b
+ ret
+
+8: // w == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h, v1.8h}, [x8] // top[base]
+ ld1 {v2.8h, v3.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ uzp2 v20.8h, v0.8h, v1.8h // top[base+1]
+ uzp1 v0.8h, v0.8h, v1.8h // top[base]
+ uzp2 v21.8h, v2.8h, v3.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+ umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
+ umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v20.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v21.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v21.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ st1 {v16.8h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8h}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ // Remaining rows are all padding.
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8h}, [x0], x1
+ b.gt 89b
+ ret
+endfunc
+
+// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
+// const int n);
+//
+// Writes the pixels that precede src to dst in reversed order:
+// dst[i] = src[-1 - i]. Works in groups of eight pixels until at least n
+// pixels have been written (at least one group is always processed).
+function ipred_reverse_16bpc_neon, export=1
+ sub x1, x1, #16 // last eight pixels before src
+ mov x3, #-16 // walk the source backwards
+1:
+ ld1 {v0.8h}, [x1], x3
+ subs w2, w2, #8
+ rev64 v0.8h, v0.8h // reverse within each 64 bit half
+ ext v0.16b, v0.16b, v0.16b, #8 // swap the two halves
+ st1 {v0.8h}, [x0], #16
+ b.gt 1b
+ ret
+endfunc
+
+// Lane indices 0..7 as 16 bit values; ipred_z2_fill1_16bpc_neon multiplies
+// them by -dy and adds them to base_x to get per-lane positions.
+const increments
+ .short 0, 1, 2, 3, 4, 5, 6, 7
+endconst
+
+// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const pixel *const left,
+// const int width, const int height,
+// const int dx, const int dy);
+function ipred_z2_fill1_16bpc_neon, export=1
+ clz w10, w4
+ adr x9, L(ipred_z2_fill1_tbl)
+ sub w10, w10, #25
+ ldrh w10, [x9, w10, uxtw #1]
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub x9, x9, w10, uxtw
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+
+ br x9
+40:
+ AARCH64_VALID_JUMP_TARGET
+
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.8h, #0x3e
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ // Worst case height for w=4 is 16, but we need at least h+1 elements
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v30.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+
+ movi v23.4h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.8b, v29.8b, v29.8b // duplicate elements
+ movi v17.8b, #2
+ add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
+ add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)
+
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2
+
+ sub v28.4h, v26.4h, v27.4h // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}
+
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+
+ movi v29.16b, #4
+4:
+ asr w9, w8, #6 // base_x
+ dup v16.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f
+
+ lsl w9, w9, #1
+ lsl w11, w11, #1
+
+ dup v17.4h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ trn1 v16.2d, v16.2d, v17.2d // xpos
+
+ // Cut corners here; only doing tbl over v0-v1 here; we only
+ // seem to need the last pixel, from v2, after skipping to the
+ // left-only codepath below.
+ tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ sshr v20.8h, v16.8h, #6 // first base_x for each row
+
+ ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v17.8h, v26.8h, v16.8h // 64 - frac_x
+
+ add v20.8h, v20.8h, v31.8h // actual base_x
+
+ umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v22.4s, v18.8h, v28.8h
+ umlal2 v22.4s, v19.8h, v27.8h
+
+ umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x)
+ umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v24.4s, v4.8h, v17.8h
+ umlal2 v24.4s, v5.8h, v16.8h
+
+ cmge v20.8h, v20.8h, #0
+
+ rshrn v21.4h, v21.4s, #6
+ rshrn2 v21.8h, v22.4s, #6
+ rshrn v22.4h, v23.4s, #6
+ rshrn2 v22.8h, v24.4s, #6
+
+ bit v21.16b, v22.16b, v20.16b
+
+ st1 {v21.d}[0], [x0], x1
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2
+ st1 {v21.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 4b
+
+49:
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v21.4s, v18.8h, v28.8h
+ umlal2 v21.4s, v19.8h, v27.8h
+
+ rshrn v20.4h, v20.4s, #6
+ rshrn2 v20.8h, v21.4s, #6
+
+ st1 {v20.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v20.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 49b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v18.8h, w7 // -dy
+ add x3, x3, #2 // Skip past left[0]
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.8h, #0x3e
+ add v16.8h, v16.8h, v18.8h // -= dy
+
+ // Worst case height for w=8 is 32.
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
+ ld1r {v15.8h}, [x2] // left[0] == top[0]
+
+ movi v26.8h, #64
+ movi v19.16b, #4
+
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ and v27.16b, v16.16b, v25.16b // frac_y
+
+ movi v23.8h, #1, lsl #8
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ mov v18.16b, v15.16b // left[0]
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ movi v17.16b, #2
+ add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbx v18.16b, {v0.16b}, v29.16b // left[base_y]
+
+ add v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
+ add v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
+
+ sub v28.8h, v26.8h, v27.8h // 64 - frac_y
+
+ movi v24.16b, #4
+8:
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw #1
+ add x11, x2, w11, sxtw #1
+
+ ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
+ mov v19.16b, v15.16b // left[0]
+ ld1 {v6.8h, v7.8h}, [x11]
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+
+ mov v20.16b, v15.16b // left[0]
+
+ sshr v21.8h, v16.8h, #6 // first base_x
+ sshr v22.8h, v17.8h, #6
+
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v7.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+
+ sub v8.8h, v26.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v26.8h, v17.8h
+
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+
+ add v21.8h, v21.8h, v31.8h // actual base_x
+ add v22.8h, v22.8h, v31.8h
+
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
+ umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v5.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v7.4h, v17.4h
+ umull2 v18.4s, v6.8h, v9.8h
+ umlal2 v18.4s, v7.8h, v17.8h
+
+ cmge v21.8h, v21.8h, #0
+ cmge v22.8h, v22.8h, #0
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ bit v10.16b, v12.16b, v21.16b
+ bit v11.16b, v13.16b, v22.16b
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 8b
+
+89:
+ mov v19.16b, v15.16b
+ mov v20.16b, v15.16b
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
+
+ umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v5.4s, v18.8h, v28.8h
+ umlal2 v5.4s, v19.8h, v27.8h
+ umull v6.4s, v19.4h, v28.4h
+ umlal v6.4s, v20.4h, v27.4h
+ umull2 v7.4s, v19.8h, v28.8h
+ umlal2 v7.4s, v20.8h, v27.8h
+
+ rshrn v4.4h, v4.4s, #6
+ rshrn2 v4.8h, v5.4s, #6
+ rshrn v5.4h, v6.4s, #6
+ rshrn2 v5.8h, v7.4s, #6
+
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 89b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v25.8h, w7 // -dy
+ add x3, x3, #2 // Skip past left[0]
+
+ add x13, x0, x1 // alternating row
+ lsl x1, x1, #1 // stride *= 2
+ sub x1, x1, w4, uxtw #1 // stride -= width
+
+ movi v11.8h, #8
+ mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
+ add v26.8h, v26.8h, v25.8h // -= dy
+ mul v25.8h, v25.8h, v11.8h // -8*dy
+
+ // Worst case height is 64, but we can only fit 32 pixels into
+ // v0-v3 usable within one tbx instruction. As long as base_y is
+ // up to 32, we use tbx.
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
+ ld1r {v15.8h}, [x2] // left[0] == top[0]
+
+ mov w12, w4 // orig w
+ neg w14, w4 // -w
+
+1:
+ mov v23.16b, v26.16b // reset ypos
+
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, w14 // base_x <= -2*w
+ asr w11, w8, #6 // base_x
+ b.le 169f
+
+ dup v17.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+
+ add x9, x2, w9, sxtw #1
+ add x11, x2, w11, sxtw #1
+
+ sshr v21.8h, v16.8h, #6 // first base_x
+ sshr v22.8h, v17.8h, #6
+
+ ld1 {v4.8h}, [x9], #16 // top[base_x]
+ ld1 {v6.8h}, [x11], #16
+
+ movi v10.8h, #0x3e
+ movi v11.8h, #64
+
+ and v16.16b, v16.16b, v10.16b // frac_x
+ and v17.16b, v17.16b, v10.16b
+
+ sub v8.8h, v11.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v11.8h, v17.8h
+
+ add v21.8h, v21.8h, v31.8h // actual base_x
+ add v22.8h, v22.8h, v31.8h
+
+2:
+ smov w10, v22.h[0]
+
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ movi v12.8h, #64
+ cmp w10, #0 // base_x (bottom left) >= 0
+ smov w10, v29.b[0] // base_y[0]
+ movi v10.8h, #0x3e
+
+ b.ge 4f
+ and v27.16b, v23.16b, v10.16b // frac_y
+ cmp w10, #(32-3)
+
+ mov v18.16b, v15.16b // left[0]
+ sub v28.8h, v12.8h, v27.8h // 64 - frac_y
+ b.gt 22f
+
+21:
+ // base_y < 32, using tbx
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ movi v11.8h, #1, lsl #8
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
+
+ movi v13.16b, #2
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+
+ add v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
+ mov v19.16b, v15.16b // left[0]
+
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+
+ add v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
+ mov v20.16b, v15.16b // left[0]
+
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+
+ b 23f
+
+22:
+ // base_y >= 32, using separate loads.
+ smov w15, v29.b[1]
+ smov w16, v29.b[2]
+ add x10, x3, w10, sxtw #1
+ smov w17, v29.b[3]
+ add x15, x3, w15, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[0], [x10]
+ smov w10, v29.b[4]
+ add x16, x3, w16, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[1], [x15]
+ smov w15, v29.b[5]
+ add x17, x3, w17, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[2], [x16]
+ smov w16, v29.b[6]
+ add x10, x3, w10, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[3], [x17]
+ smov w17, v29.b[7]
+ add x15, x3, w15, sxtw #1
+ add x16, x3, w16, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[4], [x10]
+ add x17, x3, w17, sxtw #1
+ ld3 {v18.h, v19.h, v20.h}[5], [x15]
+ ld3 {v18.h, v19.h, v20.h}[6], [x16]
+ ld3 {v18.h, v19.h, v20.h}[7], [x17]
+
+23:
+
+ ld1 {v5.8h}, [x9], #16 // top[base_x]
+ ld1 {v7.8h}, [x11], #16
+
+ add v23.8h, v23.8h, v25.8h // ypos -= 8*dy
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #2
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
+ umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v18.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v19.4h, v17.4h
+ umull2 v20.4s, v6.8h, v9.8h
+ umlal2 v20.4s, v19.8h, v17.8h
+
+ cmge v18.8h, v21.8h, #0
+ cmge v19.8h, v22.8h, #0
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v20.4s, #6
+
+ bit v10.16b, v12.16b, v18.16b
+ bit v11.16b, v13.16b, v19.16b
+
+ st1 {v10.8h}, [x0], #16
+ subs w4, w4, #8
+ st1 {v11.8h}, [x13], #16
+ b.le 3f
+
+ movi v10.8h, #8
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ add v21.8h, v21.8h, v10.8h // base_x += 8
+ add v22.8h, v22.8h, v10.8h
+ b 2b
+
+3:
+ subs w5, w5, #2
+ b.le 9f
+ movi v10.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w4, w12 // reset w
+ add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
+ b 1b
+
+4: // The rest of the row only predicted from top[]
+ ld1 {v5.8h}, [x9], #16 // top[base_x]
+ ld1 {v7.8h}, [x11], #16
+
+ ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v19.16b, v6.16b, v7.16b, #2
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
+ umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v18.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v19.4h, v17.4h
+ umull2 v20.4s, v6.8h, v9.8h
+ umlal2 v20.4s, v19.8h, v17.8h
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v20.4s, #6
+
+ st1 {v12.8h}, [x0], #16
+ subs w4, w4, #8
+ st1 {v13.8h}, [x13], #16
+ b.le 3b
+
+ mov v4.16b, v5.16b
+ mov v6.16b, v7.16b
+ b 4b
+
+169: // The rest of the block only predicted from left[]
+ add x1, x1, w4, uxtw #1 // restore stride
+ mov w12, w5 // orig remaining h
+1:
+ movi v12.8h, #64
+ movi v10.8h, #0x3e
+
+ shrn v29.8b, v23.8h, #6 // ypos >> 6
+ and v27.16b, v23.16b, v10.16b // frac_y
+
+ smov w10, v29.b[0] // base_y[0]
+
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ movi v11.8h, #1, lsl #8
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ add v23.8h, v23.8h, v25.8h // ypos -= 8*dy
+ add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
+
+ cmp w10, #(32-1)
+
+ mov v18.16b, v15.16b // left[0]
+ movi v21.16b, #2
+
+ sub v28.8h, v12.8h, v27.8h // 64 - frac_y
+
+ b.gt 31f
+
+ tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
+ add v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
+
+2:
+ // base_y < 32, using tbx.
+ smov w10, v29.b[0] // base_y[0]
+ mov v19.16b, v15.16b // left[0]
+ cmp w10, #(64-4)
+ b.gt 32f
+ tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
+ add v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
+ mov v20.16b, v15.16b // left[0]
+ tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
+ add v29.16b, v29.16b, v21.16b // next base_y
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v11.8h}, [x13], x1
+ b.le 4f
+ mov v18.16b, v20.16b
+ b 2b
+
+31: // base_y >= 32, using separate loads, loading v18 if we had to bail
+ // in the prologue.
+ smov w10, v29.b[0]
+ smov w15, v29.b[2]
+ movi v21.16b, #2
+ smov w16, v29.b[4]
+ add x10, x3, w10, sxtw
+ smov w17, v29.b[6]
+ add x15, x3, w15, sxtw
+ ld1 {v18.h}[0], [x10]
+ smov w10, v29.b[8]
+ add x16, x3, w16, sxtw
+ ld1 {v18.h}[1], [x15]
+ smov w15, v29.b[10]
+ add x17, x3, w17, sxtw
+ ld1 {v18.h}[2], [x16]
+ smov w16, v29.b[12]
+ add x10, x3, w10, sxtw
+ ld1 {v18.h}[3], [x17]
+ smov w17, v29.b[14]
+ add x15, x3, w15, sxtw
+ add x16, x3, w16, sxtw
+ ld1 {v18.h}[4], [x10]
+ add x17, x3, w17, sxtw
+ ld1 {v18.h}[5], [x15]
+ add v29.16b, v29.16b, v21.16b // next base_y
+ ld1 {v18.h}[6], [x16]
+ ld1 {v18.h}[7], [x17]
+
+32: // base_y >= 32, using separate loads.
+ cmp w5, #4
+ b.lt 34f
+33: // h >= 4, preserving v18 from the previous round, loading v19-v22.
+ smov w10, v29.b[0]
+ subs w5, w5, #4
+ smov w15, v29.b[2]
+ movi v10.16b, #8
+ smov w16, v29.b[4]
+ add x10, x3, w10, sxtw
+ smov w17, v29.b[6]
+ add x15, x3, w15, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10]
+ smov w10, v29.b[8]
+ add x16, x3, w16, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15]
+ smov w15, v29.b[10]
+ add x17, x3, w17, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16]
+ smov w16, v29.b[12]
+ add x10, x3, w10, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17]
+ smov w17, v29.b[14]
+ add x15, x3, w15, sxtw
+ add x16, x3, w16, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10]
+ add x17, x3, w17, sxtw
+ ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15]
+ ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16]
+ add v29.16b, v29.16b, v10.16b // next base_y
+ ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17]
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v13.4s, v20.8h, v28.8h
+ umlal2 v13.4s, v21.8h, v27.8h
+ umull v14.4s, v21.4h, v28.4h
+ umlal v14.4s, v22.4h, v27.4h
+ umull2 v18.4s, v21.8h, v28.8h
+ umlal2 v18.4s, v22.8h, v27.8h
+
+ rshrn v12.4h, v12.4s, #6
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ st1 {v10.8h}, [x0], x1
+ cmp w5, #2
+ st1 {v11.8h}, [x13], x1
+ st1 {v12.8h}, [x0], x1
+ st1 {v13.8h}, [x13], x1
+ b.lt 4f
+ mov v18.16b, v22.16b
+ b.gt 33b
+
+34: // h == 2, preserving v18 from the previous round, loading v19-v20.
+ smov w10, v29.b[0]
+ smov w15, v29.b[2]
+ movi v21.16b, #4
+ smov w16, v29.b[4]
+ add x10, x3, w10, sxtw
+ smov w17, v29.b[6]
+ add x15, x3, w15, sxtw
+ ld2 {v19.h, v20.h}[0], [x10]
+ smov w10, v29.b[8]
+ add x16, x3, w16, sxtw
+ ld2 {v19.h, v20.h}[1], [x15]
+ smov w15, v29.b[10]
+ add x17, x3, w17, sxtw
+ ld2 {v19.h, v20.h}[2], [x16]
+ smov w16, v29.b[12]
+ add x10, x3, w10, sxtw
+ ld2 {v19.h, v20.h}[3], [x17]
+ smov w17, v29.b[14]
+ add x15, x3, w15, sxtw
+ add x16, x3, w16, sxtw
+ ld2 {v19.h, v20.h}[4], [x10]
+ add x17, x3, w17, sxtw
+ ld2 {v19.h, v20.h}[5], [x15]
+ ld2 {v19.h, v20.h}[6], [x16]
+ add v29.16b, v29.16b, v21.16b // next base_y
+ ld2 {v19.h, v20.h}[7], [x17]
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ st1 {v10.8h}, [x0], x1
+ st1 {v11.8h}, [x13], x1
+ // The h==2 case only happens once at the end, if at all.
+
+4:
+ subs w4, w4, #8
+ b.le 9f
+
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w5, w12 // reset h
+ b 1b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+
+L(ipred_z2_fill1_tbl):
+ .hword L(ipred_z2_fill1_tbl) - 640b
+ .hword L(ipred_z2_fill1_tbl) - 320b
+ .hword L(ipred_z2_fill1_tbl) - 160b
+ .hword L(ipred_z2_fill1_tbl) - 80b
+ .hword L(ipred_z2_fill1_tbl) - 40b
+endfunc
+// Z2 fill for the upsample_top case: each pixel takes the top[] interpolation where its base_x >= 0 and the left[] interpolation elsewhere; two rows are produced per iteration.
+function ipred_z2_fill2_16bpc_neon, export=1
+ cmp w4, #8 // w4 == 8 selects the 8 pixel wide path (see b.eq below)
+ mov w8, #(2 << 6) // xpos = 2 << 6
+ sub w8, w8, w6 // xpos -= dx
+ // v31 = increments table; scaled by -dy for ypos and reused as base_x offsets
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f // acts on the cmp at the top of the function
+ // Otherwise fall through to the 4 pixel wide path.
+40: // 4 pixels per row; two rows share one vector (low half / high half)
+ dup v30.4h, w7 // -dy
+ movi v17.8b, #1 // constant for base_y = (ypos >> 6) + 1
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.8h, #0x3e // mask that extracts frac from a position
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.8h, v1.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4 // byte offset of two 16 bit elements
+
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v30.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+ // Convert base_y into per-byte tbl indices for 16 bit elements.
+ movi v23.4h, #1, lsl #8 // bytes {0,1} in every 16 bit lane
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.8b, v29.8b, v29.8b // duplicate elements
+ movi v17.8b, #2 // byte offset of one 16 bit element
+ add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
+ add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)
+
+ tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
+
+ trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2
+ // v28 is free again now that its indices were merged into v30.
+ sub v28.4h, v26.4h, v27.4h // 64 - frac_y
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}
+ // Replicate the weights so that both rows in a vector use them.
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+
+ movi v29.16b, #4 // tbl index step applied after every row pair
+ add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6}
+4: // Row pair loop while top[] may still contribute.
+ asr w9, w8, #6 // base_x
+ dup v16.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 49f // even the last column (offset 6) is < 0: left[] only from here on
+
+ lsl w9, w9, #1 // element index -> byte offset
+ lsl w11, w11, #1
+
+ dup v17.4h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ trn1 v16.2d, v16.2d, v17.2d // xpos
+
+ tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ sshr v20.8h, v16.8h, #6 // first base_x for each row
+ // Split even/odd elements; the low half comes from row 0, the high half from row 1.
+ uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1]
+ uzp1 v4.8h, v4.8h, v6.8h // top[base_x]
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ sub v17.8h, v26.8h, v16.8h // 64 - frac_x
+
+ add v20.8h, v20.8h, v31.8h // actual base_x
+
+ umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v22.4s, v18.8h, v28.8h
+ umlal2 v22.4s, v19.8h, v27.8h
+
+ umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x)
+ umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v24.4s, v4.8h, v17.8h
+ umlal2 v24.4s, v5.8h, v16.8h
+
+ cmge v20.8h, v20.8h, #0 // mask of the lanes with base_x >= 0
+
+ rshrn v21.4h, v21.4s, #6 // rounded >> 6: v21 = left result
+ rshrn2 v21.8h, v22.4s, #6
+ rshrn v22.4h, v23.4s, #6 // v22 = top result
+ rshrn2 v22.8h, v24.4s, #6
+
+ bit v21.16b, v22.16b, v20.16b // take the top result where the mask is set
+
+ st1 {v21.d}[0], [x0], x1 // first row of the pair
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2 // two rows done
+ st1 {v21.d}[1], [x0], x1 // second row of the pair
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8 // old left[base_y+2] becomes the next left[base_y]
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 4b
+
+49: // Remaining rows use left[] only.
+ tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
+
+ trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]
+
+ umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v21.4s, v18.8h, v28.8h
+ umlal2 v21.4s, v19.8h, v27.8h
+
+ rshrn v20.4h, v20.4s, #6
+ rshrn2 v20.8h, v21.4s, #6
+
+ st1 {v20.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v20.d}[1], [x0], x1
+ b.le 9f
+
+ ext v18.16b, v19.16b, v19.16b, #8 // old left[base_y+2] becomes the next left[base_y]
+ add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
+ b 49b
+
+9: // Exit of the 4 wide path, which does not touch v8-v15.
+ ret
+
+80: // 8 pixels per row; one vector per row. Uses v8-v14, so d8-d15 are saved.
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v18.8h, w7 // -dy
+ movi v17.8b, #1 // constant for base_y = (ypos >> 6) + 1
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.8h, #0x3e // mask that extracts frac from a position
+ add v16.8h, v16.8h, v18.8h // -= dy
+
+ // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
+ // from left.
+ ld1 {v0.8h, v1.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #4 // byte offset of two 16 bit elements
+
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ and v27.16b, v16.16b, v25.16b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
+ // Convert base_y into per-byte tbl indices for 16 bit elements.
+ movi v23.8h, #1, lsl #8 // bytes {0,1} in every 16 bit lane
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ movi v17.16b, #2 // byte offset of one 16 bit element
+ add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
+
+ // Cut corners here; for the first row we don't expect to need to
+ // read outside of v0.
+ tbl v18.16b, {v0.16b}, v29.16b // left[base_y]
+
+ add v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
+ add v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
+
+ sub v28.8h, v26.8h, v27.8h // 64 - frac_y
+
+ movi v24.16b, #4 // tbl index step applied after every row pair
+ add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14} in the 16 bit lanes
+8: // Row pair loop while top[] may still contribute.
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-16 // base_x <= -16
+ asr w11, w8, #6 // base_x
+ b.le 89f // even the last column (offset 14) is < 0: left[] only from here on
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw #1 // &top[base_x] for the first row
+ add x11, x2, w11, sxtw #1 // and for the second row
+
+ ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
+ ld1 {v6.8h, v7.8h}, [x11]
+
+ tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
+
+ sshr v21.8h, v16.8h, #6 // first base_x
+ sshr v22.8h, v17.8h, #6
+
+ tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
+ // Split even/odd elements; v2/v3 serve as temporaries.
+ uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1]
+ uzp1 v4.8h, v4.8h, v5.8h // top[base_x]
+ uzp2 v3.8h, v6.8h, v7.8h
+ uzp1 v6.8h, v6.8h, v7.8h
+ mov v5.16b, v2.16b
+ mov v7.16b, v3.16b
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+
+ sub v8.8h, v26.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v26.8h, v17.8h
+
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+
+ add v21.8h, v21.8h, v31.8h // actual base_x
+ add v22.8h, v22.8h, v31.8h
+ // Second row from left[]: left[base_y+1] and left[base_y+2].
+ umull v12.4s, v19.4h, v28.4h
+ umlal v12.4s, v20.4h, v27.4h
+ umull2 v13.4s, v19.8h, v28.8h
+ umlal2 v13.4s, v20.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6 // v10/v11 = left results for the two rows
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
+ umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v5.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v7.4h, v17.4h
+ umull2 v18.4s, v6.8h, v9.8h // clobbers v18; it is reloaded from v20 below
+ umlal2 v18.4s, v7.8h, v17.8h
+
+ cmge v21.8h, v21.8h, #0 // masks of the lanes with base_x >= 0
+ cmge v22.8h, v22.8h, #0
+
+ rshrn v12.4h, v12.4s, #6 // v12/v13 = top results for the two rows
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ bit v10.16b, v12.16b, v21.16b // take the top result where the mask is set
+ bit v11.16b, v13.16b, v22.16b
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2 // two rows done
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b // left[base_y+2] becomes the next left[base_y]
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 8b
+
+89: // Remaining rows use left[] only.
+ tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
+ tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
+
+ umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v5.4s, v18.8h, v28.8h
+ umlal2 v5.4s, v19.8h, v27.8h
+ umull v6.4s, v19.4h, v28.4h
+ umlal v6.4s, v20.4h, v27.4h
+ umull2 v7.4s, v19.8h, v28.8h
+ umlal2 v7.4s, v20.8h, v27.8h
+
+ rshrn v4.4h, v4.4s, #6
+ rshrn2 v4.8h, v5.4s, #6
+ rshrn v5.4h, v6.4s, #6
+ rshrn2 v5.8h, v7.4s, #6
+
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x0], x1
+ b.le 9f
+
+ mov v18.16b, v20.16b // left[base_y+2] becomes the next left[base_y]
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
+ b 89b
+
+9: // Exit of the 8 wide path: restore d8-d15 and release the 0x40 byte frame.
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+
+// Z2 fill for the upsample_left case (left[] advances two elements per row).
+// Each pixel takes the top[] interpolation where its base_x >= 0 and the
+// left[] interpolation elsewhere; two rows are produced per iteration.
+// x0 = dst, x1 = stride, x2 = top, x3 = left,
+// w4 = width (8 takes the 80: path, anything else the 4 wide path),
+// w5 = height, w6 = dx, w7 = dy.
+function ipred_z2_fill3_16bpc_neon, export=1
+ cmp w4, #8 // flags are consumed by the b.eq below
+ mov w8, #(1 << 6) // xpos = 1 << 6
+ sub w8, w8, w6 // xpos -= dx
+
+ movrel x11, increments
+ ld1 {v31.8h}, [x11] // increments
+ neg w7, w7 // -dy
+ b.eq 80f
+
+40: // 4 pixels per row; two rows share one vector (low half / high half)
+ dup v30.4h, w7 // -dy
+
+ mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
+ movi v25.8h, #0x3e // mask that extracts frac from a position
+ add v30.4h, v16.4h, v30.4h // -= dy
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+ movi v19.16b, #2
+
+ shrn v29.8b, v30.8h, #6 // ypos >> 6
+ and v27.8b, v30.8b, v25.8b // frac_y
+
+ add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
+
+ // Convert base_y into per-byte tbl indices for 16 bit elements.
+ movi v23.4h, #1, lsl #8 // bytes {0,1} in every 16 bit lane
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ movi v19.16b, #4 // byte offset of two 16 bit elements
+ zip1 v29.8b, v29.8b, v29.8b // duplicate elements
+ movi v17.8b, #2 // byte offset of one 16 bit element
+ add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
+
+ add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
+ add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)
+
+ trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}
+
+ add v24.8b, v30.8b, v19.8b // base_y + 3 (*2)
+
+ trn1 v29.2d, v29.2d, v28.2d // base_y + 0, base_y + 2
+ trn1 v30.2d, v30.2d, v24.2d // base_y + 1, base_y + 3
+
+ sub v28.4h, v26.4h, v27.4h // 64 - frac_y
+
+ // Replicate the weights so that both rows in a vector use them.
+ trn1 v27.2d, v27.2d, v27.2d // frac_y
+ trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y
+
+ movi v24.16b, #8 // tbl index step applied after every row pair
+4: // Row pair loop while top[] may still contribute.
+ asr w9, w8, #6 // base_x
+ dup v16.4h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ cmp w9, #-4 // base_x <= -4
+ asr w11, w8, #6 // base_x
+ b.le 49f // even the last column (offset 3) is < 0: left[] only from here on
+
+ lsl w9, w9, #1 // element index -> byte offset
+ lsl w11, w11, #1
+
+ dup v17.4h, w8 // xpos
+
+ ldr q4, [x2, w9, sxtw] // top[base_x]
+ ldr q6, [x2, w11, sxtw]
+
+ trn1 v16.2d, v16.2d, v17.2d // xpos
+
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ sshr v20.8h, v16.8h, #6 // first base_x for each row
+
+ ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v6.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+
+ // Pack row 0 into the low half and row 1 into the high half.
+ trn1 v4.2d, v4.2d, v6.2d // top[base_x]
+ trn1 v5.2d, v5.2d, v7.2d // top[base_x+1]
+
+ sub v17.8h, v26.8h, v16.8h // 64 - frac_x
+
+ add v20.8h, v20.8h, v31.8h // actual base_x
+
+ umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v22.4s, v18.8h, v28.8h
+ umlal2 v22.4s, v19.8h, v27.8h
+
+ umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x)
+ umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v24.4s, v4.8h, v17.8h // clobbers the step constant in v24
+ umlal2 v24.4s, v5.8h, v16.8h
+
+ cmge v20.8h, v20.8h, #0 // mask of the lanes with base_x >= 0
+
+ rshrn v21.4h, v21.4s, #6 // rounded >> 6: v21 = left result
+ rshrn2 v21.8h, v22.4s, #6
+ rshrn v22.4h, v23.4s, #6 // v22 = top result
+ rshrn2 v22.8h, v24.4s, #6
+
+ movi v24.16b, #8 // restore the step constant
+
+ bit v21.16b, v22.16b, v20.16b // take the top result where the mask is set
+
+ st1 {v21.d}[0], [x0], x1 // first row of the pair
+ sub w8, w8, w6 // xpos -= dx
+ subs w5, w5, #2 // two rows done
+ st1 {v21.d}[1], [x0], x1 // second row of the pair
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
+ b 4b
+
+49: // Remaining rows use left[] only.
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
+
+ umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v21.4s, v18.8h, v28.8h
+ umlal2 v21.4s, v19.8h, v27.8h
+
+ rshrn v20.4h, v20.4s, #6
+ rshrn2 v20.8h, v21.4s, #6
+
+ st1 {v20.d}[0], [x0], x1
+ subs w5, w5, #2
+ st1 {v20.d}[1], [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
+ add v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
+ b 49b
+
+9: // Exit of the 4 wide path, which does not touch v8-v15.
+ ret
+
+80: // 8 pixels per row; one vector per row. Uses v8-v14, so d8-d15 are saved.
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ dup v18.8h, w7 // -dy
+ movi v17.16b, #2 // +2 for base_y, later the byte offset of one element
+
+ mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
+ movi v25.8h, #0x3e // mask that extracts frac from a position
+ add v16.8h, v16.8h, v18.8h // -= dy
+
+ // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]
+
+ movi v26.8h, #64
+
+ shrn v29.8b, v16.8h, #6 // ypos >> 6
+ and v27.16b, v16.16b, v25.16b // frac_y
+
+ add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2
+
+ // Convert base_y into per-byte tbl indices for 16 bit elements.
+ // v18 needs no initial value here: every row pair reloads it with tbl.
+ movi v23.8h, #1, lsl #8 // bytes {0,1} in every 16 bit lane
+ shl v29.8b, v29.8b, #1 // 2*base_y
+ zip1 v29.16b, v29.16b, v29.16b // duplicate elements
+ add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
+
+ add v30.16b, v29.16b, v17.16b // base_y + 1 (*2)
+
+ sub v28.8h, v26.8h, v27.8h // 64 - frac_y
+
+ movi v24.16b, #4 // tbl index step for one row (two elements)
+8: // Row pair loop while top[] may still contribute.
+ asr w9, w8, #6 // base_x
+ dup v16.8h, w8 // xpos
+ sub w8, w8, w6 // xpos -= dx
+ // The column offsets in v31 are {0..7} on this path, so once
+ // base_x <= -8 no pixel of this or any later row reads top[].
+ cmp w9, #-8 // base_x <= -8
+ asr w11, w8, #6 // base_x
+ b.le 89f
+
+ dup v17.8h, w8 // xpos
+
+ add x9, x2, w9, sxtw #1 // &top[base_x] for the first row
+ add x11, x2, w11, sxtw #1 // and for the second row
+
+ ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
+ ld1 {v6.8h, v7.8h}, [x11]
+
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
+ add v30.16b, v30.16b, v24.16b
+
+ sshr v22.8h, v16.8h, #6 // first base_x
+ tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
+ sshr v23.8h, v17.8h, #6
+ tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
+
+ ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1]
+ ext v7.16b, v6.16b, v7.16b, #2
+
+ and v16.16b, v16.16b, v25.16b // frac_x
+ and v17.16b, v17.16b, v25.16b
+
+ umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+
+ sub v8.8h, v26.8h, v16.8h // 64 - frac_x
+ sub v9.8h, v26.8h, v17.8h
+
+ umull2 v11.4s, v18.8h, v28.8h
+ umlal2 v11.4s, v19.8h, v27.8h
+
+ add v22.8h, v22.8h, v31.8h // actual base_x
+ add v23.8h, v23.8h, v31.8h
+
+ // Second row from left[]: left[base_y+2] and left[base_y+3].
+ umull v12.4s, v20.4h, v28.4h
+ umlal v12.4s, v21.4h, v27.4h
+ umull2 v13.4s, v20.8h, v28.8h
+ umlal2 v13.4s, v21.8h, v27.8h
+
+ rshrn v10.4h, v10.4s, #6 // v10/v11 = left results for the two rows
+ rshrn2 v10.8h, v11.4s, #6
+ rshrn v11.4h, v12.4s, #6
+ rshrn2 v11.8h, v13.4s, #6
+
+ umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x)
+ umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
+ umull2 v13.4s, v4.8h, v8.8h
+ umlal2 v13.4s, v5.8h, v16.8h
+ umull v14.4s, v6.4h, v9.4h
+ umlal v14.4s, v7.4h, v17.4h
+ umull2 v18.4s, v6.8h, v9.8h // v18 is free: reloaded by tbl next round
+ umlal2 v18.4s, v7.8h, v17.8h
+
+ cmge v22.8h, v22.8h, #0 // masks of the lanes with base_x >= 0
+ cmge v23.8h, v23.8h, #0
+
+ rshrn v12.4h, v12.4s, #6 // v12/v13 = top results for the two rows
+ rshrn2 v12.8h, v13.4s, #6
+ rshrn v13.4h, v14.4s, #6
+ rshrn2 v13.8h, v18.4s, #6
+
+ bit v10.16b, v12.16b, v22.16b // take the top result where the mask is set
+ bit v11.16b, v13.16b, v23.16b
+
+ st1 {v10.8h}, [x0], x1
+ subs w5, w5, #2 // two rows done
+ sub w8, w8, w6 // xpos -= dx
+ st1 {v11.8h}, [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b
+ b 8b
+
+89: // Remaining rows use left[] only.
+ tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
+ add v30.16b, v30.16b, v24.16b
+ tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
+ tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
+
+ umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
+ umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
+ umull2 v5.4s, v18.8h, v28.8h
+ umlal2 v5.4s, v19.8h, v27.8h
+ umull v6.4s, v20.4h, v28.4h
+ umlal v6.4s, v21.4h, v27.4h
+ umull2 v7.4s, v20.8h, v28.8h
+ umlal2 v7.4s, v21.8h, v27.8h
+
+ rshrn v4.4h, v4.4s, #6
+ rshrn2 v4.8h, v5.4s, #6
+ rshrn v5.4h, v6.4s, #6
+ rshrn2 v5.8h, v7.4s, #6
+
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x0], x1
+ b.le 9f
+
+ add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
+ add v30.16b, v30.16b, v24.16b
+ b 89b
+
+9: // Exit of the 8 wide path: restore d8-d15 and release the 0x40 byte frame.
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+
+// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const left,
+// const int width, const int height,
+// const int dy, const int max_base_y);
+function ipred_z3_fill1_16bpc_neon, export=1
+ clz w9, w4 // dispatch on height: clz(h) - 25 indexes the table (64 -> 0 ... 4 -> 4)
+ adr x8, L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1] // entry = distance from the table back to the target label
+ add x10, x2, w6, uxtw #1 // left[max_base_y]
+ sub x8, x8, w9, uxtw // address of the selected path
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5 // ypos = dy
+ mov w15, #64
+ add x13, x0, x1 // x13 = second row; x0 and x13 alternate rows
+ lsl x1, x1, #1 // x1 = 2*stride from here on
+ br x8
+
+40: // height 4: two columns per iteration, written as 2 pixel pairs down the rows
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon // tail branch: the remaining w3 columns are padding
+ lsl w8, w8, #1 // element index -> byte offset
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // left[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ext v1.16b, v0.16b, v0.16b, #2 // left[base+1]
+ ext v3.16b, v2.16b, v2.16b, #2
+ sub v6.4h, v1.4h, v0.4h // left[base+1]-left[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // left[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + (left[base+1]-left[base])*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ subs w3, w3, #2 // two columns done
+ zip1 v18.8h, v16.8h, v17.8h // interleave the two columns: one 32 bit pair per row
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[2], [x0]
+ st1 {v18.s}[3], [x13]
+ b.le 9f
+ sub x0, x0, x1 // undo the single 2*stride advance
+ sub x13, x13, x1
+ add x0, x0, #4 // move two pixels to the right
+ add x13, x13, #4
+ b 4b
+9:
+ ret
+
+80: // height 8: two columns per iteration
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon // tail branch: the remaining w3 columns are padding
+ add x8, x2, w8, uxtw #1 // &left[base] for the first column
+ add x10, x2, w10, uxtw #1 // and for the second column
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h}, [x8] // left[base]
+ ld1 {v2.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ ldr h1, [x8, #16] // ninth element, needed for left[base+1]
+ ldr h3, [x10, #16]
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ ext v1.16b, v0.16b, v1.16b, #2 // left[base+1]
+ ext v3.16b, v2.16b, v3.16b, #2
+ umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
+ umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v1.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v3.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v3.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ subs w3, w3, #2 // two columns done
+ zip1 v18.8h, v16.8h, v17.8h // interleave the two columns: one 32 bit pair per row
+ zip2 v19.8h, v16.8h, v17.8h
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ b.le 9f
+ sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride)
+ sub x13, x13, x1, lsl #2
+ add x0, x0, #4 // move two pixels to the right
+ add x13, x13, #4
+ b 8b
+9:
+ ret
+
+160: // heights 16, 32 and 64 share one path: 16 rows per inner iteration
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ mov w12, w4 // keep the height; w4 counts rows in the inner loop
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon // tail branch: the remaining w3 columns are padding
+ add x8, x2, w8, uxtw #1 // &left[base] for the first column
+ add x10, x2, w10, uxtw #1 // and for the second column
+ dup v6.8h, w9 // frac
+ dup v7.8h, w11
+ ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base]
+ ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v16.8h, w9 // 64 - frac
+ dup v17.8h, w11
+ add w7, w7, w5 // ypos += dy
+2:
+ ext v18.16b, v0.16b, v1.16b, #2 // left[base+1]
+ ext v19.16b, v1.16b, v2.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #2
+ ext v21.16b, v4.16b, v5.16b, #2
+ subs w4, w4, #16 // 16 rows per iteration; flags are used by the b.le 3f below
+ umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac)
+ umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac
+ umull2 v23.4s, v0.8h, v16.8h
+ umlal2 v23.4s, v18.8h, v6.8h
+ umull v24.4s, v1.4h, v16.4h
+ umlal v24.4s, v19.4h, v6.4h
+ umull2 v25.4s, v1.8h, v16.8h
+ umlal2 v25.4s, v19.8h, v6.8h
+ umull v26.4s, v3.4h, v17.4h
+ umlal v26.4s, v20.4h, v7.4h
+ umull2 v27.4s, v3.8h, v17.8h
+ umlal2 v27.4s, v20.8h, v7.8h
+ umull v28.4s, v4.4h, v17.4h
+ umlal v28.4s, v21.4h, v7.4h
+ umull2 v29.4s, v4.8h, v17.8h
+ umlal2 v29.4s, v21.8h, v7.8h
+ rshrn v22.4h, v22.4s, #6
+ rshrn2 v22.8h, v23.4s, #6
+ rshrn v23.4h, v24.4s, #6
+ rshrn2 v23.8h, v25.4s, #6
+ rshrn v24.4h, v26.4s, #6
+ rshrn2 v24.8h, v27.4s, #6
+ rshrn v25.4h, v28.4s, #6
+ rshrn2 v25.8h, v29.4s, #6
+ zip1 v18.8h, v22.8h, v24.8h // interleave the two columns: one 32 bit pair per row
+ zip2 v19.8h, v22.8h, v24.8h
+ zip1 v20.8h, v23.8h, v25.8h
+ zip2 v21.8h, v23.8h, v25.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x13], x1
+ st1 {v20.s}[2], [x0], x1
+ st1 {v20.s}[3], [x13], x1
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x13], x1
+ st1 {v21.s}[2], [x0], x1
+ st1 {v21.s}[3], [x13], x1
+ b.le 3f
+ mov v0.16b, v2.16b // carry the last loaded vector over
+ ld1 {v1.8h, v2.8h}, [x8], #32 // left[base]
+ mov v3.16b, v5.16b
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ b 2b
+
+3: // one column pair finished: rewind to the top row, step two pixels right
+ subs w3, w3, #2
+ b.le 9f
+ lsr x1, x1, #1 // back to the plain stride for the multiply
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1 // and to 2*stride again
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12 // reset the row counter
+ b 1b
+9:
+ ret
+
+L(ipred_z3_fill1_tbl):
+ .hword L(ipred_z3_fill1_tbl) - 640b
+ .hword L(ipred_z3_fill1_tbl) - 320b
+ .hword L(ipred_z3_fill1_tbl) - 160b
+ .hword L(ipred_z3_fill1_tbl) - 80b
+ .hword L(ipred_z3_fill1_tbl) - 40b
+endfunc
+
+// Shared tail of the z3 fill functions, reached by a plain branch once
+// base >= max_base_y: fills the remaining columns with the padding pixel.
+// x0, x13 = pointers to the first and second row at the current column
+// x1 = 2*stride
+// w3 = remaining width
+// w4 = height
+// v31 = padding pixel
+// Returns directly to the caller of the function that branched here.
+function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #8
+ adr x8, L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+
+1:
+ // Fill a WxH rectangle with padding. W can be any number;
+ // this fills the exact width by filling in the largest
+ // power of two in the remaining width, and repeating.
+ clz w9, w3
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ sub x9, x8, w9, uxtw
+ br x9
+
+2: // 2 pixel wide strip, four rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.s}[0], [x13], x1
+ st1 {v31.s}[0], [x0], x1
+ st1 {v31.s}[0], [x13], x1
+ b.gt 2b
+ subs w3, w3, #2 // flags are consumed by the b.le 9f below
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+
+4: // 4 pixel wide strip, four rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.4h}, [x13], x1
+ st1 {v31.4h}, [x0], x1
+ st1 {v31.4h}, [x13], x1
+ b.gt 4b
+ subs w3, w3, #4 // flags are consumed by the b.le 9f below
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #8
+ add x13, x13, #8
+ mov w4, w12
+ b 1b
+
+8: // 8 pixel wide strip, four rows per iteration
+16:
+32:
+64:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8h}, [x13], x1
+ st1 {v31.8h}, [x0], x1
+ st1 {v31.8h}, [x13], x1
+ // Loop within the 8 pixel strip. Branching to 4b here would still
+ // produce a fully filled block, but only by writing the rows after
+ // the first four as two separate 4 pixel strips.
+ b.gt 8b
+ subs w3, w3, #8 // flags are consumed by the b.le 9f below
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w4, w12
+ b 1b
+
+9:
+ ret
+
+L(ipred_z3_fill_padding_tbl):
+ .hword L(ipred_z3_fill_padding_tbl) - 64b
+ .hword L(ipred_z3_fill_padding_tbl) - 32b
+ .hword L(ipred_z3_fill_padding_tbl) - 16b
+ .hword L(ipred_z3_fill_padding_tbl) - 8b
+ .hword L(ipred_z3_fill_padding_tbl) - 4b
+ .hword L(ipred_z3_fill_padding_tbl) - 2b
+
+L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 8.
+ lsr x1, x1, #1 // back to the plain stride
+ mov w12, w3 // keep the width for every row
+ sub x1, x1, w3, uxtw #1 // x1 = stride - width in bytes: end of row -> next row
+1:
+ ands w5, w3, #7
+ b.eq 2f
+ // If the width isn't aligned to 8, first do one 8 pixel write
+ // and align the start pointer.
+ sub w3, w3, w5
+ st1 {v31.8h}, [x0]
+ add x0, x0, w5, uxtw #1
+2:
+ // Fill the rest of the line with aligned 8 pixel writes.
+ subs w3, w3, #8
+ st1 {v31.8h}, [x0], #16
+ b.gt 2b
+ subs w4, w4, #1 // one row done
+ add x0, x0, x1
+ b.le 9f
+ mov w3, w12
+ b 1b
+9:
+ ret
+endfunc
+
+// Z3 directional prediction for 16 bpc, variant in which consecutive output
+// rows step two edge samples (the loaded edge is split into even samples,
+// used as left[base], and odd samples, used as left[base+1]). Two output
+// columns are produced per loop iteration.
+// Registers as used below: x0 = dst, x1 = stride in bytes, x2 = edge
+// pixels (16 bit each), w3 = width, w4 = height (only 4 and 8 are
+// handled), w5 = per-column position step (6 fractional bits),
+// w6 = max base index, counted in pixels.
+// As soon as base >= w6 the remaining columns are handed over to
+// ipred_z3_fill_padding_neon, which stores v31 as the padding pixel and
+// expects x13 = dst + stride and x1 = 2*stride, as set up here.
+function ipred_z3_fill2_16bpc_neon, export=1
+ cmp w4, #8
+ add x10, x2, w6, uxtw #1 // &left[max_base_y]; pixels are 2 bytes each
+ ld1r {v31.8h}, [x10] // padding = left[max_base_y] in every 16 bit lane
+ mov w7, w5
+ mov w15, #64
+ add x13, x0, x1 // x13 = odd rows
+ lsl x1, x1, #1 // step two rows at a time
+ b.eq 8f
+
+4: // h == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ lsl w8, w8, #1 // byte offsets
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // left[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8h, v0.8h, v0.8h // left[base+1]
+ uzp1 v0.8h, v0.8h, v0.8h // left[base]
+ uzp2 v3.8h, v2.8h, v2.8h
+ uzp1 v2.8h, v2.8h, v2.8h
+ sub v6.4h, v1.4h, v0.4h // left[base+1]-left[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // left[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + (left[base+1]-left[base])*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h // interleave the two columns
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ add w7, w7, w5 // xpos += dx
+ st1 {v18.s}[2], [x0]
+ st1 {v18.s}[3], [x13]
+ b.le 9f
+ sub x0, x0, x1 // ptr -= 2*stride (undo the single post-increment)
+ sub x13, x13, x1
+ add x0, x0, #4 // advance two 16 bit pixels
+ add x13, x13, #4
+ b 4b
+9:
+ ret
+
+8: // h == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h, v1.8h}, [x8] // left[base]
+ ld1 {v2.8h, v3.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ uzp2 v20.8h, v0.8h, v1.8h // left[base+1]
+ uzp1 v0.8h, v0.8h, v1.8h // left[base]
+ uzp2 v21.8h, v2.8h, v3.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+ umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
+ umlal v16.4s, v20.4h, v4.4h // + left[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v20.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v21.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v21.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h // interleave the two columns
+ zip2 v19.8h, v16.8h, v17.8h
+ add w7, w7, w5 // xpos += dx
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ b.le 9f
+ sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride)
+ sub x13, x13, x1, lsl #2
+ add x0, x0, #4 // advance two 16 bit pixels
+ add x13, x13, #4
+ b 8b
+9:
+ ret
+endfunc
+
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+//
+// filter_fn expands to ipred_filter_<bpc>bpc_neon. Output is produced in
+// units of 4x2 pixels; each unit is a weighted sum of 7 neighbours
+// (p0 = topleft, p1-p4 = top, p5-p6 = left) using the taps loaded from
+// filter_intra_taps, and the pixels of one unit serve as top/left input
+// for the units to its right and below.
+// The bpc == 10 variant accumulates in 16 bit lanes (mul/mla) and clamps
+// the low end with smax against zero; the other variant accumulates in
+// 32 bit lanes (smull/smlal) and narrows with sqrshrun. Both clamp the
+// high end with smin against v31.
+// On entry w8 must hold the upper clamp value; it is broadcast into v31.
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6 // tap sets are 64 bytes apart
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+ clz w9, w3
+ adr x5, L(ipred_filter\bpc\()_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6]
+ sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4
+ ldrh w9, [x5, w9, uxtw #1]
+ sxtl v16.8h, v16.8b // widen the signed 8 bit taps to 16 bit
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1 // x6 = second row of each row pair
+ lsl x1, x1, #1 // step two rows at a time
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ dup v31.8h, w8 // upper clamp
+.if \bpc == 10
+ movi v30.8h, #0 // lower clamp
+.endif
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ldur d0, [x2, #2] // top (0-3)
+ sub x2, x2, #4
+ mov x7, #-4 // walk down the left edge, two pixels per row pair
+4:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+.endif
+ smin v2.8h, v2.8h, v31.8h
+ subs w4, w4, #2
+ st1 {v2.d}[0], [x0], x1
+ ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
+ st1 {v2.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ldur q0, [x2, #2] // top (0-7)
+ sub x2, x2, #4
+ mov x7, #-4
+8:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+ // v2 = left 4x2 unit, v3 = right 4x2 unit; the right unit takes its
+ // left neighbours from v2 (h[3], h[7]) and its topleft from top[3].
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+ smin v2.8h, v2.8h, v31.8h
+ smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0)
+ smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6)
+ smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.4h, v4.4s, #4
+ sqrshrun2 v3.8h, v5.4s, #4
+.endif
+ smin v3.8h, v3.8h, v31.8h
+ subs w4, w4, #2
+ st2 {v2.d, v3.d}[0], [x0], x1
+ zip2 v0.2d, v2.2d, v3.2d // second output row becomes top for the next pair
+ st2 {v2.d, v3.d}[1], [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x2, #2 // x8 = top row
+ sub x2, x2, #4
+ mov x7, #-4
+ sub x1, x1, w3, uxtw #1 // stride step minus the bytes advanced within a row
+ mov w9, w3 // keep the width for the next row pair
+
+1:
+ ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2)
+2:
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+ smin v3.8h, v3.8h, v31.8h
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ srshr v4.8h, v4.8h, #4
+ smax v4.8h, v4.8h, v30.8h
+ smin v4.8h, v4.8h, v31.8h
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ srshr v5.8h, v5.8h, #4
+ smax v5.8h, v5.8h, v30.8h
+ smin v5.8h, v5.8h, v31.8h
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ srshr v6.8h, v6.8h, #4
+ smax v6.8h, v6.8h, v30.8h
+.else
+ smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0)
+ smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4)
+ smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.4h, v3.4s, #4
+ sqrshrun2 v3.8h, v4.4s, #4
+ smin v3.8h, v3.8h, v31.8h
+ smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0)
+ smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6)
+ smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.4h, v5.4s, #4
+ sqrshrun2 v4.8h, v6.4s, #4
+ smin v4.8h, v4.8h, v31.8h
+ smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
+ smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
+ smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.4h, v24.4s, #4
+ sqrshrun2 v5.8h, v25.4s, #4
+ smin v5.8h, v5.8h, v31.8h
+ smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
+ smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
+ smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.4h, v26.4s, #4
+ sqrshrun2 v6.8h, v27.4s, #4
+.endif
+ smin v6.8h, v6.8h, v31.8h
+
+ // Carry the neighbours over to the next 16 pixel column group:
+ // topleft = top[15], left[0]/left[1] = last pixel of each output row.
+ ins v0.h[2], v2.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+ ins v0.h[0], v6.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+ ins v0.h[1], v6.h[3]
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw #1 // next top row = start of the second row just written
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9 // restore the width
+ b 1b
+9:
+ ret
+
+// Offsets from the table to the entry labels, indexed by clz(width) - 26.
+L(ipred_filter\bpc\()_tbl):
+ .hword L(ipred_filter\bpc\()_tbl) - 320b
+ .hword L(ipred_filter\bpc\()_tbl) - 160b
+ .hword L(ipred_filter\bpc\()_tbl) - 80b
+ .hword L(ipred_filter\bpc\()_tbl) - 40b
+endfunc
+.endm
+
+// Instantiate ipred_filter_10bpc_neon and ipred_filter_12bpc_neon.
+filter_fn 10
+filter_fn 12
+
+// Exported entry point for the 16 bpc filter predictor. Reads a word from
+// the top of the stack into w8 (bitdepth_max, going by the prototype: it is
+// the ninth argument) and tail-branches to the 12 bpc implementation when
+// that value exceeds 0x3ff, otherwise to the 10 bpc one. w8 stays live for
+// the selected implementation.
+function ipred_filter_16bpc_neon, export=1
+ ldr w8, [sp] // bitdepth_max
+ cmp w8, #0x3ff
+ b.gt ipred_filter_12bpc_neon // larger than the 10 bit maximum
+ b ipred_filter_10bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const pal, const uint8_t *idx,
+// const int w, const int h);
+//
+// Palette prediction: expands packed palette indices into 16 bit pixels.
+// Eight 16 bit palette entries are kept in v30. Every idx byte yields two
+// pixels: first the low bits (masked with 7), then the high nibble (>> 4).
+// Each index i is turned into the byte pair (2*i, 2*i+1) and looked up
+// with tbl. The width selects one of five loops through a jump table.
+function pal_pred_16bpc_neon, export=1
+ ld1 {v30.8h}, [x2] // the palette
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25 // table index: 0 for w == 64 ... 4 for w == 4
+ movi v29.16b, #7 // mask for the low index of each byte
+ ldrh w9, [x6, w9, uxtw #1]
+ movi v31.8h, #1, lsl #8 // +1 in the upper byte of each 16 bit lane
+ sub x6, x6, w9, uxtw
+ br x6
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1 // x2 = odd rows
+ lsl x1, x1, #1
+4: // 8 idx bytes = 4 rows of 4 pixels
+ ld1 {v1.8b}, [x3], #8
+ subs w5, w5, #4
+ ushr v3.8b, v1.8b, #4
+ and v2.8b, v1.8b, v29.8b
+ zip1 v1.16b, v2.16b, v3.16b
+ // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ add v1.16b, v1.16b, v1.16b
+ zip1 v0.16b, v1.16b, v1.16b
+ zip2 v1.16b, v1.16b, v1.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ st1 {v0.d}[0], [x0], x1
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.d}[1], [x2], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x2], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1 // x2 = odd rows
+ lsl x1, x1, #1
+8: // 16 idx bytes = 4 rows of 8 pixels
+ ld1 {v2.16b}, [x3], #16
+ subs w5, w5, #4
+ ushr v4.16b, v2.16b, #4
+ and v3.16b, v2.16b, v29.16b
+ zip1 v2.16b, v3.16b, v4.16b
+ zip2 v3.16b, v3.16b, v4.16b
+ add v2.16b, v2.16b, v2.16b
+ add v3.16b, v3.16b, v3.16b
+ zip1 v0.16b, v2.16b, v2.16b
+ zip2 v1.16b, v2.16b, v2.16b
+ zip1 v2.16b, v3.16b, v3.16b
+ zip2 v3.16b, v3.16b, v3.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.8h}, [x0], x1
+ tbl v2.16b, {v30.16b}, v2.16b
+ st1 {v1.8h}, [x2], x1
+ tbl v3.16b, {v30.16b}, v3.16b
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x2], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1 // x2 = odd rows
+ lsl x1, x1, #1
+16: // 32 idx bytes = 4 rows of 16 pixels
+ ld1 {v4.16b, v5.16b}, [x3], #32
+ subs w5, w5, #4
+ ushr v7.16b, v4.16b, #4
+ and v6.16b, v4.16b, v29.16b
+ ushr v3.16b, v5.16b, #4
+ and v2.16b, v5.16b, v29.16b
+ zip1 v4.16b, v6.16b, v7.16b
+ zip2 v5.16b, v6.16b, v7.16b
+ zip1 v6.16b, v2.16b, v3.16b
+ zip2 v7.16b, v2.16b, v3.16b
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ st1 {v2.8h, v3.8h}, [x2], x1
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h}, [x0], x1
+ st1 {v6.8h, v7.8h}, [x2], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1 // x2 = odd rows
+ lsl x1, x1, #1
+32: // 32 idx bytes = 2 rows of 32 pixels
+ ld1 {v4.16b, v5.16b}, [x3], #32
+ subs w5, w5, #2
+ ushr v7.16b, v4.16b, #4
+ and v6.16b, v4.16b, v29.16b
+ ushr v3.16b, v5.16b, #4
+ and v2.16b, v5.16b, v29.16b
+ zip1 v4.16b, v6.16b, v7.16b
+ zip2 v5.16b, v6.16b, v7.16b
+ zip1 v6.16b, v2.16b, v3.16b
+ zip2 v7.16b, v2.16b, v3.16b
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, #64 // x2 = right half (pixels 32-63) of the same row
+64: // 32 idx bytes = 1 row of 64 pixels
+ ld1 {v4.16b, v5.16b}, [x3], #32
+ subs w5, w5, #1
+ ushr v7.16b, v4.16b, #4
+ and v6.16b, v4.16b, v29.16b
+ ushr v3.16b, v5.16b, #4
+ and v2.16b, v5.16b, v29.16b
+ zip1 v4.16b, v6.16b, v7.16b
+ zip2 v5.16b, v6.16b, v7.16b
+ zip1 v6.16b, v2.16b, v3.16b
+ zip2 v7.16b, v2.16b, v3.16b
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 64b
+ ret
+
+// Offsets from the table to the entry labels, indexed by clz(w) - 25.
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 640b
+ .hword L(pal_pred_tbl) - 320b
+ .hword L(pal_pred_tbl) - 160b
+ .hword L(pal_pred_tbl) - 80b
+ .hword L(pal_pred_tbl) - 40b
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+//
+// CfL prediction with a fixed dc of (bitdepth_max + 1) >> 1.
+// This function also hosts the shared "splat" loops that compute
+// dst = clamp(dc + apply_sign(ac * alpha), 0, bitdepth_max)
+// and that other functions branch into. The splat loops expect:
+// v0 = dc in every lane, v1 = alpha, v30 = 0, v31 = bitdepth_max,
+// x0 = dst, x6 = dst + stride, x1 = 2*stride, x5 = ac,
+// w3 = width, w4 = height.
+function ipred_cfl_128_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_128_tbl)
+ sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4
+ ldrh w9, [x7, w9, uxtw #1]
+ urshr v0.8h, v31.8h, #1 // dc = (bitdepth_max + 1) >> 1
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1 // x6 = odd rows
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+L(ipred_cfl_splat_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x5], #32 // 16 ac values = 4 rows of 4
+ subs w4, w4, #4
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v16.4s, v2.4s, #0 // sign
+ cmlt v17.4s, v3.4s, #0
+ cmlt v18.4s, v4.4s, #0
+ cmlt v19.4s, v5.4s, #0
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x6], x1
+ st1 {v3.d}[0], [x0], x1
+ st1 {v3.d}[1], [x6], x1
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x5], #32 // 16 ac values = 2 rows of 8
+ subs w4, w4, #2
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v16.4s, v2.4s, #0 // sign
+ cmlt v17.4s, v3.4s, #0
+ cmlt v18.4s, v4.4s, #0
+ cmlt v19.4s, v5.4s, #0
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x6], x1
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16):
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x5, w3, uxtw #1 // x7 = second row of ac
+ sub x1, x1, w3, uxtw #1 // stride step minus the bytes advanced within a row
+ mov w9, w3 // keep the width for the next row pair
+1: // 16 pixels of two rows per iteration
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ ld1 {v4.8h, v5.8h}, [x7], #32
+ subs w3, w3, #16
+ smull v16.4s, v2.4h, v1.4h // diff = ac * alpha
+ smull2 v17.4s, v2.8h, v1.8h
+ smull v18.4s, v3.4h, v1.4h
+ smull2 v19.4s, v3.8h, v1.8h
+ smull v2.4s, v4.4h, v1.4h
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v20.4s, v16.4s, #0 // sign
+ cmlt v21.4s, v17.4s, #0
+ cmlt v22.4s, v18.4s, #0
+ cmlt v23.4s, v19.4s, #0
+ cmlt v24.4s, v2.4s, #0
+ cmlt v25.4s, v3.4s, #0
+ cmlt v26.4s, v4.4s, #0
+ cmlt v27.4s, v5.4s, #0
+ add v16.4s, v16.4s, v20.4s // diff + sign
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+ add v19.4s, v19.4s, v23.4s
+ add v2.4s, v2.4s, v24.4s
+ add v3.4s, v3.4s, v25.4s
+ add v4.4s, v4.4s, v26.4s
+ add v5.4s, v5.4s, v27.4s
+ rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ rshrn v6.4h, v2.4s, #6
+ rshrn2 v6.8h, v3.4s, #6
+ rshrn v7.4h, v4.4s, #6
+ rshrn2 v7.8h, v5.4s, #6
+ add v2.8h, v16.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v17.8h, v0.8h
+ add v4.8h, v6.8h, v0.8h
+ add v5.8h, v7.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smax v4.8h, v4.8h, v30.8h
+ smax v5.8h, v5.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], #32
+ st1 {v4.8h, v5.8h}, [x6], #32
+ b.gt 1b
+ subs w4, w4, #2
+ add x5, x5, w9, uxtw #1 // skip the ac row already consumed through x7
+ add x7, x7, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9 // restore the width
+ b.gt 1b
+ ret
+
+// Offsets from the table to the splat loops, indexed by clz(width) - 26;
+// widths 32 and 16 share the same loop.
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+//
+// CfL prediction where dc is the rounded average of the `width` pixels
+// starting at topleft + 1. After computing dc into v0, control continues
+// in the splat loop for that width (L(ipred_cfl_splat_w4/w8/w16)).
+function ipred_cfl_top_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_top_tbl)
+ sub w9, w9, #26 // table index: 0 for width 32 ... 3 for width 4
+ ldrh w9, [x7, w9, uxtw #1]
+ dup v1.8h, w6 // alpha
+ add x2, x2, #2 // step one pixel past topleft
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1 // x6 = odd rows
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2 // (sum + 2) >> 2
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3 // (sum + 4) >> 3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4 // (sum + 8) >> 4
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h // final reduction widened to 32 bit
+ rshrn v0.4h, v0.4s, #5 // (sum + 16) >> 5
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+// Offsets from the table to the entry labels, indexed by clz(width) - 26.
+L(ipred_cfl_top_tbl):
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+//
+// CfL prediction where dc is the rounded average of the `height` pixels
+// that end just before topleft. Two table lookups are done up front:
+// x7 = averaging routine selected by height, x9 = splat loop selected by
+// width (through L(ipred_cfl_splat_tbl)); each averaging routine ends by
+// branching to x9 with dc broadcast in v0.
+function ipred_cfl_left_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1 // x2 = topleft - height pixels
+ clz w9, w3
+ clz w8, w4
+ adr x10, L(ipred_cfl_splat_tbl)
+ adr x7, L(ipred_cfl_left_tbl)
+ sub w9, w9, #26 // index by width
+ sub w8, w8, #26 // index by height
+ ldrh w9, [x10, w9, uxtw #1]
+ ldrh w8, [x7, w8, uxtw #1]
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw
+ sub x7, x7, w8, uxtw
+ add x6, x0, x1 // x6 = odd rows
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2 // (sum + 2) >> 2
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3 // (sum + 4) >> 3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4 // (sum + 8) >> 4
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h // final reduction widened to 32 bit
+ rshrn v0.4h, v0.4s, #5 // (sum + 16) >> 5
+ dup v0.8h, v0.h[0]
+ br x9
+
+// Offsets from the table to the averaging routines, indexed by
+// clz(height) - 26.
+L(ipred_cfl_left_tbl):
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+//
+// CfL prediction where dc is the average over both edges:
+// dc = (sum(left) + sum(top) + ((width + height) >> 1)) / (width + height)
+// The division is done as a right shift by ctz(width + height), followed,
+// when width != height, by a multiply with 0xAAAB or 0x6667 and a shift by
+// 17 (0xAAAB / 2^17 ~= 1/3, 0x6667 / 2^17 ~= 1/5) for the remaining factor.
+// One jump table serves two dispatches: x7 = "h" routine (sums the left
+// edge, selected by height), x9 = "w" routine (adds the top edge and
+// finishes dc, selected by width). The "w" routines end in the splat loops
+// L(ipred_cfl_splat_w4/w8/w16).
+function ipred_cfl_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1 // x2 = topleft - height pixels
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3
+ clz w6, w4
+ dup v16.4s, w8 // width + height
+ adr x7, L(ipred_cfl_tbl)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
+ sub w6, w6, #26
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1]
+ ldrh w6, [x7, w6, uxtw #1]
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw
+ sub x7, x7, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w8 // -ctz(width + height)
+ add x6, x0, x1 // x6 = odd rows
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h // sum of the left edge
+ add x2, x2, #2 // skip topleft
+ br x9
+L(ipred_cfl_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s // + rounding term
+ uaddlv s2, v2.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s // negative shift count = shift right
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq // h == 16: 0x6667, else 0xAAAB
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h // sum of the left edge
+ add x2, x2, #2 // skip topleft
+ br x9
+L(ipred_cfl_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s // + rounding term
+ uaddlv s2, v2.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s // negative shift count = shift right
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq // h == 32: 0x6667, else 0xAAAB
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ addp v0.8h, v2.8h, v3.8h
+ add x2, x2, #2 // skip topleft
+ uaddlv s0, v0.8h // sum of the left edge
+ br x9
+L(ipred_cfl_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s // + rounding term
+ addp v2.8h, v2.8h, v3.8h
+ uaddlv s2, v2.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s // negative shift count = shift right
+ b.eq 1f
+ // h = 4/8/32
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq // h == 4: 0x6667, else 0xAAAB
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ add x2, x2, #2 // skip topleft
+ uaddlv s0, v0.8h // sum of the left edge
+ br x9
+L(ipred_cfl_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ add v0.4s, v0.4s, v16.4s // + rounding term
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v2.8h, v2.8h, v4.8h
+ cmp w4, #32
+ uaddlv s2, v2.8h
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s // negative shift count = shift right
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq // h == 8: 0x6667, else 0xAAAB
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+// Entries 0-3: "h" routines, indexed by clz(height) - 26.
+// Entries 4-7: "w" routines, indexed by clz(width) - 26 + 4.
+L(ipred_cfl_tbl):
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_420_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h}, [x0], #16
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d
+ trn2 v0.2d, v0.2d, v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ add v24.4s, v24.4s, v25.4s
+ add v26.4s, v26.4s, v27.4s
+ add v0.4s, v24.4s, v26.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #3
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v4.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.4h, v0.h[3]
+ dup v3.4h, v0.h[7]
+ trn2 v2.2d, v0.2d, v0.2d
+ subs w8, w8, #2
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw v25.4s, v25.4s, v1.4h
+ uaddw v26.4s, v26.4s, v2.4h
+ uaddw v27.4s, v27.4s, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d
+ trn1 v1.2d, v2.2d, v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_420_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2
+ add v0.8h, v0.8h, v4.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+ add v2.8h, v2.8h, v6.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v18.8h, v18.8h, v19.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ add v16.8h, v16.8h, v20.8h
+ add v18.8h, v18.8h, v22.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v2.8h, #1
+ shl v2.8h, v16.8h, #1
+ shl v3.8h, v18.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q5, [x10, #32]
+ ld1 {v3.8h, v4.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v5.8h, v5.8h, v5.8h
+ addp v3.8h, v3.8h, v4.8h
+ ldr q18, [x1, #32]
+ add v2.4h, v2.4h, v5.4h
+ ld1 {v16.8h, v17.8h}, [x1], x2
+ add v0.8h, v0.8h, v3.8h
+ ldr q21, [x10, #32]
+ ld1 {v19.8h, v20.8h}, [x10], x2
+ addp v18.8h, v18.8h, v18.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v21.8h, v21.8h, v21.8h
+ addp v19.8h, v19.8h, v20.8h
+ add v18.4h, v18.4h, v21.4h
+ add v16.8h, v16.8h, v19.8h
+ shl v1.4h, v2.4h, #1
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v18.4h, #1
+ shl v2.8h, v16.8h, #1
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ ld1 {v4.8h}, [x1], x2
+ ld1 {v6.8h}, [x10], x2
+ addp v0.8h, v0.8h, v4.8h
+ addp v2.8h, v2.8h, v6.8h
+ add v0.8h, v0.8h, v2.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v0.h[7]
+ trn2 v2.2d, v0.2d, v3.2d
+ trn1 v0.2d, v0.2d, v1.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+
+L(ipred_cfl_ac_420_w16_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_16bpc_neon, export=1
+ clz w8, w5 // clz(width): 27/28/29 for width 16/8/4
+ lsl w4, w4, #2 // h_pad *= 4
+ adr x7, L(ipred_cfl_ac_422_tbl)
+ sub w8, w8, #27 // jump table index: 0 = w16, 1 = w8, 2 = w4
+ ldrh w8, [x7, w8, uxtw #1] // 16-bit entry = table address - handler address
+ movi v24.4s, #0 // v24-v27: 32-bit accumulators for the sum of everything stored to ac
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw // x7 = address of the width-specific handler
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2 // x10 = second input row (ypx + stride)
+ dup v31.4s, w9
+ lsl x2, x2, #1 // stride *= 2; x1 and x10 each step two rows per load
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2 // input row 0 (8 pixels)
+ ld1 {v1.8h}, [x10], x2 // input row 1
+ ld1 {v2.8h}, [x1], x2 // input row 2
+ ld1 {v3.8h}, [x10], x2 // input row 3
+ addp v0.8h, v0.8h, v1.8h // horizontal pair sums: row 0 | row 1
+ addp v2.8h, v2.8h, v3.8h // horizontal pair sums: row 2 | row 3
+ shl v0.8h, v0.8h, #2 // 4 * (sum of 2 pixels)
+ shl v1.8h, v2.8h, #2
+ subs w8, w8, #4 // four output rows per iteration
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d // v0 = last output row in both halves
+ trn2 v1.2d, v1.2d, v1.2d // v1 = last output row in both halves
+ b L(ipred_cfl_ac_420_w4_hpad) // reuse the 4:2:0 vertical padding and dc subtraction
+
+L(ipred_cfl_ac_422_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad) // w_pad != 0: take the padding variant
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2 // input row 0 (16 pixels)
+ ld1 {v2.8h, v3.8h}, [x10], x2 // input row 1
+ ld1 {v4.8h, v5.8h}, [x1], x2 // input row 2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2 // input row 3
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2 // v0..v3 = output rows 0..3
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #4 // four output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b // v0 = v1 = last output row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2 // only the left 8 pixels of each input row are read
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h // row 0 (4 values) | row 1 (4 values)
+ addp v2.8h, v2.8h, v3.8h // row 2 (4 values) | row 3 (4 values)
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3] // padding value for row 0
+ dup v5.8h, v0.h[7] // padding value for row 1
+ dup v6.4h, v2.h[3] // padding value for row 2
+ dup v7.8h, v2.h[7] // padding value for row 3
+ trn2 v1.2d, v0.2d, v5.2d // v1 = row 1 values + 4 padded
+ trn1 v0.2d, v0.2d, v4.2d // v0 = row 0 values + 4 padded
+ trn2 v3.2d, v2.2d, v7.2d // v3 = row 3 values + 4 padded
+ trn1 v2.2d, v2.2d, v6.2d // v2 = row 2 values + 4 padded
+ subs w8, w8, #4 // four output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b // v0 = v1 = last output row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_422_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1] // second-level dispatch on w_pad (table has entries 0..3)
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // input row 0 (32 pixels)
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 // input row 1
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2 // v0, v1 = output row 0
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2 // v2, v3 = output row 1
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #2 // two output rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0,v1 = v2,v3 = last output row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32] // pixels 16-23 of row 0; read before x1 is post-incremented below
+ ld1 {v0.8h, v1.8h}, [x1], x2 // pixels 0-15 of row 0
+ ldr q6, [x10, #32] // pixels 16-23 of row 1
+ ld1 {v4.8h, v5.8h}, [x10], x2 // pixels 0-15 of row 1
+ addp v2.8h, v2.8h, v2.8h // only the low four lanes are used afterwards
+ addp v0.8h, v0.8h, v1.8h
+ addp v6.8h, v6.8h, v6.8h
+ addp v4.8h, v4.8h, v5.8h
+ shl v1.4h, v2.4h, #2 // output row 0, values 8-11
+ shl v0.8h, v0.8h, #2 // output row 0, values 0-7
+ shl v3.4h, v6.4h, #2 // output row 1, values 8-11
+ shl v2.8h, v4.8h, #2 // output row 1, values 0-7
+ dup v4.4h, v1.h[3] // replicate the last valid value of row 0
+ dup v5.4h, v3.h[3] // replicate the last valid value of row 1
+ trn1 v1.2d, v1.2d, v4.2d // v1 = 4 valid values + 4 padded
+ trn1 v3.2d, v3.2d, v5.2d // v3 = 4 valid values + 4 padded
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0,v1 = v2,v3 = last output row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2 // only the left 16 pixels of each input row are read
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2 // output row 0, values 0-7
+ shl v2.8h, v2.8h, #2 // output row 1, values 0-7
+ dup v1.8h, v0.h[7] // output row 0, values 8-15: replicated last value
+ dup v3.8h, v2.h[7] // output row 1, values 8-15: replicated last value
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0,v1 = v2,v3 = last output row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2 // only the left 8 pixels of each input row are read
+ ld1 {v2.8h}, [x10], x2
+ addp v0.8h, v0.8h, v0.8h // only the low four lanes are used afterwards
+ addp v2.8h, v2.8h, v2.8h
+ shl v0.4h, v0.4h, #2 // output row 0, values 0-3
+ shl v2.4h, v2.4h, #2 // output row 1, values 0-3
+ dup v1.8h, v0.h[3] // padding for row 0: replicate its last value
+ dup v3.8h, v2.h[3] // padding for row 1: replicate its last value
+ trn1 v0.2d, v0.2d, v1.2d // v0 = 4 valid values + 4 padded
+ trn1 v2.2d, v2.2d, v3.2d // v2 = 4 valid values + 4 padded
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0,v1 = v2,v3 = last output row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ clz w8, w5 // clz(width): 26/27/28/29 for width 32/16/8/4
+ lsl w4, w4, #2 // h_pad *= 4
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26 // jump table index: 0 = w32, 1 = w16, 2 = w8, 3 = w4
+ ldrh w8, [x7, w8, uxtw #1] // 16-bit entry = table address - handler address
+ movi v24.4s, #0 // v24-v27: 32-bit accumulators for the sum of everything stored to ac
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw // x7 = address of the width-specific handler
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2 // x10 = second input row (ypx + stride)
+ dup v31.4s, w9
+ lsl x2, x2, #1 // stride *= 2; undone again in the w32 path
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.4h}, [x1], x2 // row 0 into the low half of v0
+ ld1 {v0.d}[1], [x10], x2 // row 1 into the high half of v0
+ ld1 {v1.4h}, [x1], x2 // row 2 into the low half of v1
+ ld1 {v1.d}[1], [x10], x2 // row 3 into the high half of v1
+ shl v0.8h, v0.8h, #3 // pixel * 8
+ shl v1.8h, v1.8h, #3
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d // v0 = last row in both halves
+ trn2 v1.2d, v1.2d, v1.2d // v1 = last row in both halves
+ b L(ipred_cfl_ac_420_w4_hpad) // reuse the 4:2:0 vertical padding and dc subtraction
+
+L(ipred_cfl_ac_444_w8):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.8h}, [x1], x2 // row 0
+ ld1 {v1.8h}, [x10], x2 // row 1
+ ld1 {v2.8h}, [x1], x2 // row 2
+ shl v0.8h, v0.8h, #3
+ ld1 {v3.8h}, [x10], x2 // row 3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #4 // four rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b // v0 = v1 = last row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad) // w_pad != 0: take the padding variant
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2 // row 0 (16 pixels)
+ ld1 {v2.8h, v3.8h}, [x10], x2 // row 1
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #2 // two rows per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0,v1 = v2,v3 = last row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h}, [x1], x2 // only the left 8 pixels of each row are read
+ ld1 {v2.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v2.8h, v2.8h, #3
+ dup v1.8h, v0.h[7] // row 0, values 8-15: replicated last value
+ dup v3.8h, v2.h[7] // row 1, values 8-15: replicated last value
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b // v0,v1 = v2,v3 = last row, for the hpad loop
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ lsr x2, x2, #1 // Restore the stride to one line increments
+ sub x7, x7, w3, uxtw // x7 = address of the wpad0/2/4/6 handler
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // one full row of 32 pixels
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #1 // one row per iteration
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad) // v0-v3 still hold the last row
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 // left 24 pixels of the row
+ shl v2.8h, v2.8h, #3
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ dup v3.8h, v2.h[7] // values 24-31: replicated last value
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad) // v0-v3 still hold the last row
+
+L(ipred_cfl_ac_444_w32_wpad4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 16
+ ld1 {v0.8h, v1.8h}, [x1], x2 // left 16 pixels of the row
+ shl v1.8h, v1.8h, #3
+ shl v0.8h, v0.8h, #3
+ dup v2.8h, v1.h[7] // values 16-23: replicated last value
+ dup v3.8h, v1.h[7] // values 24-31: replicated last value
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad) // v0-v3 still hold the last row
+
+L(ipred_cfl_ac_444_w32_wpad6):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 24
+ ld1 {v0.8h}, [x1], x2 // left 8 pixels of the row
+ shl v0.8h, v0.8h, #3
+ dup v1.8h, v0.h[7] // values 8-31: replicated last value
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f // nothing to pad vertically
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2 // each pass stores two rows of 32 (v0-v3 = last row)
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl w6, w6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
new file mode 100644
index 0000000000..b1b2f8fe65
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -0,0 +1,3270 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
+
+// Potential further optimizations, that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+const idct_coeffs, align=4 // Multipliers for the inverse DCTs below, which load them into v0/v1 and narrow the products with a rounding shift by 12
+ // idct4
+ .short 2896, 2896*8, 1567, 3784 // the second entry is pre-multiplied by 8; no code visible in this part of the file reads it
+ // idct8
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4 // Four groups of 16 halfwords, each ending in four zero padding entries; presumably for the 64-point DCT (consumer is outside this part of the file)
+ .short 101*8, 4095*8, 2967*8, -2824*8 // entries written as N*8 are pre-multiplied by 8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+ .short 0, 0, 0, 0
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+ .short 0, 0, 0, 0
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+ .short 0, 0, 0, 0
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+ .short 0, 0, 0, 0
+endconst
+
+const iadst4_coeffs, align=4 // Multipliers for the 4-point inverse ADST macros (iadst_4x4, iadst_8x4)
+ // .h[4-5] can be interpreted as .s[2]
+ .short 1321, 3803, 2482, 3344, 3344, 0 // only six halfwords are defined; the macros load eight (ld1 of v0.8h) but reference only h[0]-h[3] and s[2]
+endconst
+
+const iadst8_coeffs, align=4 // Loaded as v0 (first two rows) and v1 (last row) by the iadst_8 macro
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0 // iadst_8 reads these as v1.h[0], v1.h[2] and v1.h[3]
+endconst
+
+const iadst16_coeffs, align=4 // Sixteen multipliers loaded as v0/v1 at the start of the iadst_16 macro
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
+.macro smull_smlal d0, d1, s0, s1, c0, c1, sz // Widening multiply-add: d0 = s0*c0 + s1*c1 on the low four 16-bit lanes, giving 32-bit results
+ smull \d0\().4s, \s0\().4h, \c0
+ smlal \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+ smull2 \d1\().4s, \s0\().8h, \c0 // only for .8h operands: d1 gets the same sum for the upper four lanes
+ smlal2 \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz // Widening multiply-subtract: d0 = s0*c0 - s1*c1 on the low four 16-bit lanes, giving 32-bit results
+ smull \d0\().4s, \s0\().4h, \c0
+ smlsl \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+ smull2 \d1\().4s, \s0\().8h, \c0 // only for .8h operands: d1 gets the same difference for the upper four lanes
+ smlsl2 \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro sqrshrn_sz d0, s0, s1, shift, sz // Narrow 32-bit lanes to 16 bits with a saturating, rounding right shift
+ sqrshrn \d0\().4h, \s0\().4s, \shift // s0 fills the low half of d0
+.ifc \sz, .8h
+ sqrshrn2 \d0\().8h, \s1\().4s, \shift // for .8h, s1 fills the high half of d0
+.endif
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7 // Scale r0-r3 (and r4-r7 when supplied) in place by the lane constant c; sz is the arrangement suffix (.4h or .8h)
+ sqrdmulh \r0\sz, \r0\sz, \c // saturating rounding doubling multiply, keeping the high 16 bits
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c // second group is emitted only when r4 was passed; r5-r7 are then expected too
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 // One step of the pipelined add-residual-to-pixels sequence; every stage is skipped when its argument is blank
+.ifnb \load
+ ld1 {\load}, [\src], x1 // fetch a row of destination pixels and advance src by the stride in x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits // rounding right shift of a row of transform output
+.endif
+.ifnb \addsrc
+ uaddw \adddst, \adddst, \addsrc // add the zero-extended pixels to the 16-bit residual
+.endif
+.ifnb \narrowsrc
+ sqxtun \narrowdst, \narrowsrc // saturate the signed 16-bit sums to unsigned 8-bit
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1 // write a finished row and advance dst by the stride in x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src // Add the sixteen 8-wide rows in v16-v31 (after a rounding shift by the default 4 bits) to the pixels at dst; src is a scratch pointer
+ mov \src, \dst // loads and stores walk the same rows; the load pointer runs ahead of the store pointer
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src // each line works on several rows at different stages, cycling pixel rows through v2-v7
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src
+ load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src
+ load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src
+ load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src
+ load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src
+ load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src
+ load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src
+ load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src // pipeline drain: no more loads or shifts, only the remaining add/narrow/store stages
+ load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store , , , , , , v5.8b, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4 // Add the eight 8-wide rows in v16-v23 (after a rounding shift by shiftbits) to the pixels at dst; src is a scratch pointer
+ mov \src, \dst // loads and stores walk the same rows; the load pointer runs ahead of the store pointer
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits
+ load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+ load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits
+ load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits
+ load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits
+ load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits // pipeline drain: remaining add/narrow/store stages only
+ load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits
+ load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+ load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src // Add the four 8-wide rows in v16-v19 (after a rounding shift by the default 4 bits) to the pixels at dst; src is a scratch pointer
+ mov \src, \dst // loads and stores walk the same rows; the load pointer runs ahead of the store pointer
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src
+ load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src // pipeline drain: remaining add/narrow/store stages only
+ load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store , , , , , , v5.8b, \dst, \src
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src // 4-pixel-wide pipeline step handling two rows per register; every stage is skipped when its argument is blank
+.ifnb \load
+ ld1 {\load}[0], [\src], x1 // first of two 4-pixel rows goes into 32-bit lane 0
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0] // pack two 4-coefficient rows into one 128-bit register
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4 // rounding right shift of the packed transform output
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1 // second row goes into 32-bit lane 1
+.endif
+.ifnb \addsrc
+ uaddw \adddst, \adddst, \addsrc // add the zero-extended pixels to the 16-bit residual
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1 // write back the first finished row of an earlier pair
+.endif
+.ifnb \narrowsrc
+ sqxtun \narrowdst, \narrowsrc // saturate the signed 16-bit sums to unsigned 8-bit
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1 // write back the second finished row of that pair
+.endif
+.endm
+.macro load_add_store_4x16 dst, src // Add sixteen 4-wide rows held in the low halves of v16-v31 (rounding shift by 4) to the pixels at dst; src is a scratch pointer
+ mov \src, \dst // loads and stores walk the same rows; the load pointer runs ahead of the store pointer
+ load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src // row pairs are merged into the even-numbered registers; pixel pairs use v0-v7
+ load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src
+ load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src
+ load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+ load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+ load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src
+ load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src // pipeline drain: no more loads
+ load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src
+ load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src
+ load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src
+ load_add_store4 , , , , , , , , v7.s, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src // Add eight 4-wide rows held in the low halves of v16-v23 (rounding shift by 4) to the pixels at dst; src is a scratch pointer
+ mov \src, \dst // loads and stores walk the same rows; the load pointer runs ahead of the store pointer
+ load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src // row pairs are merged into the even-numbered registers; pixel pairs use v0-v3
+ load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src
+ load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src // pipeline drain: no more loads
+ load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+ load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+ load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src
+ load_add_store4 , , , , , , , , v3.s, \dst, \src
+.endm
+
+.macro idct_dc w, h, shift // DC-only shortcut for dct_dct: when w3 (eob) is zero, compute one value in v16 and tail-branch to idct_dc_w<w>_neon; otherwise fall through to label 1
+ cbnz w3, 1f // nonzero eob: take the full transform path
+ mov w16, #2896*8 // with sqrdmulh this multiplier scales by 2896/4096
+ ld1r {v16.8h}, [x2] // broadcast the first coefficient to all lanes
+ dup v0.4h, w16
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+ strh wzr, [x2] // clear the consumed coefficient in the buffer
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v16.8h, v16.8h, v0.h[0] // extra scaling step for 2:1 and 1:2 block shapes
+.endif
+.if \shift > 0
+ srshr v16.8h, v16.8h, #\shift // intermediate rounding shift requested by the caller
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+ srshr v16.8h, v16.8h, #4 // final rounding shift before adding to pixels
+ mov w4, #\h // row count consumed by the idct_dc_w*_neon loops
+ b idct_dc_w\w\()_neon // tail branch; that routine returns with ret (x30 still holds the caller address)
+1:
+.endm
+
+function idct_dc_w4_neon // Add the value in v16 to a 4-pixel-wide block at x0 (stride x1), w4 rows, four rows per iteration
+1:
+ ld1 {v0.s}[0], [x0], x1 // two 4-pixel rows per register
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ subs w4, w4, #4
+ sub x0, x0, x1, lsl #2 // rewind four rows so the stores hit the rows just loaded
+ uaddw v0.8h, v16.8h, v0.8b
+ sqxtun v0.8b, v0.8h // saturate to unsigned 8-bit
+ uaddw v1.8h, v16.8h, v1.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[1], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ b.gt 1b // flags come from the subs above
+ ret
+endfunc
+
+function idct_dc_w8_neon // Add the value in v16 to an 8-pixel-wide block at x0 (stride x1), w4 rows, four rows per iteration
+1:
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v3.8b}, [x0], x1
+ sub x0, x0, x1, lsl #2 // rewind four rows so the stores hit the rows just loaded
+ subs w4, w4, #4
+ uaddw v21.8h, v16.8h, v1.8b
+ sqxtun v0.8b, v20.8h // saturate to unsigned 8-bit
+ uaddw v22.8h, v16.8h, v2.8b
+ sqxtun v1.8b, v21.8h
+ uaddw v23.8h, v16.8h, v3.8b
+ st1 {v0.8b}, [x0], x1
+ sqxtun v2.8b, v22.8h
+ st1 {v1.8b}, [x0], x1
+ sqxtun v3.8b, v23.8h
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b // flags come from the subs above
+ ret
+endfunc
+
+function idct_dc_w16_neon // Add the value in v16 to a 16-pixel-wide block at x0 (stride x1), w4 rows, four rows per iteration
+1:
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x0], x1
+ ld1 {v2.16b}, [x0], x1
+ subs w4, w4, #4
+ uaddw v20.8h, v16.8h, v0.8b // low eight pixels of the row
+ uaddw2 v21.8h, v16.8h, v0.16b // high eight pixels of the row
+ ld1 {v3.16b}, [x0], x1
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ sub x0, x0, x1, lsl #2 // rewind four rows so the stores hit the rows just loaded
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h // saturate to unsigned 8-bit
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ st1 {v0.16b}, [x0], x1
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v1.16b}, [x0], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b // flags come from the subs above
+ ret
+endfunc
+
+function idct_dc_w32_neon // Add the value in v16 to a 32-pixel-wide block at x0 (stride x1), w4 rows, two rows per iteration
+1:
+ ld1 {v0.16b, v1.16b}, [x0], x1
+ subs w4, w4, #2
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ ld1 {v2.16b, v3.16b}, [x0] // second row, loaded without advancing the pointer
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ sub x0, x0, x1 // step back to the first row for the stores
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h // saturate to unsigned 8-bit
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ st1 {v0.16b, v1.16b}, [x0], x1
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v2.16b, v3.16b}, [x0], x1
+ b.gt 1b // flags come from the subs above
+ ret
+endfunc
+
+function idct_dc_w64_neon // Add the value in v16 to a 64-pixel-wide block at x0 (stride x1), w4 rows, one row per iteration
+1:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // whole row, loaded without advancing; the store below advances x0
+ subs w4, w4, #1
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h // saturate to unsigned 8-bit
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ b.gt 1b // flags come from the subs above
+ ret
+endfunc
+
+.macro iwht4 // One 4-point inverse WHT pass (per the macro name) over v16-v19, four 16-bit lanes each, in place; v20 and v21 are temporaries
+ add v16.4h, v16.4h, v17.4h
+ sub v21.4h, v18.4h, v19.4h
+ sub v20.4h, v16.4h, v21.4h
+ sshr v20.4h, v20.4h, #1 // arithmetic halving, no rounding
+ sub v18.4h, v20.4h, v17.4h
+ sub v17.4h, v20.4h, v19.4h
+ add v19.4h, v21.4h, v18.4h
+ sub v16.4h, v16.4h, v17.4h
+.endm
+
+.macro idct_4 r0, r1, r2, r3, sz // 4-point inverse DCT in place on r0-r3; expects the idct4 multipliers in v0.h[0..3]; clobbers v2-v7
+ smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz // odd inputs: r1*h[3] + r3*h[2]
+ smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz // odd inputs: r1*h[2] - r3*h[3]
+ smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz // even inputs: (r0 + r2)*h[0]
+ sqrshrn_sz v6, v6, v7, #12, \sz
+ sqrshrn_sz v7, v4, v5, #12, \sz
+ smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz // even inputs: (r0 - r2)*h[0]
+ sqrshrn_sz v2, v2, v3, #12, \sz
+ sqrshrn_sz v3, v4, v5, #12, \sz
+ sqadd \r0\sz, v2\sz, v6\sz // final butterflies, saturating
+ sqsub \r3\sz, v2\sz, v6\sz
+ sqadd \r1\sz, v3\sz, v7\sz
+ sqsub \r2\sz, v3\sz, v7\sz
+.endm
+
+function inv_dct_4h_x4_neon, export=1 // 4-point inverse DCT across v16-v19, four 16-bit lanes each, in place
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16] // only the four idct4 multipliers are needed
+ idct_4 v16, v17, v18, v19, .4h
+ ret
+endfunc
+
+function inv_dct_8h_x4_neon, export=1 // 4-point inverse DCT across v16-v19, eight 16-bit lanes each, in place
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16] // only the four idct4 multipliers are needed
+ idct_4 v16, v17, v18, v19, .8h
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3 // 4-point inverse ADST on v16-v19 (four lanes each); results go to o0-o3, so passing them reversed gives flipadst; clobbers v0, v3-v5, v7 and x16
+ movrel x16, iadst4_coeffs
+ ld1 {v0.8h}, [x16]
+
+ ssubl v3.4s, v16.4h, v18.4h // v3 = in0 - in2, later + in3
+ smull v4.4s, v16.4h, v0.h[0]
+ smlal v4.4s, v18.4h, v0.h[1]
+ smlal v4.4s, v19.4h, v0.h[2]
+ smull v7.4s, v17.4h, v0.h[3]
+ saddw v3.4s, v3.4s, v19.4h
+ smull v5.4s, v16.4h, v0.h[2]
+ smlsl v5.4s, v18.4h, v0.h[0]
+ smlsl v5.4s, v19.4h, v0.h[1]
+
+ add \o3\().4s, v4.4s, v5.4s
+ mul \o2\().4s, v3.4s, v0.s[2] // 32-bit multiply by the table entry read as one word (h[4], h[5])
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ sqrshrn \o0\().4h, \o0\().4s, #12 // narrow back to 16 bits with rounding and saturation
+ sqrshrn \o2\().4h, \o2\().4s, #12
+ sqrshrn \o1\().4h, \o1\().4s, #12
+ sqrshrn \o3\().4h, \o3\().4s, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1 // 4-point inverse ADST on v16-v19 (four lanes each), outputs in natural order
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1 // Same as the 4h adst but with the output registers reversed
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3 // 4-point inverse ADST on v16-v19 with eight lanes each; results go to o0-o3 (reversed for flipadst); clobbers v0, v2-v7 and x16
+ movrel x16, iadst4_coeffs
+ ld1 {v0.8h}, [x16]
+
+ ssubl v2.4s, v16.4h, v18.4h // v2/v3 = in0 - in2 (low/high lanes), later + in3
+ ssubl2 v3.4s, v16.8h, v18.8h
+ smull v4.4s, v16.4h, v0.h[0]
+ smlal v4.4s, v18.4h, v0.h[1]
+ smlal v4.4s, v19.4h, v0.h[2]
+ smull2 v5.4s, v16.8h, v0.h[0]
+ smlal2 v5.4s, v18.8h, v0.h[1]
+ smlal2 v5.4s, v19.8h, v0.h[2]
+ saddw v2.4s, v2.4s, v19.4h
+ saddw2 v3.4s, v3.4s, v19.8h
+ smull v6.4s, v16.4h, v0.h[2]
+ smlsl v6.4s, v18.4h, v0.h[0]
+ smlsl v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v16.8h, v0.h[2]
+ smlsl2 v7.4s, v18.8h, v0.h[0]
+ smlsl2 v7.4s, v19.8h, v0.h[1]
+
+ mul v18.4s, v2.4s, v0.s[2] // out2 is always built in v18/v19, whatever o2 names
+ mul v19.4s, v3.4s, v0.s[2]
+
+ smull v2.4s, v17.4h, v0.h[3]
+ smull2 v3.4s, v17.8h, v0.h[3]
+
+ add v16.4s, v4.4s, v2.4s // out0
+ add v17.4s, v5.4s, v3.4s
+
+ add v4.4s, v4.4s, v6.4s // out3
+ add v5.4s, v5.4s, v7.4s
+
+ add v6.4s, v6.4s, v2.4s // out1
+ add v7.4s, v7.4s, v3.4s
+
+ sub v4.4s, v4.4s, v2.4s // out3
+ sub v5.4s, v5.4s, v3.4s
+
+ sqrshrn v18.4h, v18.4s, #12
+ sqrshrn2 v18.8h, v19.4s, #12
+
+ sqrshrn \o0\().4h, v16.4s, #12
+ sqrshrn2 \o0\().8h, v17.4s, #12
+
+.ifc \o2, v17
+ mov v17.16b, v18.16b // flipadst order: move out2 from v18 to v17 now that the v17 accumulator has been narrowed
+.endif
+
+ sqrshrn \o1\().4h, v6.4s, #12
+ sqrshrn2 \o1\().8h, v7.4s, #12
+
+ sqrshrn \o3\().4h, v4.4s, #12
+ sqrshrn2 \o3\().8h, v5.4s, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1 // 4-point inverse ADST on v16-v19 (eight lanes each), outputs in natural order
+ iadst_8x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1 // Same as the 8h adst but with the output registers reversed
+ iadst_8x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4h_x4_neon, export=1 // 4-point identity transform on v16-v19 (four lanes each): scales each value by 5793/4096
+ mov w16, #(5793-4096)*8 // only the fractional part is multiplied; the value itself is added back below
+ dup v0.4h, w16
+ sqrdmulh v4.4h, v16.4h, v0.h[0] // x * (5793-4096)/4096, rounded
+ sqrdmulh v5.4h, v17.4h, v0.h[0]
+ sqrdmulh v6.4h, v18.4h, v0.h[0]
+ sqrdmulh v7.4h, v19.4h, v0.h[0]
+ sqadd v16.4h, v16.4h, v4.4h // x + fractional part, saturating
+ sqadd v17.4h, v17.4h, v5.4h
+ sqadd v18.4h, v18.4h, v6.4h
+ sqadd v19.4h, v19.4h, v7.4h
+ ret
+endfunc
+
+function inv_identity_8h_x4_neon, export=1 // 4-point identity transform on v16-v19 (eight lanes each): scales each value by 5793/4096
+ mov w16, #(5793-4096)*8 // only the fractional part is multiplied; the value itself is added back below
+ dup v0.4h, w16
+ sqrdmulh v4.8h, v16.8h, v0.h[0] // x * (5793-4096)/4096, rounded
+ sqrdmulh v5.8h, v17.8h, v0.h[0]
+ sqrdmulh v6.8h, v18.8h, v0.h[0]
+ sqrdmulh v7.8h, v19.8h, v0.h[0]
+ sqadd v16.8h, v16.8h, v4.8h // x + fractional part, saturating
+ sqadd v17.8h, v17.8h, v5.8h
+ sqadd v18.8h, v18.8h, v6.8h
+ sqadd v19.8h, v19.8h, v7.8h
+ ret
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c // For each of r0-r3 (eight lanes): x = (x + sqrdmulh(x, c) + 1) >> 1, fusing a scale step with a rounding halving; clobbers v2
+.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
+ sqrdmulh v2.8h, \i, \c
+ srhadd \i, \i, v2.8h // rounding halving add
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 // 4x4 WHT/WHT: x0 = dst, x1 = stride, x2 = coefficients (zeroed here); applies iwht4 twice and adds the result to the pixels
+ mov x15, x30 // the shared tail returns through x15
+ movi v31.8h, #0
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v31.8h}, [x2], #16 // clear the first half of the coefficient block
+
+ sshr v16.4h, v16.4h, #2 // scale the input down by four before the first pass
+ sshr v17.4h, v17.4h, #2
+ sshr v18.4h, v18.4h, #2
+ sshr v19.4h, v19.4h, #2
+
+ iwht4
+
+ st1 {v31.8h}, [x2], #16 // clear the second half of the coefficient block
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.s}[0], [x0], x1 // fetch the four destination rows, two per register
+ ld1 {v0.s}[1], [x0], x1
+ ins v16.d[1], v17.d[0] // pack rows 0/1 and 2/3 to match the pixel layout
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+
+ b L(itx_4x4_end) // shared add-and-store tail inside inv_txfm_add_4x4_neon; no final rounding shift is applied on this path
+endfunc
+
+function inv_txfm_add_4x4_neon // Shared 4x4 body: x4/x5 point to the first/second 1-D transform, x0 = dst, x1 = stride, x2 = coefficients, x15 = return address
+ movi v31.8h, #0
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v31.8h}, [x2], #16 // clear the first half of the coefficient block
+
+ blr x4 // first pass; blr overwrites x30, which is why callers saved it in x15
+
+ st1 {v31.8h}, [x2], #16 // clear the second half of the coefficient block
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5 // second pass on the transposed data
+
+ ld1 {v0.s}[0], [x0], x1 // fetch the four destination rows, two per register
+ ld1 {v0.s}[1], [x0], x1
+ ins v16.d[1], v17.d[0] // pack rows 0/1 and 2/3 to match the pixel layout
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4 // final rounding shift
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end): // also entered from the WHT function and the 4x4 dct_dct DC shortcut
+ sub x0, x0, x1, lsl #2 // rewind to the first row for the stores
+ uaddw v16.8h, v16.8h, v0.8b
+ sqxtun v0.8b, v16.8h // saturate to unsigned 8-bit
+ uaddw v18.8h, v18.8h, v1.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v1.8b, v18.8h
+ st1 {v0.s}[1], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+
+ ret x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2 // Emit the exported 4x4 entry point for one transform pair; it selects the two 1-D transforms and jumps to the shared body
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ mov x15, x30 // the shared body returns through x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f // DC-only shortcut, taken when eob (w3) is zero
+ mov w16, #2896*8 // with sqrdmulh this multiplier scales by 2896/4096
+ ld1r {v16.8h}, [x2]
+ dup v4.8h, w16
+ strh wzr, [x2] // clear the consumed coefficient
+ sqrdmulh v16.8h, v16.8h, v4.h[0]
+ ld1 {v0.s}[0], [x0], x1 // destination rows are loaded between the arithmetic steps
+ sqrdmulh v20.8h, v16.8h, v4.h[0]
+ ld1 {v0.s}[1], [x0], x1
+ srshr v16.8h, v20.8h, #4
+ ld1 {v1.s}[0], [x0], x1
+ srshr v18.8h, v20.8h, #4 // same value for rows 2/3
+ ld1 {v1.s}[1], [x0], x1
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4h_x4_neon
+ adr x5, inv_\txfm2\()_4h_x4_neon
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct // instantiate all sixteen transform pairs
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb // 8-point inverse DCT in place on r0-r7; expects idct4/idct8 multipliers in v0.h[0..7]; szb is the byte arrangement matching sz; clobbers v2-v7
+ idct_4 \r0, \r2, \r4, \r6, \sz // even-indexed inputs go through the 4-point transform
+
+ smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
+ smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
+ smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
+ sqrshrn_sz \r1, v2, v3, #12, \sz // t4a
+ sqrshrn_sz \r7, v4, v5, #12, \sz // t7a
+ smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
+ sqrshrn_sz \r3, v6, v7, #12, \sz // t5a
+ sqrshrn_sz \r5, v2, v3, #12, \sz // t6a
+
+ sqadd v2\sz, \r1\sz, \r3\sz // t4
+ sqsub \r1\sz, \r1\sz, \r3\sz // t5a
+ sqadd v3\sz, \r7\sz, \r5\sz // t7
+ sqsub \r3\sz, \r7\sz, \r5\sz // t6a
+
+ smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
+ smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
+ sqrshrn_sz v4, v4, v5, #12, \sz // t5
+ sqrshrn_sz v5, v6, v7, #12, \sz // t6
+
+ sqsub \r7\sz, \r0\sz, v3\sz // out7
+ sqadd \r0\sz, \r0\sz, v3\sz // out0
+ sqadd \r1\sz, \r2\sz, v5\sz // out1
+ sqsub v6\sz, \r2\sz, v5\sz // out6
+ sqadd \r2\sz, \r4\sz, v4\sz // out2
+ sqsub \r5\sz, \r4\sz, v4\sz // out5
+ sqadd \r3\sz, \r6\sz, v2\sz // out3
+ sqsub \r4\sz, \r6\sz, v2\sz // out4
+ mov \r6\szb, v6\szb // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1 // 8-point inverse DCT across v16-v23, eight 16-bit lanes each, in place
+ movrel x16, idct_coeffs
+ ld1 {v0.8h}, [x16] // idct4 and idct8 multipliers
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
+ ret
+endfunc
+
+function inv_dct_4h_x8_neon, export=1 // 8-point inverse DCT across v16-v23, four 16-bit lanes each, in place
+ movrel x16, idct_coeffs
+ ld1 {v0.8h}, [x16] // idct4 and idct8 multipliers
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
+ ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz // 8-point inverse ADST: inputs are fixed in v16-v23, outputs go to o0-o7 (reversed order gives flipadst); clobbers v0-v7 and x16
+ movrel x16, iadst8_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+
+ smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz
+ smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz
+ smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t1a
+ smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz
+ smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2a
+ sqrshrn_sz v21, v2, v3, #12, \sz // t3a
+ smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz
+ smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v19, v6, v7, #12, \sz // t5a
+ smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v17, v4, v5, #12, \sz // t7a
+
+ sqadd v2\sz, v16\sz, v20\sz // t0
+ sqsub v3\sz, v16\sz, v20\sz // t4
+ sqadd v4\sz, v23\sz, v19\sz // t1
+ sqsub v5\sz, v23\sz, v19\sz // t5
+ sqadd v6\sz, v18\sz, v22\sz // t2
+ sqsub v7\sz, v18\sz, v22\sz // t6
+ sqadd v18\sz, v21\sz, v17\sz // t3
+ sqsub v19\sz, v21\sz, v17\sz // t7
+
+ smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz
+ smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz
+ smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz
+
+ sqrshrn_sz v3, v16, v17, #12, \sz // t4a
+ sqrshrn_sz v5, v20, v21, #12, \sz // t5a
+
+ smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz
+
+ sqrshrn_sz v7, v22, v23, #12, \sz // t6a
+ sqrshrn_sz v19, v16, v17, #12, \sz // t7a
+
+ sqadd \o0\()\sz, v2\sz, v6\sz // out0
+ sqsub v2\sz, v2\sz, v6\sz // t2
+ sqadd \o7\()\sz, v4\sz, v18\sz // out7
+ sqsub v4\sz, v4\sz, v18\sz // t3
+ sqneg \o7\()\sz, \o7\()\sz // out7
+
+ sqadd \o1\()\sz, v3\sz, v7\sz // out1
+ sqsub v3\sz, v3\sz, v7\sz // t6
+ sqadd \o6\()\sz, v5\sz, v19\sz // out6
+ sqsub v5\sz, v5\sz, v19\sz // t7
+ sqneg \o1\()\sz, \o1\()\sz // out1
+
+ smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
+ smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
+ smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
+ sqrshrn_sz v2, v18, v19, #12, \sz // out3
+ smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
+ sqrshrn_sz v3, v20, v21, #12, \sz // out5
+ sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+ sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
+
+ sqneg \o3\()\sz, v2\sz // out3
+ sqneg \o5\()\sz, v3\sz // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1 // 8-point inverse ADST on v16-v23 (eight lanes each), outputs in natural order
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h
+ ret
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1 // 8-point inverse ADST on v16-v23 (eight lanes each), outputs in reversed register order
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h
+ ret
+endfunc
+
+function inv_adst_4h_x8_neon, export=1 // 8-point inverse ADST on v16-v23 (four lanes each), outputs in natural order
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h
+ ret
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1 // 8-point inverse ADST on v16-v23 (four lanes each), outputs in reversed register order
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h
+ ret
+endfunc
+
+function inv_identity_8h_x8_neon, export=1 // 8-point identity transform on v16-v23 (eight lanes each): doubles every value with saturation
+ sqshl v16.8h, v16.8h, #1
+ sqshl v17.8h, v17.8h, #1
+ sqshl v18.8h, v18.8h, #1
+ sqshl v19.8h, v19.8h, #1
+ sqshl v20.8h, v20.8h, #1
+ sqshl v21.8h, v21.8h, #1
+ sqshl v22.8h, v22.8h, #1
+ sqshl v23.8h, v23.8h, #1
+ ret
+endfunc
+
+function inv_identity_4h_x8_neon, export=1 // 8-point identity transform on v16-v23 (four lanes each): doubles every value with saturation
+ sqshl v16.4h, v16.4h, #1
+ sqshl v17.4h, v17.4h, #1
+ sqshl v18.4h, v18.4h, #1
+ sqshl v19.4h, v19.4h, #1
+ sqshl v20.4h, v20.4h, #1
+ sqshl v21.4h, v21.4h, #1
+ sqshl v22.4h, v22.4h, #1
+ sqshl v23.4h, v23.4h, #1
+ ret
+endfunc
+
+.macro def_fn_8x8_base variant // Emit the shared 8x8 body; variant is blank (generic, first pass through x4) or identity_ (first pass elided)
+function inv_txfm_\variant\()add_8x8_neon
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] // load the 64 coefficients and zero them in the buffer as they are read
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64
+ ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blr x4 // first pass; overwrites x30, callers keep their return address in x15
+
+ srshr v16.8h, v16.8h, #1 // intermediate rounding downshift between the passes
+ srshr v17.8h, v17.8h, #1
+ srshr v18.8h, v18.8h, #1
+ srshr v19.8h, v19.8h, #1
+ srshr v20.8h, v20.8h, #1
+ srshr v21.8h, v21.8h, #1
+ srshr v22.8h, v22.8h, #1
+ srshr v23.8h, v23.8h, #1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ blr x5 // second pass on the transposed data
+
+ load_add_store_8x8 x0, x7 // add to the destination pixels; x7 is the scratch load pointer
+ ret x15
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2 // Emit the exported 8x8 entry point for one transform pair
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+ mov x15, x30 // the shared body returns through x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ adr x5, inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon // identity first pass needs no function pointer in x4
+.else
+ adr x4, inv_\txfm1\()_8h_x8_neon
+ b inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct // instantiate all sixteen transform pairs
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon // Shared 8x4 body: x4/x5 point to the first/second 1-D transform, x0 = dst, x1 = stride, x2 = coefficients, x15 = return address
+ movi v30.8h, #0
+ movi v31.8h, #0
+ mov w16, #2896*8 // with sqrdmulh this multiplier scales by 2896/4096
+ dup v0.4h, w16
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] // load the 32 coefficients and zero them in the buffer as they are read
+ st1 {v30.8h,v31.8h}, [x2], #32
+ ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2]
+ st1 {v30.8h,v31.8h}, [x2]
+
+ scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4 // first pass: 8-point transform, four lanes wide
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0] // join the two transposed 4x4 halves into four 8-lane rows
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5 // second pass: 4-point transform, eight lanes wide
+
+ load_add_store_8x4 x0, x7 // add to the destination pixels; x7 is the scratch load pointer
+ ret x15
+endfunc
+
+function inv_txfm_add_4x8_neon // Shared 4x8 body: x4/x5 point to the first/second 1-D transform, x0 = dst, x1 = stride, x2 = coefficients, x15 = return address
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ mov w16, #2896*8 // with sqrdmulh this multiplier scales by 2896/4096
+ dup v0.4h, w16
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] // load the 32 coefficients, then zero them in the buffer
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+ scale_input .8h, v0.h[0], v16, v17, v18, v19
+
+ blr x4 // first pass: 4-point transform, eight lanes wide
+
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ ins v20.d[0], v16.d[1] // split the upper halves out into v20-v23 to form eight 4-lane rows
+ ins v21.d[0], v17.d[1]
+ ins v22.d[0], v18.d[1]
+ ins v23.d[0], v19.d[1]
+
+ blr x5 // second pass: 8-point transform, four lanes wide
+
+ load_add_store_4x8 x0, x7 // add to the destination pixels; x7 is the scratch load pointer
+ ret x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2 // Emit the exported entry point for one transform pair of a 4x8 or 8x4 block
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+ mov x15, x30 // the shared body returns through x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon // first pass: w-point transform, h lanes wide
+ adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon // second pass: h-point transform, w lanes wide
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h // Instantiate all sixteen transform pairs for one block size
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
+.macro idct_16 sz, szb // 16-point inverse DCT in place on v16-v31; expects idct_coeffs in v0 (idct4/8) and v1 (idct16); clobbers v2-v7
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb // even-indexed inputs go through the 8-point transform
+
+ smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
+ smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
+ smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
+ sqrshrn_sz v17, v2, v3, #12, \sz // t8a
+ sqrshrn_sz v31, v4, v5, #12, \sz // t15a
+ smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
+ smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
+ sqrshrn_sz v23, v6, v7, #12, \sz // t9a
+ sqrshrn_sz v25, v2, v3, #12, \sz // t14a
+ smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
+ smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t13a
+ smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
+ sqrshrn_sz v19, v2, v3, #12, \sz // t11a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t12a
+
+ sqsub v2\sz, v17\sz, v23\sz // t9
+ sqadd v17\sz, v17\sz, v23\sz // t8
+ sqsub v3\sz, v31\sz, v25\sz // t14
+ sqadd v31\sz, v31\sz, v25\sz // t15
+ sqsub v23\sz, v19\sz, v21\sz // t10
+ sqadd v19\sz, v19\sz, v21\sz // t11
+ sqadd v25\sz, v29\sz, v27\sz // t12
+ sqsub v29\sz, v29\sz, v27\sz // t13
+
+ smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a
+ smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t9a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t14a
+
+ smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
+ smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t13a
+ neg v6.4s, v6.4s // t10a is the negated sum, so flip the 32-bit accumulator before narrowing
+.ifc \sz, .8h
+ neg v7.4s, v7.4s // upper four lanes exist only for .8h
+.endif
+ sqrshrn_sz v23, v6, v7, #12, \sz // t10a
+
+ sqsub v2\sz, v17\sz, v19\sz // t11a
+ sqadd v17\sz, v17\sz, v19\sz // t8a
+ sqsub v3\sz, v31\sz, v25\sz // t12a
+ sqadd v31\sz, v31\sz, v25\sz // t15a
+ sqadd v19\sz, v21\sz, v23\sz // t9
+ sqsub v21\sz, v21\sz, v23\sz // t10
+ sqsub v25\sz, v27\sz, v29\sz // t13
+ sqadd v27\sz, v27\sz, v29\sz // t14
+
+ smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11
+ smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12
+ smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
+
+ sqrshrn_sz v4, v4, v5, #12, \sz // t11
+ sqrshrn_sz v5, v6, v7, #12, \sz // t12
+ smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
+ sqrshrn_sz v2, v2, v3, #12, \sz // t10a
+ sqrshrn_sz v3, v6, v7, #12, \sz // t13a
+
+ sqadd v6\sz, v16\sz, v31\sz // out0
+ sqsub v31\sz, v16\sz, v31\sz // out15
+ mov v16\szb, v6\szb
+ sqadd v23\sz, v30\sz, v17\sz // out7
+ sqsub v7\sz, v30\sz, v17\sz // out8
+ sqadd v17\sz, v18\sz, v27\sz // out1
+ sqsub v30\sz, v18\sz, v27\sz // out14
+ sqadd v18\sz, v20\sz, v3\sz // out2
+ sqsub v29\sz, v20\sz, v3\sz // out13
+ sqadd v3\sz, v28\sz, v19\sz // out6
+ sqsub v25\sz, v28\sz, v19\sz // out9
+ sqadd v19\sz, v22\sz, v5\sz // out3
+ sqsub v28\sz, v22\sz, v5\sz // out12
+ sqadd v20\sz, v24\sz, v4\sz // out4
+ sqsub v27\sz, v24\sz, v4\sz // out11
+ sqadd v21\sz, v26\sz, v2\sz // out5
+ sqsub v26\sz, v26\sz, v2\sz // out10
+ mov v24\szb, v7\szb // out8 was parked in v7 while v24 was still needed as an input
+ mov v22\szb, v3\szb // out6 was parked in v3 while v22 was still needed as an input
+.endm
+
+// 16-point inverse DCT over v16-v31, eight 16-bit lanes per register.
+// Loads 16 halfwords from idct_coeffs into v0/v1 and expands idct_16 with
+// the .8h/.16b arrangements. x16 is used as scratch for the table address.
+function inv_dct_8h_x16_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ idct_16 .8h, .16b
+ ret
+endfunc
+
+// 16-point inverse DCT over v16-v31, four 16-bit lanes per register.
+// Same as the 8h variant except that idct_16 is expanded with the .4h/.8b
+// arrangements. x16 is used as scratch for the table address.
+function inv_dct_4h_x16_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ idct_16 .4h, .8b
+ ret
+endfunc
+
+// 16-point inverse ADST on the inputs held in v16-v31.
+//   o0-o15: registers receiving out0-out15. The callers in this file pass
+//           either v16..v31 (adst) or v31..v16 (flipadst).
+//   sz:     lane arrangement of the 16-bit data (.8h or .4h).
+//   szb:    matching byte arrangement (.16b or .8b), used for plain moves.
+// v0/v1 hold the coefficients (iadst16_coeffs first, later the first eight
+// halfwords of idct_coeffs), v2-v7 are scratch and x16 is clobbered.
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb
+ movrel x16, iadst16_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ // Prepare the address of idct_coeffs now; it is loaded after the first stage.
+ movrel x16, idct_coeffs
+
+ smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0
+ smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1
+ smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0
+ sqrshrn_sz v31, v4, v5, #12, \sz // t1
+ smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3
+ smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2
+ sqrshrn_sz v29, v2, v3, #12, \sz // t3
+ smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5
+ smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4
+ sqrshrn_sz v27, v6, v7, #12, \sz // t5
+ smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7
+ smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6
+ sqrshrn_sz v25, v4, v5, #12, \sz // t7
+ smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9
+ smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10
+ sqrshrn_sz v23, v6, v7, #12, \sz // t8
+ sqrshrn_sz v24, v2, v3, #12, \sz // t9
+ smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11
+ smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10
+ sqrshrn_sz v26, v6, v7, #12, \sz // t11
+ smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13
+ smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14
+ sqrshrn_sz v19, v2, v3, #12, \sz // t12
+ sqrshrn_sz v28, v4, v5, #12, \sz // t13
+ smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15
+ sqrshrn_sz v17, v6, v7, #12, \sz // t14
+ sqrshrn_sz v30, v2, v3, #12, \sz // t15
+
+ // The iadst16 coefficients are no longer needed; switch v0 to idct_coeffs.
+ ld1 {v0.8h}, [x16]
+
+ sqsub v2\sz, v16\sz, v23\sz // t8a
+ sqadd v16\sz, v16\sz, v23\sz // t0a
+ sqsub v3\sz, v31\sz, v24\sz // t9a
+ sqadd v31\sz, v31\sz, v24\sz // t1a
+ sqadd v23\sz, v18\sz, v21\sz // t2a
+ sqsub v18\sz, v18\sz, v21\sz // t10a
+ sqadd v24\sz, v29\sz, v26\sz // t3a
+ sqsub v29\sz, v29\sz, v26\sz // t11a
+ sqadd v21\sz, v20\sz, v19\sz // t4a
+ sqsub v20\sz, v20\sz, v19\sz // t12a
+ sqadd v26\sz, v27\sz, v28\sz // t5a
+ sqsub v27\sz, v27\sz, v28\sz // t13a
+ sqadd v19\sz, v22\sz, v17\sz // t6a
+ sqsub v22\sz, v22\sz, v17\sz // t14a
+ sqadd v28\sz, v25\sz, v30\sz // t7a
+ sqsub v25\sz, v25\sz, v30\sz // t15a
+
+ smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8
+ smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9
+ smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10
+ sqrshrn_sz v17, v4, v5, #12, \sz // t8
+ sqrshrn_sz v30, v6, v7, #12, \sz // t9
+ smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11
+ smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12
+ sqrshrn_sz v18, v2, v3, #12, \sz // t10
+ sqrshrn_sz v29, v4, v5, #12, \sz // t11
+ smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13
+ smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14
+ sqrshrn_sz v27, v6, v7, #12, \sz // t12
+ sqrshrn_sz v20, v2, v3, #12, \sz // t13
+ smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15
+ sqrshrn_sz v25, v4, v5, #12, \sz // t14
+ sqrshrn_sz v22, v6, v7, #12, \sz // t15
+
+ sqsub v2\sz, v16\sz, v21\sz // t4
+ sqadd v16\sz, v16\sz, v21\sz // t0
+ sqsub v3\sz, v31\sz, v26\sz // t5
+ sqadd v31\sz, v31\sz, v26\sz // t1
+ sqadd v21\sz, v23\sz, v19\sz // t2
+ sqsub v23\sz, v23\sz, v19\sz // t6
+ sqadd v26\sz, v24\sz, v28\sz // t3
+ sqsub v24\sz, v24\sz, v28\sz // t7
+ sqadd v19\sz, v17\sz, v27\sz // t8a
+ sqsub v17\sz, v17\sz, v27\sz // t12a
+ sqadd v28\sz, v30\sz, v20\sz // t9a
+ sqsub v30\sz, v30\sz, v20\sz // t13a
+ sqadd v27\sz, v18\sz, v25\sz // t10a
+ sqsub v18\sz, v18\sz, v25\sz // t14a
+ sqadd v20\sz, v29\sz, v22\sz // t11a
+ sqsub v29\sz, v29\sz, v22\sz // t15a
+
+ smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a
+ smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a
+ smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
+ sqrshrn_sz v22, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v25, v6, v7, #12, \sz // t5a
+ smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
+ smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12
+ sqrshrn_sz v24, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t7a
+ smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13
+ smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14
+ sqrshrn_sz v17, v6, v7, #12, \sz // t12
+ smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15
+ sqrshrn_sz v29, v2, v3, #12, \sz // t13
+ sqrshrn_sz v30, v4, v5, #12, \sz // t14
+ sqrshrn_sz v18, v6, v7, #12, \sz // t15
+
+ sqsub v2\sz, v16\sz, v21\sz // t2a
+ // When o0 is v16 the result can be written in place. Otherwise (flipped
+ // order, o0 is v31) v31 is still read below, so out0 is staged in v4 and
+ // moved into o0 only after v31 has been consumed.
+.ifc \o0, v16
+ sqadd \o0\sz, v16\sz, v21\sz // out0
+ sqsub v21\sz, v31\sz, v26\sz // t3a
+ sqadd \o15\sz, v31\sz, v26\sz // out15
+.else
+ sqadd v4\sz, v16\sz, v21\sz // out0
+ sqsub v21\sz, v31\sz, v26\sz // t3a
+ sqadd \o15\sz, v31\sz, v26\sz // out15
+ mov \o0\szb, v4\szb
+.endif
+ sqneg \o15\sz, \o15\sz // out15
+
+ sqsub v3\sz, v29\sz, v18\sz // t15a
+ sqadd \o13\sz, v29\sz, v18\sz // out13
+ sqadd \o2\sz, v17\sz, v30\sz // out2
+ sqsub v26\sz, v17\sz, v30\sz // t14a
+ sqneg \o13\sz, \o13\sz // out13
+
+ sqadd \o1\sz, v19\sz, v27\sz // out1
+ sqsub v27\sz, v19\sz, v27\sz // t10
+ sqadd \o14\sz, v28\sz, v20\sz // out14
+ sqsub v20\sz, v28\sz, v20\sz // t11
+ sqneg \o1\sz, \o1\sz // out1
+
+ sqadd \o3\sz, v22\sz, v24\sz // out3
+ sqsub v22\sz, v22\sz, v24\sz // t6
+ sqadd \o12\sz, v25\sz, v23\sz // out12
+ sqsub v23\sz, v25\sz, v23\sz // t7
+ sqneg \o3\sz, \o3\sz // out3
+
+ smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
+ smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
+ smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
+
+ sqrshrn_sz v24, v24, v25, #12, \sz // out8
+ sqrshrn_sz v4, v4, v5, #12, \sz // out7
+ sqrshrn_sz v5, v6, v7, #12, \sz // out5
+ smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
+ smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
+ sqrshrn_sz v26, v6, v7, #12, \sz // out10
+
+ smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
+ smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
+ smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
+
+ sqrshrn_sz \o4, v2, v3, #12, \sz // out4
+ sqrshrn_sz v6, v6, v7, #12, \sz // out11
+ sqrshrn_sz v7, v21, v25, #12, \sz // out9
+ sqrshrn_sz \o6, v22, v23, #12, \sz // out6
+
+ // out8 and out10 were produced in v24 and v26. In the flipped order their
+ // destinations differ (o8 is v23), so move them; in the normal order they
+ // are already in place.
+.ifc \o8, v23
+ mov \o8\szb, v24\szb
+ mov \o10\szb, v26\szb
+.endif
+
+ sqneg \o7\sz, v4\sz // out7
+ sqneg \o5\sz, v5\sz // out5
+ sqneg \o11\sz, v6\sz // out11
+ sqneg \o9\sz, v7\sz // out9
+.endm
+
+// 16-point inverse ADST on v16-v31 (8 lanes each); out0..out15 end up in
+// v16..v31 in ascending order.
+function inv_adst_8h_x16_neon, export=1
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
+ ret
+endfunc
+
+// 16-point inverse flipped ADST on v16-v31 (8 lanes each): the same iadst_16
+// expansion, but with the output registers listed in reverse so that
+// out0..out15 end up in v31..v16.
+function inv_flipadst_8h_x16_neon, export=1
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
+ ret
+endfunc
+
+// 16-point inverse ADST on v16-v31 using only four 16-bit lanes per
+// register; out0..out15 end up in v16..v31.
+function inv_adst_4h_x16_neon, export=1
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
+ ret
+endfunc
+
+// 16-point inverse flipped ADST on v16-v31 using four 16-bit lanes per
+// register; out0..out15 end up in v31..v16.
+function inv_flipadst_4h_x16_neon, export=1
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
+ ret
+endfunc
+
+// 16-point identity transform on v16-v31 (8 lanes each).
+// Each value x becomes sat(sat(2*x) + sqrdmulh(x, k)) with
+// k = 2*(5793-4096)*8, i.e. roughly x * 2*5793/4096.
+// Clobbers w16, v0 and v2.
+function inv_identity_8h_x16_neon, export=1
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ // v2 = fractional part of the scale, then x = 2*x + v2 (saturating).
+ sqrdmulh v2.8h, \i, v0.h[0]
+ sqadd \i, \i, \i
+ sqadd \i, \i, v2.8h
+.endr
+ ret
+endfunc
+
+// 16-point identity transform on v16-v31 using four 16-bit lanes each.
+// Each value x becomes sat(sat(2*x) + sqrdmulh(x, k)) with
+// k = 2*(5793-4096)*8, i.e. roughly x * 2*5793/4096.
+// Clobbers w16, v0 and v2.
+function inv_identity_4h_x16_neon, export=1
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ // v2 = fractional part of the scale, then x = 2*x + v2 (saturating).
+ sqrdmulh v2.4h, \i, v0.h[0]
+ sqadd \i, \i, \i
+ sqadd \i, \i, v2.4h
+.endr
+ ret
+endfunc
+
+// Identity scaling of v16-v31 (.8h) combined with a 2-bit downshift.
+//   c: scalar lane holding the multiplier; the caller in this file passes a
+//      lane loaded with 2*(5793-4096)*8.
+// Per element: v2 = sqrdmulh(x, c) >> 1 (truncating), then x becomes the
+// rounding halving add (x + v2 + 1) >> 1. Clobbers v2.
+.macro identity_8x16_shift2 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ sshr v2.8h, v2.8h, #1
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
+// Identity scaling of v16-v31 (.8h) combined with a 1-bit downshift.
+//   c: scalar lane holding the multiplier; callers in this file pass a lane
+//      loaded with 2*(5793-4096)*8.
+// Per element: v2 = rounding (sqrdmulh(x, c) >> 1), then x = sat(x + v2).
+// Clobbers v2.
+.macro identity_8x16_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+// Same operation as identity_8x16_shift1 but only on v16-v23 (.8h):
+// v2 = rounding (sqrdmulh(x, c) >> 1), then x = sat(x + v2).
+//   c: scalar lane holding the multiplier. Clobbers v2.
+.macro identity_8x8_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+// Unshifted identity scaling of v16-v23 (.8h):
+// x = sat(sat(2*x) + sqrdmulh(x, c)).
+//   c: scalar lane holding the multiplier. Clobbers v2.
+.macro identity_8x8 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+// Generates inv_txfm_horz<suffix>_16x8_neon, the first (horizontal) pass
+// over an 8-row slice of a 16-wide block.
+// Macro parameters:
+//   scale:    if nonzero, pre-scale the input via scale_input with 2896*8.
+//   identity: if nonzero, apply identity_8x16_shift2 inline instead of
+//             calling the transform function in x4.
+//   shift:    rounding right shift applied after the transform (0 = none).
+//   suffix:   inserted into the generated function name.
+// Register interface of the generated function, as used by the body:
+//   x7: input coefficients, read 16 bytes at a time and advanced by x8
+//       bytes per load; every loaded vector is overwritten with zeros.
+//   x6: output buffer, written contiguously (16 vectors of 16 bytes).
+//   x4: 16-point transform to call (unused by the identity variant).
+// The return address is kept in x14 because blr x4 overwrites x30.
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x8_neon
+ AARCH64_VALID_CALL_TARGET
+ mov x14, x30
+ movi v7.8h, #0
+.if \identity
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.elseif \scale
+ mov w16, #2896*8
+ dup v0.4h, w16
+.endif
+ // Load the 16 input vectors and clear the coefficient buffer behind us.
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+.if \scale
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+.if \identity
+ identity_8x16_shift2 v0.h[0]
+.else
+ blr x4
+.endif
+.if \shift > 0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ srshr \i, \i, #\shift
+.endr
+.endif
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+ // Interleave the two transposed halves so that each pair of stores forms
+ // one 16-element row in the output buffer.
+.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ ret x14
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+
+// Second (vertical) pass for an 8-wide, 16-tall column slice.
+//   x7: intermediate buffer, one 16-byte vector loaded per row, advanced by
+//       x8 bytes after each load.
+//   x5: 16-point transform to call on v16-v31.
+//   x6: passed to load_add_store_8x16 together with x7 (x7 is free again
+//       once the loads are done).
+// The return address is kept in x14 because blr x5 overwrites x30.
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ ret x14
+endfunc
+
+// Shared 16x16 driver: two horizontal passes into a 512-byte stack buffer
+// (16*16 halfwords), then two vertical passes that add into the destination.
+// Inputs as used by the body:
+//   x0:  destination (advanced by 8 bytes for the second column slice).
+//   x2:  coefficient buffer.
+//   w3:  compared against w13 to decide whether the second 8-row slice of
+//        the first pass can be skipped (presumably the eob; confirm with
+//        the callers).
+//   x9:  first-pass function, called once per slice.
+//   x4/x5: first/second-pass transform pointers consumed by the callees.
+// The return address is kept in x15; the callees use x14 for theirs.
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+.irp i, 0, 8
+ add x6, sp, #(\i*16*2)
+.if \i == 8
+ // Skip the second slice when w3 is below the threshold in w13.
+ cmp w3, w13
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #16*2
+ blr x9
+.endr
+ b 2f
+1:
+ // Second slice skipped: fill its half of the stack buffer (4 * 64 bytes at
+ // x6 = sp + 256) with zeros instead.
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+2:
+ // Vertical pass over two 8-column slices; rows in the buffer are 32 bytes
+ // apart.
+.irp i, 0, 8
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+// Generates the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_16x16_8bpc_neon.
+//   txfm1:    first-pass transform name; identity selects the dedicated
+//             identity first-pass function and needs no x4.
+//   txfm2:    second-pass transform name.
+//   eob_half: threshold placed in x13 and compared against w3 by
+//             inv_txfm_add_16x16_neon to skip the second first-pass slice.
+// The entry point only sets up x9, x4, x5 and x13 and then tail-branches to
+// the shared driver. For dct_dct, idct_dc is expanded first (defined outside
+// this section; presumably the DC-only shortcut).
+.macro def_fn_16x16 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+.ifc \txfm1, identity
+ adr x9, inv_txfm_horz_identity_16x8_neon
+.else
+ adr x9, inv_txfm_horz_16x8_neon
+ adr x4, inv_\txfm1\()_8h_x16_neon
+.endif
+ adr x5, inv_\txfm2\()_8h_x16_neon
+ mov x13, #\eob_half
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+// Generates the shared 16x4 and 4x16 drivers.
+//   variant: empty for the generic version (first pass via blr x4) or
+//            identity_ for the version with an inline identity first pass.
+// Register usage common to both generated functions:
+//   x0: destination, x2: coefficients (zeroed as they are read),
+//   x4: first-pass transform (generic variant only),
+//   x5: second-pass transform, x15: saved return address.
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+ mov x15, x30
+ movi v4.8h, #0
+
+.ifc \variant, identity_
+ // Load 4-lane groups straight into the low and high halves of v16-v19 and
+ // v20-v23, giving the same layout the generic path builds with ins below.
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v16.d, v17.d, v18.d, v19.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v20.d, v21.d, v22.d, v23.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ // Note: the macro processes v16-v31 although only v16-v23 were loaded here.
+ identity_8x16_shift1 v0.h[0]
+.else
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ blr x4
+
+ // Pack outputs 0-3 and 4-7 into full 8-lane registers for the first half.
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ // Second half: bring the remaining first-pass outputs into v16-v19.
+.ifc \variant, identity_
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+.else
+ ins v24.d[1], v28.d[0]
+ ins v25.d[1], v29.d[0]
+ ins v26.d[1], v30.d[0]
+ ins v27.d[1], v31.d[0]
+ srshr v16.8h, v24.8h, #1
+ srshr v17.8h, v25.8h, #1
+ srshr v18.8h, v26.8h, #1
+ srshr v19.8h, v27.8h, #1
+.endif
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ // Second half goes 8 bytes to the right of the first one.
+ add x6, x0, #8
+ load_add_store_8x4 x6, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+ mov x15, x30
+ movi v2.8h, #0
+
+ mov x11, #32
+ // When w3 is below the threshold in w13, the coefficients starting at
+ // x2 + 16 are not read and v24-v31 are zeroed instead.
+ cmp w3, w13
+ b.lt 1f
+
+ add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
+ blr x4
+ srshr v24.8h, v16.8h, #1
+ srshr v25.8h, v17.8h, #1
+ srshr v26.8h, v18.8h, #1
+ srshr v27.8h, v19.8h, #1
+.endif
+ transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
+ // Split the upper halves off into v28-v31 for the 4-lane second pass.
+ ins v28.d[0], v24.d[1]
+ ins v29.d[0], v25.d[1]
+ ins v30.d[0], v26.d[1]
+ ins v31.d[0], v27.d[1]
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ // v2 is set to zero again here; the path above may have gone through
+ // blr x4 since the first movi.
+ movi v2.8h, #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x2]
+ st1 {v2.8h}, [x2], x11
+.endr
+.ifc \variant, identity_
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+.else
+ blr x4
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ ins v20.d[0], v16.d[1]
+ ins v21.d[0], v17.d[1]
+ ins v22.d[0], v18.d[1]
+ ins v23.d[0], v19.d[1]
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ ret x15
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+// Generates the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_8bpc_neon for the 4x16 / 16x4 sizes.
+//   w, h:     block dimensions (4,16 or 16,4).
+//   txfm1:    first-pass transform; identity routes to the identity_ driver.
+//   txfm2:    second-pass transform.
+//   eob_half: threshold loaded into w13, only for w == 4 (the 4x16 driver is
+//             the one that compares w3 against w13).
+// x4/x5 get the first/second-pass transform addresses; the lane width of
+// each (8h or 4h) depends on which dimension is 4.
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_4h_x\h\()_neon
+ mov w13, #\eob_half
+.else
+ adr x4, inv_\txfm1\()_4h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+// Instantiates def_fn_416 for all 16 transform-type combinations of one
+// block size. The last argument of each line is the eob_half threshold
+// forwarded to def_fn_416.
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+// Generates the shared 16x8 and 8x16 drivers.
+//   variant: empty for the generic version (first pass via blr x4) or
+//            identity_ for the version with an inline identity first pass.
+// Register usage common to both generated functions:
+//   x0: destination, x2: coefficients (zeroed as they are read),
+//   x4: first-pass transform (generic variant only),
+//   x5: second-pass transform, x15: saved return address.
+// Both functions pre-scale the input via scale_input with 2896*8.
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+ mov x15, x30
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x2]
+ st1 {v4.8h}, [x2], #16
+.endr
+
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.ifc \variant, identity_
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x16_shift1 v0.h[0]
+.else
+ blr x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+ // Second 8x8 half: move the remaining first-pass outputs (v24-v31) into
+ // v16-v23; the generic variant still needs its 1-bit rounding downshift.
+.ifc \variant, identity_
+ mov v16.16b, v24.16b
+ mov v17.16b, v25.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v27.16b
+ mov v20.16b, v28.16b
+ mov v21.16b, v29.16b
+ mov v22.16b, v30.16b
+ mov v23.16b, v31.16b
+.else
+ srshr v16.8h, v24.8h, #1
+ srshr v17.8h, v25.8h, #1
+ srshr v18.8h, v26.8h, #1
+ srshr v19.8h, v27.8h, #1
+ srshr v20.8h, v28.8h, #1
+ srshr v21.8h, v29.8h, #1
+ srshr v22.8h, v30.8h, #1
+ srshr v23.8h, v31.8h, #1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ // Second half goes 8 bytes to the right of the first one.
+ add x0, x0, #8
+ load_add_store_8x8 x0, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+ mov x15, x30
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ mov x11, #32
+
+ // When w3 is below the threshold in w13, the coefficients starting at
+ // x2 + 16 are not read and v24-v31 are zeroed instead.
+ cmp w3, w13
+ b.lt 1f
+
+ add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ srshr v24.8h, v16.8h, #1
+ srshr v25.8h, v17.8h, #1
+ srshr v26.8h, v18.8h, #1
+ srshr v27.8h, v19.8h, #1
+ srshr v28.8h, v20.8h, #1
+ srshr v29.8h, v21.8h, #1
+ srshr v30.8h, v22.8h, #1
+ srshr v31.8h, v23.8h, #1
+.endif
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ // v4 and v0 are set up again here; the path above may have gone through
+ // blr x4 since they were first initialized.
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v4.8h}, [x2], x11
+.endr
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blr x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
+.endr
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ret x15
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+// Generates the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_8bpc_neon for the 8x16 / 16x8 sizes.
+//   w, h:     block dimensions (8,16 or 16,8).
+//   txfm1:    first-pass transform; identity routes to the identity_ driver.
+//   txfm2:    second-pass transform.
+//   eob_half: threshold loaded into x13, only for w == 8 (the 8x16 driver is
+//             the one that compares w3 against w13).
+// x4/x5 get the addresses of the 8-lane first/second-pass transforms.
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
+.if \w == 8
+ mov x13, #\eob_half
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+// Instantiates def_fn_816 for all 16 transform-type combinations of one
+// block size. The last argument of each line is the eob_half threshold
+// forwarded to def_fn_816.
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+// Odd half of a 32-point inverse DCT, operating on v16-v31 (8 lanes each).
+// Per the stage comments below, the results are left in register order:
+// v16 = out16 (t16), v17 = out17, ..., v23 = t23a, v24 = t24a, ...,
+// v31 = out31. Combining them with the even half is left to the callers.
+// Clobbers v0-v7 and x16.
+function inv_dct32_odd_8h_x16_neon, export=1
+ // First-stage coefficients sit 2*16 bytes into idct_coeffs.
+ movrel x16, idct_coeffs, 2*16
+ ld1 {v0.8h, v1.8h}, [x16]
+ // Rewind x16 to the start of idct_coeffs for the reload further down.
+ sub x16, x16, #2*16
+
+ smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
+ smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
+ smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
+ sqrshrn_sz v16, v2, v3, #12, .8h // t16a
+ sqrshrn_sz v31, v4, v5, #12, .8h // t31a
+ smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
+ smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t17a
+ sqrshrn_sz v23, v2, v3, #12, .8h // t30a
+ smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
+ smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t29a
+ smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
+ smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t19a
+ sqrshrn_sz v19, v4, v5, #12, .8h // t28a
+ smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
+ smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
+ sqrshrn_sz v18, v6, v7, #12, .8h // t20a
+ sqrshrn_sz v29, v2, v3, #12, .8h // t27a
+ smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
+ smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
+ sqrshrn_sz v26, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v21, v6, v7, #12, .8h // t26a
+ smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
+ smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
+ sqrshrn_sz v22, v2, v3, #12, .8h // t22a
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25a
+ smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
+ sqrshrn_sz v30, v6, v7, #12, .8h // t23a
+ sqrshrn_sz v17, v2, v3, #12, .8h // t24a
+
+ // Remaining stages use the first eight halfwords of idct_coeffs.
+ ld1 {v0.8h}, [x16]
+
+ sqsub v2.8h, v16.8h, v24.8h // t17
+ sqadd v16.8h, v16.8h, v24.8h // t16
+ sqsub v3.8h, v31.8h, v23.8h // t30
+ sqadd v31.8h, v31.8h, v23.8h // t31
+ sqsub v24.8h, v28.8h, v20.8h // t18
+ sqadd v28.8h, v28.8h, v20.8h // t19
+ sqadd v23.8h, v18.8h, v26.8h // t20
+ sqsub v18.8h, v18.8h, v26.8h // t21
+ sqsub v20.8h, v30.8h, v22.8h // t22
+ sqadd v30.8h, v30.8h, v22.8h // t23
+ sqadd v26.8h, v17.8h, v25.8h // t24
+ sqsub v17.8h, v17.8h, v25.8h // t25
+ sqsub v22.8h, v29.8h, v21.8h // t26
+ sqadd v29.8h, v29.8h, v21.8h // t27
+ sqadd v25.8h, v19.8h, v27.8h // t28
+ sqsub v19.8h, v19.8h, v27.8h // t29
+
+ smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a
+ smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a
+ smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t17a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t30a
+ neg v2.4s, v2.4s // -> t18a
+ neg v3.4s, v3.4s // -> t18a
+ smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
+ smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
+ sqrshrn_sz v19, v2, v3, #12, .8h // t18a
+ sqrshrn_sz v24, v4, v5, #12, .8h // t29a
+ smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
+ smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
+ sqrshrn_sz v22, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v18, v2, v3, #12, .8h // t26a
+ neg v4.4s, v4.4s // -> t22a
+ neg v5.4s, v5.4s // -> t22a
+ smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
+ sqrshrn_sz v17, v4, v5, #12, .8h // t22a
+ sqrshrn_sz v20, v6, v7, #12, .8h // t25a
+
+ sqsub v2.8h, v27.8h, v24.8h // t29
+ sqadd v27.8h, v27.8h, v24.8h // t30
+ sqsub v3.8h, v21.8h, v19.8h // t18
+ sqadd v21.8h, v21.8h, v19.8h // t17
+ sqsub v24.8h, v16.8h, v28.8h // t19a
+ sqadd v16.8h, v16.8h, v28.8h // t16a
+ sqsub v19.8h, v30.8h, v23.8h // t20a
+ sqadd v30.8h, v30.8h, v23.8h // t23a
+ sqsub v28.8h, v17.8h, v22.8h // t21
+ sqadd v17.8h, v17.8h, v22.8h // t22
+ sqadd v23.8h, v26.8h, v29.8h // t24a
+ sqsub v26.8h, v26.8h, v29.8h // t27a
+ sqadd v22.8h, v20.8h, v18.8h // t25
+ sqsub v20.8h, v20.8h, v18.8h // t26
+ sqsub v29.8h, v31.8h, v25.8h // t28a
+ sqadd v31.8h, v31.8h, v25.8h // t31a
+
+ smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a
+ smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a
+ smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19
+ sqrshrn_sz v18, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t29a
+ smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28
+ smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20
+ sqrshrn_sz v29, v2, v3, #12, .8h // t19
+ sqrshrn_sz v24, v4, v5, #12, .8h // t28
+ neg v6.4s, v6.4s // -> t20
+ neg v7.4s, v7.4s // -> t20
+ smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27
+ smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
+ sqrshrn_sz v26, v6, v7, #12, .8h // t20
+ sqrshrn_sz v19, v2, v3, #12, .8h // t27
+ neg v4.4s, v4.4s // -> t21a
+ neg v5.4s, v5.4s // -> t21a
+ smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v28, v6, v7, #12, .8h // t26a
+
+ sqsub v2.8h, v16.8h, v30.8h // t23
+ sqadd v16.8h, v16.8h, v30.8h // t16 = out16
+ sqsub v3.8h, v31.8h, v23.8h // t24
+ sqadd v31.8h, v31.8h, v23.8h // t31 = out31
+ sqsub v23.8h, v21.8h, v17.8h // t22a
+ sqadd v17.8h, v21.8h, v17.8h // t17a = out17
+ sqadd v30.8h, v27.8h, v22.8h // t30a = out30
+ sqsub v21.8h, v27.8h, v22.8h // t25a
+ sqsub v27.8h, v18.8h, v20.8h // t21
+ sqadd v18.8h, v18.8h, v20.8h // t18 = out18
+ sqadd v4.8h, v29.8h, v26.8h // t19a = out19
+ sqsub v26.8h, v29.8h, v26.8h // t20a
+ sqadd v29.8h, v25.8h, v28.8h // t29 = out29
+ sqsub v25.8h, v25.8h, v28.8h // t26
+ sqadd v28.8h, v24.8h, v19.8h // t28a = out28
+ sqsub v24.8h, v24.8h, v19.8h // t27a
+ // out19 was staged in v4 because v19 was still an input just above.
+ mov v19.16b, v4.16b // out19
+
+ smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20
+ smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27
+ sqrshrn_sz v20, v4, v5, #12, .8h // t20
+ sqrshrn_sz v22, v6, v7, #12, .8h // t27
+
+ smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
+ smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
+ mov v27.16b, v22.16b // t27
+ sqrshrn_sz v26, v4, v5, #12, .8h // t26a
+
+ smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
+ smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25
+ sqrshrn_sz v21, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v22, v24, v25, #12, .8h // t22
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25
+
+ smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a
+ smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a
+ sqrshrn_sz v23, v4, v5, #12, .8h // t23a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t24a
+
+ ret
+endfunc
+
+// Generates inv_txfm_horz<suffix>_dct_32x8_neon, the first (horizontal) DCT
+// pass over an 8-row slice of a 32-wide block.
+// Macro parameters:
+//   scale:  if nonzero, pre-scale the input via scale_input with 2896*8.
+//   shift:  rounding right shift applied when the halves are combined.
+//   suffix: inserted into the generated function name.
+// Register interface of the generated function, as used by the body:
+//   x7: input coefficients; every loaded vector is overwritten with zeros.
+//   x8: input stride in bytes; doubled on entry and left doubled on return.
+//   x6: output buffer with 64-byte rows; advanced by 8 rows on return.
+// The return address is kept in x14.
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x8_neon
+ mov x14, x30
+ movi v7.8h, #0
+ // Double the stride so the first load loop picks up every other input
+ // vector (the even-indexed ones) for the 16-point DCT.
+ lsl x8, x8, #1
+.if \scale
+ mov w16, #2896*8
+ dup v0.4h, w16
+.endif
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+ // Rewind the 16 doubled strides, then step forward by one original stride
+ // so that the second load loop reads the odd-indexed input vectors.
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_8h_x16_neon
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+ // store1: write 32 bytes of even-half results at the start of a 64-byte
+ // output row and skip the other 32 bytes.
+.macro store1 r0, r1
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ add x6, x6, #32
+.endm
+ store1 v16.8h, v24.8h
+ store1 v17.8h, v25.8h
+ store1 v18.8h, v26.8h
+ store1 v19.8h, v27.8h
+ store1 v20.8h, v28.8h
+ store1 v21.8h, v29.8h
+ store1 v22.8h, v30.8h
+ store1 v23.8h, v31.8h
+.purgem store1
+ // Back to the first of the eight rows just written.
+ sub x6, x6, #64*8
+
+ movi v7.8h, #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.h[1]
+ scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_8h_x16_neon
+ transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+ transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
+ // store2: reload the 32 bytes of even-half results of one row, form
+ // even + odd (kept in order) and even - odd (element order fully reversed
+ // via rev64 + ext #8, and the two registers swapped), apply the rounding
+ // shift and write the complete 64-byte row.
+.macro store2 r0, r1, shift
+ ld1 {v4.8h, v5.8h}, [x6]
+ sqsub v7.8h, v4.8h, \r0
+ sqsub v6.8h, v5.8h, \r1
+ sqadd v4.8h, v4.8h, \r0
+ sqadd v5.8h, v5.8h, \r1
+ rev64 v6.8h, v6.8h
+ rev64 v7.8h, v7.8h
+ srshr v4.8h, v4.8h, #\shift
+ srshr v5.8h, v5.8h, #\shift
+ srshr v6.8h, v6.8h, #\shift
+ srshr v7.8h, v7.8h, #\shift
+ ext v6.16b, v6.16b, v6.16b, #8
+ st1 {v4.8h, v5.8h}, [x6], #32
+ ext v7.16b, v7.16b, v7.16b, #8
+ st1 {v6.8h, v7.8h}, [x6], #32
+.endm
+
+ store2 v31.8h, v23.8h, \shift
+ store2 v30.8h, v22.8h, \shift
+ store2 v29.8h, v21.8h, \shift
+ store2 v28.8h, v20.8h, \shift
+ store2 v27.8h, v19.8h, \shift
+ store2 v26.8h, v18.8h, \shift
+ store2 v25.8h, v17.8h, \shift
+ store2 v24.8h, v16.8h, \shift
+.purgem store2
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+// Second (vertical) 32-point DCT pass for an 8-pixel-wide column slice,
+// adding the result to 8-bit pixels.
+//   x7: intermediate buffer of 16-bit values, one 16-byte vector per row.
+//   x8: row stride of that buffer in bytes; doubled on entry and left
+//       doubled on return.
+//   x6: destination pixels (8 bytes per row), x1: destination stride.
+// The even-half results are written back over the even rows of the buffer
+// and then combined with the odd half while adding to the destination.
+// Clobbers x9, x10 and x14 (saved return address).
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ // Doubled stride: the first loop reads every other row (even rows).
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl inv_dct_8h_x16_neon
+
+ // Park the even-half outputs in the rows they were loaded from.
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ // Rewind and step forward by one original stride to reach the odd rows.
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ // Rewind to the first even row again (start of the parked results).
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl inv_dct32_odd_8h_x16_neon
+
+ // x9 = negative stride for walking the parked results backwards,
+ // x10 = read pointer into the destination (x6 is the write pointer).
+ neg x9, x8
+ mov x10, x6
+ // combine: process four output rows. Each row is op(parked even value,
+ // odd value), rounded down by 4 bits, added to the existing pixels
+ // (widening) and narrowed back to unsigned 8 bit with saturation.
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8b}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8b}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8b}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ uaddw v5.8h, v5.8h, v2.8b
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ sqxtun v2.8b, v5.8h
+ ld1 {v5.8h}, [x7], \stride
+ uaddw v6.8h, v6.8h, v3.8b
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8b}, [x6], x1
+ ld1 {v2.8b}, [x10], x1
+ sqxtun v3.8b, v6.8h
+ uaddw v7.8h, v7.8h, v4.8b
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8b}, [x6], x1
+ sqxtun v4.8b, v7.8h
+ uaddw v5.8h, v5.8h, v2.8b
+ st1 {v4.8b}, [x6], x1
+ sqxtun v2.8b, v5.8h
+ st1 {v2.8b}, [x6], x1
+.endm
+ // First 16 rows: even + odd, walking the parked results forwards and the
+ // odd-half registers from v31 down to v16.
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ // Step back onto the last parked row, then produce the last 16 rows as
+ // even - odd, walking the parked results backwards.
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ ret x14
+endfunc
+
+const eob_32x32 // 16-bit thresholds that functions in this file load and compare against w3
+ .short 36, 136, 300, 1024
+endconst
+
+const eob_16x32 // thresholds loaded by the 16x32, 64x16 and 16x64 functions
+ .short 36, 151, 279, 512
+endconst
+
+const eob_16x32_shortside // two-entry table selected by def_identity_1632 via its suffix arguments
+ .short 36, 512
+endconst
+
+const eob_8x32 // thresholds loaded by the 8x32 and 32x8 identity functions and by the 8x32 dct function
+ .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 // reads coefs at x2 in 8x8 tiles (clearing them), transposes and adds them to pixels at x0; w3 limits the work
+ movi v0.8h, #0 // zero vector written over each coef vector after it is read
+ movrel x13, eob_32x32 // x13: thresholds checked once per band of tiles (outer loop)
+
+ mov x8, #2*32 // byte step between the coef vectors loaded below
+1:
+ mov w9, #0 // w9 counts how far x0 has moved right within this band
+ movrel x12, eob_32x32 // x12: thresholds checked once per tile, restarted for every band
+2:
+ add w9, w9, #8
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #2
+ sub x0, x0, x1, lsl #3 // step x0 back up by 8 rows
+ add x0, x0, #8 // and 8 pixels to the right for the next tile
+ cmp w3, w11
+ b.ge 2b // keep going along the band while w3 reaches the threshold
+
+ ldrh w11, [x13], #2
+ cmp w3, w11
+ b.lt 9f // no further band needed
+
+ sub x0, x0, w9, uxtw // undo the rightward advance of this band
+ add x0, x0, x1, lsl #3 // move down 8 rows
+ msub x2, x8, x9, x2 // x2 -= x8 * x9: rewind over the coef vectors read in this band
+ add x2, x2, #2*8 // then advance by 8 coefs (16 bytes)
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_8_regs op, shift // apply the given shift instruction with an immediate amount to each of v16-v23 in place
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort // emits the identity/identity add function for a 16x32 or 32x16 block; wshort/hshort pick the threshold table suffixes
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ mov w16, #2896*8 // constant placed in v1.h[0], passed to scale_input
+ mov w17, #2*(5793-4096)*8 // constant placed in v1.h[1], passed to the identity_8x8 macros
+ dup v1.4h, w16
+ movi v0.8h, #0 // zero vector used to clear coefs after reading them
+ mov v1.h[1], w17
+ movrel x13, eob_16x32\hshort
+
+ mov x8, #2*\h
+1:
+ mov w9, #0 // w9 counts how far x0 has moved right within this band
+ movrel x12, eob_16x32\wshort
+2:
+ add w9, w9, #8
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+ scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 v1.h[1]
+.else
+ // 32x16
+ shift_8_regs sqshl, 1
+ identity_8x8 v1.h[1]
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #2
+ sub x0, x0, x1, lsl #3 // step x0 back up by 8 rows
+ add x0, x0, #8 // and 8 pixels to the right
+ cmp w3, w11
+ b.ge 2b // next tile in this band while w3 reaches the threshold
+
+ ldrh w11, [x13], #2
+ cmp w3, w11
+ b.lt 9f // no further band needed
+
+ sub x0, x0, w9, uxtw // undo the rightward advance of this band
+ add x0, x0, x1, lsl #3 // move down 8 rows
+ msub x2, x8, x9, x2 // x2 -= x8 * x9: rewind over the coef vectors read in this band
+ add x2, x2, #2*8 // then advance by 8 coefs (16 bytes)
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h // emits the identity/identity add function for an 8x32 or 32x8 block
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ movi v0.8h, #0 // zero vector used to clear coefs after reading them
+ movrel x13, eob_8x32
+
+ mov w8, #2*\h // byte step between coef vectors; read below as x8
+1:
+ ldrh w12, [x13], #2 // threshold for deciding whether another tile follows
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs srshr, 1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ cmp w3, w12 // flags are consumed by the b.lt after load_add_store_8x8, which therefore must leave them intact
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ sub x2, x2, x8, lsl #3 // rewind the coef pointer over the 8 vectors just read
+ add x2, x2, #2*8 // and advance by 8 coefs (16 bytes)
+.else
+ sub x0, x0, x1, lsl #3 // step x0 back up by 8 rows
+ add x0, x0, #8 // and 8 pixels to the right
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 // 32x32 dct/dct: first pass into a stack buffer, second pass added to pixels at x0
+ idct_dc 32, 32, 2 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address; the helpers called below use x14 for theirs
+ sub sp, sp, #2048 // intermediate buffer: 32*32 coefs of 2 bytes
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2 // first threshold
+
+.irp i, 0, 8, 16, 24
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12
+ b.lt 1f // w3 below the threshold: skip the remaining slices
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4 // each pass writes 256 zero bytes = 4 rows of 32 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 // 16x32 dct/dct: first pass into a stack buffer, second pass added to pixels at x0
+ idct_dc 16, 32, 1 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #1024 // intermediate buffer: 16*32 coefs of 2 bytes
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2 // first threshold
+ adr x4, inv_dct_8h_x16_neon // presumably the transform inv_txfm_horz_scale_16x8_neon calls through x4 - confirm
+
+.irp i, 0, 8, 16, 24
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*2)
+.if \i > 0
+ mov w8, #(32 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12
+ b.lt 1f // w3 below the threshold: skip the remaining slices
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #2*32
+ bl inv_txfm_horz_scale_16x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #8 // each pass writes 256 zero bytes = 8 rows of 16 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 // 32x16 dct/dct: first pass into a stack buffer, second pass added to pixels at x0
+ idct_dc 32, 16, 1 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #1024 // intermediate buffer: 32*16 coefs of 2 bytes
+
+ adr x5, inv_dct_8h_x16_neon // presumably the transform inv_txfm_add_vert_8x16_neon calls through x5 - confirm
+
+.irp i, 0, 8
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*2)
+.if \i > 0
+ mov w8, #(16 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, #36
+ b.lt 1f // w3 below 36: skip the second slice
+.endif
+ mov x8, #2*16
+ bl inv_txfm_horz_scale_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4 // each pass writes 256 zero bytes = 4 rows of 32 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 // 8x32 dct/dct: 8-point first pass inline into a stack buffer, then the shared 8x32 vertical pass
+ idct_dc 8, 32, 2 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #512 // intermediate buffer: 8*32 coefs of 2 bytes
+
+ movrel x13, eob_8x32
+
+ movi v28.8h, #0 // zero vector used to clear coefs after reading them
+ mov x8, #2*32
+ mov w9, #32 // rows of the intermediate buffer still to be written
+ mov x6, sp
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
+.endr
+ ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3 // rewind over the 8 vectors just read
+ sub w9, w9, #8
+ add x2, x2, #2*8 // advance by 8 coefs (16 bytes) for the next slice
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ srshr v\i\().8h, v\i\().8h, #2
+.endr
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+ cmp w3, w12
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+ b.ge 1b // another slice while w3 reaches the threshold
+ cbz w9, 3f // buffer already full: nothing to zero
+
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #8 // each pass writes 128 zero bytes = 8 rows of 8 coefs
+.rept 2
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 // 32x8 dct/dct: one 32x8 horizontal pass into a stack buffer, then four 8-point column passes added to pixels at x0
+ idct_dc 32, 8, 2 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #512 // intermediate buffer: 32*8 coefs of 2 bytes
+
+ mov x6, sp
+ mov x7, x2
+ mov x8, #8*2
+ bl inv_txfm_horz_dct_32x8_neon
+
+ mov x8, #2*32 // byte step between rows of the intermediate buffer
+ mov w9, #0 // w9: column offset of the current 8-wide strip
+1:
+ add x6, x0, x9
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp w9, #32 // flags are consumed by the b.lt after load_add_store_8x8, which therefore must leave them intact
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_dct64_step1_neon // takes four inputs in v16-v19, reads 16 constants at x17 and stores eight result vectors at x6; both pointers advance
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.8h, v1.8h}, [x17], #32 // constants for this call; x17 moves on to the next set
+
+ sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a
+ sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a
+ sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a
+ sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a
+ sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a
+ sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a
+ sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a
+ sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a
+
+ sqadd v24.8h, v16.8h, v17.8h // t32
+ sqsub v25.8h, v16.8h, v17.8h // t33
+ sqsub v26.8h, v19.8h, v18.8h // t34
+ sqadd v27.8h, v19.8h, v18.8h // t35
+ sqadd v28.8h, v20.8h, v21.8h // t60
+ sqsub v29.8h, v20.8h, v21.8h // t61
+ sqsub v30.8h, v23.8h, v22.8h // t62
+ sqadd v31.8h, v23.8h, v22.8h // t63
+
+ smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a
+ smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a
+ neg v2.4s, v2.4s // t34a
+ neg v3.4s, v3.4s // t34a
+ smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
+ sqrshrn_sz v26, v2, v3, #12, .8h // t34a
+ smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
+ sqrshrn_sz v29, v4, v5, #12, .8h // t61a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t33a
+ sqrshrn_sz v30, v2, v3, #12, .8h // t62a
+
+ sqadd v16.8h, v24.8h, v27.8h // t32a
+ sqsub v19.8h, v24.8h, v27.8h // t35a
+ sqadd v17.8h, v25.8h, v26.8h // t33
+ sqsub v18.8h, v25.8h, v26.8h // t34
+ sqsub v20.8h, v31.8h, v28.8h // t60a
+ sqadd v23.8h, v31.8h, v28.8h // t63a
+ sqsub v21.8h, v30.8h, v29.8h // t61
+ sqadd v22.8h, v30.8h, v29.8h // t62
+
+ smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
+ smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
+ smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60
+ sqrshrn_sz v21, v2, v3, #12, .8h // t61a
+ sqrshrn_sz v18, v4, v5, #12, .8h // t34a
+ smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35
+ sqrshrn_sz v20, v6, v7, #12, .8h // t60
+ sqrshrn_sz v19, v2, v3, #12, .8h // t35
+
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon // in-place pass over the step1 output: x6 walks up and x9 walks down until they meet
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16] // four constants, used below as v0.h[0], v0.h[2] and v0.h[3]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #2*8*0] // t32a
+ ldr q17, [x9, #2*8*8] // t39a
+ ldr q18, [x9, #2*8*0] // t63a
+ ldr q19, [x6, #2*8*8] // t56a
+ ldr q20, [x6, #2*8*16] // t40a
+ ldr q21, [x9, #2*8*24] // t47a
+ ldr q22, [x9, #2*8*16] // t55a
+ ldr q23, [x6, #2*8*24] // t48a
+
+ sqadd v24.8h, v16.8h, v17.8h // t32
+ sqsub v25.8h, v16.8h, v17.8h // t39
+ sqadd v26.8h, v18.8h, v19.8h // t63
+ sqsub v27.8h, v18.8h, v19.8h // t56
+ sqsub v28.8h, v21.8h, v20.8h // t40
+ sqadd v29.8h, v21.8h, v20.8h // t47
+ sqadd v30.8h, v23.8h, v22.8h // t48
+ sqsub v31.8h, v23.8h, v22.8h // t55
+
+ smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
+ smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
+ smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
+ sqrshrn_sz v25, v2, v3, #12, .8h // t56a
+ sqrshrn_sz v27, v4, v5, #12, .8h // t39a
+ neg v6.4s, v6.4s // t40a
+ neg v7.4s, v7.4s // t40a
+ smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
+ sqrshrn_sz v31, v6, v7, #12, .8h // t40a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t55a
+
+ sqadd v16.8h, v24.8h, v29.8h // t32a
+ sqsub v19.8h, v24.8h, v29.8h // t47a
+ sqadd v17.8h, v27.8h, v31.8h // t39
+ sqsub v18.8h, v27.8h, v31.8h // t40
+ sqsub v20.8h, v26.8h, v30.8h // t48a
+ sqadd v23.8h, v26.8h, v30.8h // t63a
+ sqsub v21.8h, v25.8h, v28.8h // t55
+ sqadd v22.8h, v25.8h, v28.8h // t56
+
+ smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
+ smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
+ smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47
+ sqrshrn_sz v18, v2, v3, #12, .8h // t40a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t55a
+ smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48
+ sqrshrn_sz v19, v6, v7, #12, .8h // t47
+ sqrshrn_sz v20, v2, v3, #12, .8h // t48
+
+ str q16, [x6, #2*8*0] // t32a
+ str q17, [x9, #2*8*0] // t39
+ str q18, [x6, #2*8*8] // t40a
+ str q19, [x9, #2*8*8] // t47
+ str q20, [x6, #2*8*16] // t48
+ str q21, [x9, #2*8*16] // t55a
+ str q22, [x6, #2*8*24] // t56
+ str q23, [x9, #2*8*24] // t63a
+
+ add x6, x6, #2*8 // one vector up
+ sub x9, x9, #2*8 // one vector down
+ cmp x6, x9
+ b.lt 1b // repeat until the two pointers meet
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear // load v16-v23 from src, stepping by strd; when clear is nonzero each location is overwritten with the zero register after reading
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst // store v16-v31 contiguously at dst, advancing dst by 16 bytes per register
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8 // zero v24-v31
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond // emit movi reg, val only when cond is nonzero at assembly time
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond // when cond is nonzero: load val into gpr and broadcast it into reg
+.if \cond
+ mov \gpr, \val
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond // emit st1 regs, dst only when cond is nonzero at assembly time
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond // emit str reg, dst only when cond is nonzero at assembly time
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond // like str_if, for an address operand that was split in two by its comma (base and offset)
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 // invoke scale_input on the listed .8h registers with constant c only when cond is nonzero
+.if \cond
+ scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0 // emits a 64-point DCT over 8 lanes; clear=1 zeroes inputs as they are read, scale=1 scales them first
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
+ mov x14, x30 // keep the return address across the bl calls
+ mov x6, sp // results are written starting at the current sp, which the caller must have reserved
+ lsl x8, x8, #2 // 4x the input stride
+
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.8h, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3 // rewind over the 8 loads
+ add x7, x7, x8, lsr #1 // move on by 2 original strides for the second group
+ scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_8h_x16_neon
+
+ store16 x6
+
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.8h, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1 // x8 is now 2x the original stride
+ sub x7, x7, x8, lsr #1 // x7 ends up one original stride past its entry value
+ scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_8h_x16_neon
+
+ add x10, x6, #16*15 // x10: last of the 32 output slots, written downwards
+ sub x6, x6, #16*16 // x6: back to the first of the 16 stored vectors
+
+ mov x9, #-16
+
+.macro store_addsub r0, r1, r2, r3 // sums replace four stored vectors at x6 (ascending), differences go to x10 (descending)
+ ld1 {v2.8h}, [x6], #16
+ ld1 {v3.8h}, [x6], #16
+ sqadd v6.8h, v2.8h, \r0
+ sqsub \r0, v2.8h, \r0
+ ld1 {v4.8h}, [x6], #16
+ sqadd v7.8h, v3.8h, \r1
+ sqsub \r1, v3.8h, \r1
+ ld1 {v5.8h}, [x6], #16
+ sqadd v2.8h, v4.8h, \r2
+ sub x6, x6, #16*4
+ sqsub \r2, v4.8h, \r2
+ st1 {v6.8h}, [x6], #16
+ st1 {\r0}, [x10], x9
+ sqadd v3.8h, v5.8h, \r3
+ sqsub \r3, v5.8h, \r3
+ st1 {v7.8h}, [x6], #16
+ st1 {\r1}, [x10], x9
+ st1 {v2.8h}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.8h}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.8h, v30.8h, v29.8h, v28.8h
+ store_addsub v27.8h, v26.8h, v25.8h, v24.8h
+ store_addsub v23.8h, v22.8h, v21.8h, v20.8h
+ store_addsub v19.8h, v18.8h, v17.8h, v16.8h
+.purgem store_addsub
+
+ add x6, x6, #2*8*16 // skip to the slot after the 32 vectors written above
+
+ movrel x17, idct64_coeffs // constant table consumed 32 bytes at a time by inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.8h}, [x7] // in1 (offset 0)
+ ld1 {v17.8h}, [x9] // in31 (offset 15)
+ ld1 {v18.8h}, [x10] // in17 (offset 8)
+ ld1 {v19.8h}, [x11] // in15 (offset 7)
+ st1_if {v7.8h}, [x7], \clear
+ st1_if {v7.8h}, [x9], \clear
+ st1_if {v7.8h}, [x10], \clear
+ st1_if {v7.8h}, [x11], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.8h}, [x10] // in7 (offset 3)
+ ld1 {v17.8h}, [x11] // in25 (offset 12)
+ ld1 {v18.8h}, [x9] // in23 (offset 11)
+ ld1 {v19.8h}, [x7] // in9 (offset 4)
+ st1_if {v7.8h}, [x7], \clear
+ st1_if {v7.8h}, [x9], \clear
+ st1_if {v7.8h}, [x10], \clear
+ st1_if {v7.8h}, [x11], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #2*8*32 // back to the start of the 32 vectors written by the four step1 calls
+ add x9, x6, #2*8*7 // x9: upper pointer for step2, walked downwards
+
+ bl inv_dct64_step2_neon
+
+ ret x14
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
+function inv_txfm_horz_dct_64x8_neon // final add/sub of 64 vectors at sp, transposed, shifted by w12 and stored as 8 rows of 64 coefs at x6
+ mov x14, x30
+
+ mov x7, sp // x7 reads the buffer upwards from its start
+ add x8, sp, #2*8*(64 - 4) // x8 reads it downwards from its last four vectors
+ add x9, x6, #2*56 // x9: last 8-coef column group of the output row
+ mov x10, #2*64 // output row stride in bytes
+ mov x11, #-2*8*4
+
+ dup v7.8h, w12 // shift amount for srshl; callers in this file pass a negative value
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+
+.macro store_addsub src0, src1, src2, src3 // sums go to x6; differences are lane-reversed and go to x9
+ sqsub v1.8h, \src0, \src1
+ sqadd v0.8h, \src0, \src1
+ sqsub v3.8h, \src2, \src3
+ srshl v1.8h, v1.8h, v7.8h
+ sqadd v2.8h, \src2, \src3
+ srshl v0.8h, v0.8h, v7.8h
+ srshl v3.8h, v3.8h, v7.8h
+ rev64 v1.8h, v1.8h
+ srshl v2.8h, v2.8h, v7.8h
+ rev64 v3.8h, v3.8h
+ ext v1.16b, v1.16b, v1.16b, #8
+ st1 {v0.8h}, [x6], x10
+ ext v3.16b, v3.16b, v3.16b, #8
+ st1 {v1.8h}, [x9], x10
+ st1 {v2.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ store_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ store_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ store_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem store_addsub
+ sub x6, x6, x10, lsl #3 // back up over the 8 rows just written
+ sub x9, x9, x10, lsl #3
+ add x6, x6, #16 // left pointer moves 8 coefs right
+ sub x9, x9, #16 // right pointer moves 8 coefs left
+
+ cmp x7, x8
+ b.lt 1b // until the two read pointers cross
+ ret x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon // final add/sub of 64 vectors at sp, rounded and added to an 8-pixel-wide, 64-row strip at x6 (row stride x1)
+ mov x14, x30
+ lsl x8, x8, #1 // NOTE(review): x8 is overwritten two lines below before being read, so this result is unused
+
+ mov x7, sp // x7 reads the buffer upwards from its start
+ add x8, sp, #2*8*(64 - 4) // x8 reads it downwards from its last four vectors
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1 // x9: row 63 of the destination strip
+ neg x10, x1 // negative stride for walking x9 upwards
+ mov x11, #-2*8*4
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+.macro add_dest_addsub src0, src1, src2, src3 // sums are added to two rows at x6 (downwards), differences to two rows at x9 (upwards)
+ ld1 {v0.8b}, [x6], x1
+ ld1 {v1.8b}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8b}, [x6]
+ sqsub v5.8h, \src0, \src1
+ ld1 {v3.8b}, [x9]
+ sqadd v6.8h, \src2, \src3
+ sqsub v7.8h, \src2, \src3
+ sub x6, x6, x1 // restore x6 and x9 to the first row of each pair before storing
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr v6.8h, v6.8h, #4
+ uaddw v4.8h, v4.8h, v0.8b
+ srshr v7.8h, v7.8h, #4
+ uaddw v5.8h, v5.8h, v1.8b
+ uaddw v6.8h, v6.8h, v2.8b
+ sqxtun v0.8b, v4.8h
+ uaddw v7.8h, v7.8h, v3.8b
+ sqxtun v1.8b, v5.8h
+ st1 {v0.8b}, [x6], x1
+ sqxtun v2.8b, v6.8h
+ st1 {v1.8b}, [x9], x10
+ sqxtun v3.8b, v7.8h
+ st1 {v2.8b}, [x6], x1
+ st1 {v3.8b}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b // until the two read pointers cross
+
+ ret x14
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 // 64x64 dct/dct: only 32 rows of 32 input coefs are read; both passes use the 64-point helpers
+ idct_dc 64, 64, 2 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 64*32*2+64*8*2 // intermediate buffer plus the scratch area the 64-point helpers use at sp
+ add x5, sp, #64*8*2 // x5: start of the intermediate buffer, above the scratch area
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12 // w12 holds the threshold loaded at the end of the previous slice
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_8h_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+ ldrh w12, [x13], #2 // x12 is free again after the horizontal store; reload it as the next threshold
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2 // each pass writes 256 zero bytes = 2 rows of 64 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2 // release both the buffer and the scratch area
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 // 64x32 dct/dct: scaled 64-point horizontal pass, then the shared 8x32 vertical pass per strip
+ idct_dc 64, 32, 1 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 64*32*2+64*8*2 // intermediate buffer plus the scratch area the 64-point helper uses at sp
+ add x5, sp, #64*8*2 // x5: start of the intermediate buffer, above the scratch area
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12 // w12 holds the threshold loaded at the end of the previous slice
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_8h_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+ ldrh w12, [x13], #2 // x12 is free again after the horizontal store; reload it as the next threshold
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2 // each pass writes 256 zero bytes = 2 rows of 64 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i)
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2 // release both the buffer and the scratch area
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 // 32x64 dct/dct: scaled 32-point horizontal pass over 32 rows, then 64-point vertical pass per strip
+ idct_dc 32, 64, 1 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 32*32*2+64*8*2 // intermediate buffer plus the scratch area the 64-point helpers use at sp
+ add x5, sp, #64*8*2 // x5: start of the intermediate buffer, above the scratch area
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2 // first threshold
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12
+ b.lt 1f // w3 below the threshold: skip the remaining slices
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_scale_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4 // each pass writes 256 zero bytes = 4 rows of 32 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x7, x5, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2 // release both the buffer and the scratch area
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 // 64x16 dct/dct: 64-point horizontal pass over 16 rows, then a 16-point vertical pass per strip
+ idct_dc 64, 16, 2 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 64*16*2+64*8*2 // intermediate buffer plus the scratch area the 64-point helper uses at sp
+ add x4, sp, #64*8*2 // x4 holds the buffer base here because x5 is loaded with a function address below
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 8
+ add x6, x4, #(\i*64*2)
+.if \i > 0
+ mov w8, #(16 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12 // w12 holds the threshold loaded at the end of the previous slice
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #16*2
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_8h_x64_neon
+ add x6, x4, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+ ldrh w12, [x13], #2 // x12 is free again after the horizontal store; reload it as the next threshold
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2 // each pass writes 256 zero bytes = 2 rows of 64 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ adr x5, inv_dct_8h_x16_neon // presumably the transform inv_txfm_add_vert_8x16_neon calls through x5 - confirm
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i)
+ add x7, x4, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2 // release both the buffer and the scratch area
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 // 16x64 dct/dct: 16-point horizontal pass over 32 rows, then 64-point vertical pass per strip
+ idct_dc 16, 64, 2 // macro defined outside this section; presumably the DC-only shortcut - confirm
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 16*32*2+64*8*2 // intermediate buffer plus the scratch area the 64-point helpers use at sp
+ add x5, sp, #64*8*2 // x5: start of the intermediate buffer, above the scratch area
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2 // first threshold
+
+ adr x4, inv_dct_8h_x16_neon // presumably the transform inv_txfm_horz_16x8_neon calls through x4 - confirm
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*16*2)
+.if \i > 0
+ mov w8, #(32 - \i) // rows still unwritten, used by the zero fill at 1: if the branch is taken
+ cmp w3, w12
+ b.lt 1f // w3 below the threshold: skip the remaining slices
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_16x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #8 // each pass writes 256 zero bytes = 8 rows of 16 coefs
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x7, x5, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2 // release both the buffer and the scratch area
+ ret x15
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S
new file mode 100644
index 0000000000..eee3a9636d
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/itx16.S
@@ -0,0 +1,3648 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+// int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
+
+const idct_coeffs, align=4 // 32-bit constants loaded into v0/v1 by the inv_dct_4s_* functions in this file
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784 // idct_4 multiplies by lanes 0, 2 and 3 and then rounds with a shift right by 12
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4 // four groups of 12 words; the code reading this table is outside this chunk
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) // NOTE(review): the *8*(1<<16) form matches the sqrdmulh multiplier built in idct_dc - presumably used the same way, confirm
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4 // the four multipliers loaded into v0 by the iadst_4x4 macro
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4 // read by the iadst_8 macro: first 32 bytes into v0/v1, then the last row into v0
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784 // fetched by the second load in iadst_8, after x16 was post-incremented by 32
+endconst
+
+const iadst16_coeffs, align=4 // read by the iadst_16 macro in two 32-byte loads into v0/v1
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
+.macro mul_mla d, s0, s1, c0, c1 // d = s0*c0 + s1*c1 on four 32-bit lanes; d, s0, s1 are register names without suffix
+ mul \d\().4s, \s0\().4s, \c0
+ mla \d\().4s, \s1\().4s, \c1 // d must differ from s1, since d already holds the first product here
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1 // d = s0*c0 - s1*c1 on four 32-bit lanes; d, s0, s1 are register names without suffix
+ mul \d\().4s, \s0\().4s, \c0
+ mls \d\().4s, \s1\().4s, \c1 // d must differ from s1, since d already holds the first product here
+.endm
+
+.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 // NOTE(review): r2 and r3 are separated by whitespace only; the body and callers treat them as two parameters
+ sqrdmulh \r0\sz, \r0\sz, \c // each register is scaled in place by the multiplier c (sz is the lane suffix, e.g. .4s)
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c // r4-r7 are optional; scaled only when r4 is given
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
+.macro smin_4s r0, r1, r2 // signed minimum on 32-bit lanes; takes bare register names so it can be used inside .irp lists
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2 // signed maximum on 32-bit lanes; takes bare register names so it can be used inside .irp lists
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 // one slot of the add-to-destination pipeline; every stage is emitted only if its argument is non-blank
+.ifnb \load
+ ld1 {\load}, [\src], x1 // read one destination row; x1 is the row stride
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits // rounding shift of a transform output row
+.endif
+.ifnb \addsrc
+ usqadd \adddst, \addsrc // unsigned pixels += signed residual, with unsigned saturation
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h // clamp to the maximum pixel value the caller placed in v7
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1 // write one finished row back
+.endif
+.endm
+.macro load_add_store_8x16 dst, src // adds the 16 rows in v16-v31 (8 x 16-bit each) to the destination; every line below runs a different stage for a different row
+ mov \src, \dst // separate read and write pointers that start at the same address
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
+ load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src // v16 onwards are reused as pixel registers once their residual has been added
+ load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
+ load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
+ load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
+ load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+ load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+ load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+ load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+ load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+ load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+ load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+ load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+ load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src // from here on the pipeline only drains
+ load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+ load_add_store , , , , v27.8h, v26.8h, \dst, \src
+ load_add_store , , , , , v27.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4 // adds the 8 rows in v16-v23 (8 x 16-bit each) to the destination, staged like the 8x16 variant
+ mov \src, \dst // separate read and write pointers that start at the same address
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
+ load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits // v16-v19 are reused as pixel registers once their residual has been added
+ load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
+ load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
+ load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
+ load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits // from here on the pipeline only drains
+ load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4 // adds the 4 rows in v16-v19 (8 x 16-bit each) to the destination
+ mov \src, \dst // separate read and write pointers that start at the same address
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
+ load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits // from here on the pipeline only drains
+ load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src // pipeline slot for 4-pixel rows: two rows share one register, one per 64-bit half
+.ifnb \load
+ ld1 {\load}[0], [\src], x1 // first destination row into the low half
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0] // pack two 4-lane residual rows into one 8-lane register
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4 // rounding shift of the packed residual rows
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1 // second destination row into the high half
+.endif
+.ifnb \addsrc
+ usqadd \adddst, \addsrc // unsigned pixels += signed residual, with unsigned saturation
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1 // write back the first of the two finished rows
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h // clamp to the maximum pixel value the caller placed in v7
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1 // write back the second finished row
+.endif
+.endm
+.macro load_add_store_4x16 dst, src // adds 16 rows of 4 pixels; residual rows arrive in the low halves of v16-v31 and are paired up by the ins stage
+ mov \src, \dst // separate read and write pointers that start at the same address
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
+ load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src // odd registers are free for pixels after being merged into their even neighbour
+ load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
+ load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
+ load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
+ load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src // from here on the pipeline only drains
+ load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
+ load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
+ load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
+ load_add_store4 , , , , , , , v23.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src // adds 8 rows of 4 pixels; residual rows arrive in the low halves of v16-v23 and are paired up by the ins stage
+ mov \src, \dst // separate read and write pointers that start at the same address
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
+ load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src // from here on the pipeline only drains
+ load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
+ load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
+ load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
+ load_add_store4 , , , , , , , v3.d, \dst, \src
+.endm
+
+.macro idct_dc w, h, shift // DC-only shortcut for dct_dct: taken when eob (w3) is zero, otherwise falls through to label 1
+ cbnz w3, 1f // eob != 0: skip the shortcut
+ movz w16, #2896*8, lsl #16 // sqrdmulh multiplier; with 32-bit lanes this gives x*2896/4096 with rounding
+ ld1r {v16.4s}, [x2] // replicate the first (DC) coefficient across all lanes
+ dup v0.2s, w16
+ sqrdmulh v20.4s, v16.4s, v0.s[0]
+ str wzr, [x2] // clear the consumed coefficient
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v20.4s, v20.4s, v0.s[0] // one extra scaling step for 2:1 and 1:2 block shapes
+.endif
+.if \shift > 0
+ sqrshrn v16.4h, v20.4s, #\shift // narrow to 16 bit with the rounding shift requested by the caller
+ sqrshrn2 v16.8h, v20.4s, #\shift
+.else
+ sqxtn v16.4h, v20.4s // no shift requested: saturating narrow only
+ sqxtn2 v16.8h, v20.4s
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[1] // v0.h[1] is the upper half of the constant, 2896*8, so again x*2896/4096
+ srshr v16.8h, v16.8h, #4 // final rounding shift; v16 now holds the value added to every pixel
+ mov w4, #\h // row count for the idct_dc_w*_neon loop
+ b idct_dc_w\w\()_neon // tail jump; that routine returns with a plain ret
+1:
+.endm
+
+function idct_dc_w4_neon // adds the DC value in v16 to a 4-pixel-wide block: x0 = dst, x1 = stride, w4 = row count
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.d}[0], [x0], x1 // four rows of 4 pixels are packed into v0 and v1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[0], [x0], x1
+ subs w4, w4, #4 // four rows per trip; the flags are consumed by b.gt at the bottom
+ ld1 {v1.d}[1], [x0], x1
+ usqadd v0.8h, v16.8h // pixels += DC with unsigned saturation
+ sub x0, x0, x1, lsl #2 // step back four rows so the stores hit the rows just loaded
+ usqadd v1.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h // clamp to 0x3ff
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon // adds the DC value in v16 to an 8-pixel-wide block: x0 = dst, x1 = stride, w4 = row count
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h}, [x0], x1
+ subs w4, w4, #4 // four rows per trip; the flags are consumed by b.gt at the bottom
+ ld1 {v1.8h}, [x0], x1
+ usqadd v0.8h, v16.8h // pixels += DC with unsigned saturation
+ ld1 {v2.8h}, [x0], x1
+ usqadd v1.8h, v16.8h
+ ld1 {v3.8h}, [x0], x1
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ sub x0, x0, x1, lsl #2 // step back four rows so the stores hit the rows just loaded
+ smin v0.8h, v0.8h, v31.8h // clamp to 0x3ff
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.8h}, [x0], x1
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon // adds the DC value in v16 to a 16-pixel-wide block: x0 = dst, x1 = stride, w4 = row count
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h}, [x0], x1
+ subs w4, w4, #2 // two rows per trip; the flags are consumed by b.gt at the bottom
+ ld1 {v2.8h, v3.8h}, [x0], x1
+ usqadd v0.8h, v16.8h // pixels += DC with unsigned saturation
+ usqadd v1.8h, v16.8h
+ sub x0, x0, x1, lsl #1 // step back two rows so the stores hit the rows just loaded
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h // clamp to 0x3ff
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon // adds the DC value in v16 to a 32-pixel-wide block: x0 = dst, x1 = stride, w4 = row count
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // one full row; no post-increment, the store below advances x0
+ subs w4, w4, #1 // one row per trip; the flags are consumed by b.gt at the bottom
+ usqadd v0.8h, v16.8h // pixels += DC with unsigned saturation
+ usqadd v1.8h, v16.8h
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h // clamp to 0x3ff
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon // adds the DC value in v16 to a 64-pixel-wide block: x0 = dst, x1 = stride, w4 = row count
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x1, x1, #64 // the first half-row store post-increments x0 by 64, so the remaining step to the next row is stride - 64
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // first 64 bytes of the row
+ subs w4, w4, #1 // one row per trip; the flags are consumed by b.gt at the bottom
+ usqadd v0.8h, v16.8h // pixels += DC with unsigned saturation
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] // second 64 bytes of the row
+ usqadd v1.8h, v16.8h
+ sub x0, x0, #64 // back to the start of the row for the stores
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ usqadd v4.8h, v16.8h
+ usqadd v5.8h, v16.8h
+ usqadd v6.8h, v16.8h
+ usqadd v7.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h // clamp to 0x3ff
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ smin v6.8h, v6.8h, v31.8h
+ smin v7.8h, v7.8h, v31.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 // x0 + 64 + (stride - 64) = start of the next row
+ b.gt 1b
+ ret
+endfunc
+
+.macro iwht4 // 4-point step used twice by the wht_wht 4x4 function; works in place on v16-v19 using only add, sub and one shift
+ add v16.4s, v16.4s, v17.4s
+ sub v21.4s, v18.4s, v19.4s
+ sub v20.4s, v16.4s, v21.4s
+ sshr v20.4s, v20.4s, #1 // plain arithmetic shift, no rounding
+ sub v18.4s, v20.4s, v17.4s
+ sub v17.4s, v20.4s, v19.4s
+ add v19.4s, v21.4s, v18.4s
+ sub v16.4s, v16.4s, v17.4s // v20 and v21 were temporaries and are left clobbered
+.endm
+
+.macro idct_4 r0, r1, r2, r3 // 4-point inverse DCT on 32-bit lanes, in place in r0-r3; expects the first idct_coeffs row in v0
+ mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] // r1*3784 + r3*1567 when v0 holds the idct4 row
+ mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] // (r0 + r2)*2896
+ mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] // r1*1567 - r3*3784
+ mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] // (r0 - r2)*2896
+ srshr v6.4s, v6.4s, #12 // products are scaled back with a rounding shift by 12
+ srshr v2.4s, v2.4s, #12
+ srshr v7.4s, v4.4s, #12
+ srshr v3.4s, v3.4s, #12
+ sqadd \r0\().4s, v2.4s, v6.4s // final butterflies use saturating add/sub
+ sqsub \r3\().4s, v2.4s, v6.4s
+ sqadd \r1\().4s, v3.4s, v7.4s
+ sqsub \r2\().4s, v3.4s, v7.4s // v2, v3, v4, v6 and v7 are left clobbered
+.endm
+
+function inv_dct_4s_x4_neon // 4-point inverse DCT on v16-v19 (32-bit lanes), in place
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably the landing pad needed because this is reached through blr - TODO confirm
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16] // only the idct4 row is needed
+ idct_4 v16, v17, v18, v19
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3 // 4-point inverse ADST: inputs are always v16-v19, outputs go to the registers named by o0-o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.4s}, [x16] // v0 = 1321, 3803, 2482, 3344
+
+ sub v3.4s, v16.4s, v18.4s
+ mul v4.4s, v16.4s, v0.s[0]
+ mla v4.4s, v18.4s, v0.s[1]
+ mla v4.4s, v19.4s, v0.s[2] // v4 = in0*1321 + in2*3803 + in3*2482
+ mul v7.4s, v17.4s, v0.s[3] // v7 = in1*3344
+ add v3.4s, v3.4s, v19.4s // v3 = in0 - in2 + in3
+ mul v5.4s, v16.4s, v0.s[2]
+ mls v5.4s, v18.4s, v0.s[0]
+ mls v5.4s, v19.4s, v0.s[1] // v5 = in0*2482 - in2*1321 - in3*3803
+
+ add \o3\().4s, v4.4s, v5.4s // all inputs have been consumed into v3-v7 by now, so the outputs may alias v16-v19 in any order
+ mul \o2\().4s, v3.4s, v0.s[3]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ srshr \o0\().4s, \o0\().4s, #12 // scale the products back with a rounding shift by 12
+ srshr \o2\().4s, \o2\().4s, #12
+ srshr \o1\().4s, \o1\().4s, #12
+ srshr \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon // 4-point inverse ADST on v16-v19 (32-bit lanes), in place
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ iadst_4x4 v16, v17, v18, v19 // outputs in natural order
+ ret
+endfunc
+
+function inv_flipadst_4s_x4_neon // same arithmetic as the adst variant, with the four outputs written in reverse register order
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ iadst_4x4 v19, v18, v17, v16 // output 0 lands in v19, output 3 in v16
+ ret
+endfunc
+
+function inv_identity_4s_x4_neon // scales v16-v19 (32-bit lanes) by 5793/4096, computed as x + x*(5793-4096)/4096
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ movz w16, #(5793-4096)*8, lsl #16 // sqrdmulh multiplier that yields x*(5793-4096)/4096 with rounding
+ dup v0.2s, w16
+ sqrdmulh v4.4s, v16.4s, v0.s[0] // fractional parts go to v4-v7
+ sqrdmulh v5.4s, v17.4s, v0.s[0]
+ sqrdmulh v6.4s, v18.4s, v0.s[0]
+ sqrdmulh v7.4s, v19.4s, v0.s[0]
+ sqadd v16.4s, v16.4s, v4.4s // x + fractional part, saturating
+ sqadd v17.4s, v17.4s, v5.4s
+ sqadd v18.4s, v18.4s, v6.4s
+ sqadd v19.4s, v19.4s, v7.4s
+ ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 // 4x4 WHT_WHT inverse transform and add: x0 = dst, x1 = stride, x2 = coeff
+ mov x15, x30 // the shared tail at L(itx_4x4_end) returns through x15
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // all 16 coefficients, 64 bytes
+ st1 {v30.4s, v31.4s}, [x2], #32 // zero the first half of the coefficient buffer
+
+ sshr v16.4s, v16.4s, #2 // input downshift by 2 before the first pass
+ sshr v17.4s, v17.4s, #2
+ sshr v18.4s, v18.4s, #2
+ sshr v19.4s, v19.4s, #2
+
+ iwht4 // first pass
+
+ st1 {v30.4s, v31.4s}, [x2], #32 // zero the second half of the coefficient buffer
+ transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4 // second pass
+
+ ld1 {v0.d}[0], [x0], x1 // load the four destination rows, interleaved with the narrowing
+ sqxtn v16.4h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqxtn2 v16.8h, v17.4s // rows 0 and 1 packed into v16
+ ld1 {v1.d}[0], [x0], x1
+ sqxtn v18.4h, v18.4s
+ ld1 {v1.d}[1], [x0], x1
+ sqxtn2 v18.8h, v19.4s // rows 2 and 3 packed into v18
+
+ b L(itx_4x4_end) // joins the common add/clamp/store tail; this path applies no final rounding shift
+endfunc
+
+function inv_txfm_add_4x4_neon // shared 4x4 body: x4 = first transform (32-bit lanes), x5 = second transform (16-bit lanes), return address in x15
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // all 16 coefficients, 64 bytes
+ st1 {v30.4s, v31.4s}, [x2], #32 // zero the first half of the coefficient buffer
+
+ blr x4 // first pass on v16-v19
+
+ st1 {v30.4s, v31.4s}, [x2], #32 // zero the second half of the coefficient buffer
+ sqxtn v16.4h, v16.4s // saturating narrow to 16 bit for the second pass
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5 // second pass on v16-v19
+
+ ld1 {v0.d}[0], [x0], x1 // load the four destination rows into v0 and v1
+ ld1 {v0.d}[1], [x0], x1
+ ins v16.d[1], v17.d[0] // pack rows 0 and 1 into v16
+ ins v18.d[1], v19.d[0] // pack rows 2 and 3 into v18
+ ld1 {v1.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4 // final rounding shift
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x0, x0, x1, lsl #2 // step back four rows so the stores hit the rows just loaded
+ usqadd v0.8h, v16.8h // pixels += residual with unsigned saturation
+ usqadd v1.8h, v18.8h
+ smin v0.8h, v0.8h, v31.8h // clamp to 0x3ff
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+
+ ret x15 // callers saved their return address in x15 before jumping here
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2 // emits one exported 4x4 entry point that selects the two 1-D transforms and jumps to the shared body
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ mov x15, x30 // the shared body returns with ret x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f // eob != 0: take the regular path
+ movz w16, #2896*8, lsl #16 // sqrdmulh multiplier, same constant as in idct_dc
+ ld1r {v16.4s}, [x2] // replicate the DC coefficient
+ dup v4.2s, w16
+ str wzr, [x2] // clear the consumed coefficient
+ sqrdmulh v16.4s, v16.4s, v4.s[0]
+ ld1 {v0.d}[0], [x0], x1 // destination row loads are interleaved with the arithmetic
+ sqxtn v20.4h, v16.4s
+ sqxtn2 v20.8h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqrdmulh v20.8h, v20.8h, v4.h[1] // second scaling step, using the upper 16 bits of the constant
+ ld1 {v1.d}[0], [x0], x1
+ srshr v16.8h, v20.8h, #4 // v16 and v18 are the two registers the shared tail adds to the pixels
+ ld1 {v1.d}[1], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ movi v30.8h, #0 // NOTE(review): v30 does not appear to be read on the path through L(itx_4x4_end) - confirm before removing
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4s_x4_neon // first pass: 32-bit-lane transform defined in this file
+ movrel x5, X(inv_\txfm2\()_4h_x4_neon) // second pass: 16-bit-lane transform referenced through X(); defined outside this chunk
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 // 8-point inverse DCT on 32-bit lanes, in place; expects the idct4 row in v0 and the idct8 row in v1
+ idct_4 \r0, \r2, \r4, \r6 // even half: a 4-point idct on the even-indexed inputs
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
+ mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
+ srshr \r1\().4s, v2.4s, #12 // t4a
+ srshr \r7\().4s, v3.4s, #12 // t7a
+ srshr \r3\().4s, v6.4s, #12 // t5a
+ srshr \r5\().4s, v7.4s, #12 // t6a
+
+ sqadd v2.4s, \r1\().4s, \r3\().4s // t4
+ sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
+ sqadd v3.4s, \r7\().4s, \r5\().4s // t7
+ sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
+
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+ mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
+
+ sqsub \r7\().4s, \r0\().4s, v3.4s // out7
+ sqadd \r0\().4s, \r0\().4s, v3.4s // out0
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
+ sqadd \r3\().4s, \r6\().4s, v2.4s // out3
+ sqsub \r4\().4s, \r6\().4s, v2.4s // out4
+ mov \r6\().16b, v6.16b // out6
+.endm
+
+function inv_dct_4s_x8_neon // 8-point inverse DCT on v16-v23 (32-bit lanes), in place
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16] // v0 = idct4 row, v1 = idct8 row
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 // 8-point inverse ADST: inputs are always v16-v23, outputs go to the registers named by o0-o7
+ movrel x16, iadst8_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32 // first two coefficient rows; x16 now points at the third
+
+ mul_mla v2, v23, v16, v0.s[0], v0.s[1]
+ mul_mls v4, v23, v16, v0.s[1], v0.s[0]
+ mul_mla v6, v21, v18, v0.s[2], v0.s[3]
+ srshr v16.4s, v2.4s, #12 // t0a
+ srshr v23.4s, v4.4s, #12 // t1a
+ mul_mls v2, v21, v18, v0.s[3], v0.s[2]
+ mul_mla v4, v19, v20, v1.s[0], v1.s[1]
+ srshr v18.4s, v6.4s, #12 // t2a
+ srshr v21.4s, v2.4s, #12 // t3a
+ mul_mls v6, v19, v20, v1.s[1], v1.s[0]
+ mul_mla v2, v17, v22, v1.s[2], v1.s[3]
+ srshr v20.4s, v4.4s, #12 // t4a
+ srshr v19.4s, v6.4s, #12 // t5a
+ mul_mls v4, v17, v22, v1.s[3], v1.s[2]
+ srshr v22.4s, v2.4s, #12 // t6a
+ srshr v17.4s, v4.4s, #12 // t7a
+
+ ld1 {v0.4s}, [x16] // third row of iadst8_coeffs: 2896, 0, 1567, 3784
+
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
+ sqadd v2.4s, v16.4s, v20.4s // t0
+ sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+ sqadd v4.4s, v23.4s, v19.4s // t1
+ sqsub v5.4s, v23.4s, v19.4s // t5
+ sqadd v6.4s, v18.4s, v22.4s // t2
+ sqsub v7.4s, v18.4s, v22.4s // t6
+ sqadd v18.4s, v21.4s, v17.4s // t3
+ sqsub v19.4s, v21.4s, v17.4s // t7
+
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
+ mul_mla v16, v3, v5, v0.s[3], v0.s[2]
+ mul_mls v20, v3, v5, v0.s[2], v0.s[3] // overwrites the clip-min constant kept in v20; it is rebuilt in v18 further down
+ mul_mls v22, v19, v7, v0.s[3], v0.s[2]
+
+ srshr v3.4s, v16.4s, #12 // t4a
+ srshr v5.4s, v20.4s, #12 // t5a
+
+ mul_mla v16, v19, v7, v0.s[2], v0.s[3]
+
+ srshr v7.4s, v22.4s, #12 // t6a
+ srshr v19.4s, v16.4s, #12 // t7a
+
+ sqadd \o0\().4s, v2.4s, v6.4s // out0
+ sqsub v2.4s, v2.4s, v6.4s // t2
+ sqadd \o7\().4s, v4.4s, v18.4s // out7
+ sqsub v4.4s, v4.4s, v18.4s // t3
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqadd \o1\().4s, v3.4s, v7.4s // out1
+ sqsub v3.4s, v3.4s, v7.4s // t6
+ sqadd \o6\().4s, v5.4s, v19.4s // out6
+ sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+ mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+ mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+ srshr v2.4s, v18.4s, #12 // out3
+ mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+ srshr v3.4s, v20.4s, #12 // out5
+ srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+ srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19)
+
+ sqneg \o3\().4s, v2.4s // out3
+ sqneg \o5\().4s, v3.4s // out5
+.endm
+
+function inv_adst_4s_x8_neon // 8-point inverse ADST on v16-v23 (32-bit lanes), in place
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 // outputs in natural order
+ ret
+endfunc
+
+function inv_flipadst_4s_x8_neon // same arithmetic as the adst variant, with the eight outputs written in reverse register order
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 // output 0 lands in v23, output 7 in v16
+ ret
+endfunc
+
+function inv_identity_4s_x8_neon // 8-point identity transform: doubles v16-v23 (32-bit lanes) with signed saturation
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ sqshl v16.4s, v16.4s, #1 // x * 2, saturating
+ sqshl v17.4s, v17.4s, #1
+ sqshl v18.4s, v18.4s, #1
+ sqshl v19.4s, v19.4s, #1
+ sqshl v20.4s, v20.4s, #1
+ sqshl v21.4s, v21.4s, #1
+ sqshl v22.4s, v22.4s, #1
+ sqshl v23.4s, v23.4s, #1
+ ret
+endfunc
+
+function inv_txfm_add_8x8_neon // shared 8x8 body: x4 = first transform, x5 = second transform, w13 = eob threshold, return address in x15
+ movi v31.4s, #0 // zero vector used to clear coefficients as they are read
+
+ cmp w3, w13 // eob against the threshold chosen by def_fn_8x8
+ mov x11, #32 // byte step between consecutive loads: 8 coefficients of 4 bytes
+ b.lt 1f // eob below threshold: skip the first-pass call for this half
+
+ add x6, x2, #16 // second group of 4 coefficients within each 32-byte line
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+
+ blr x4 // first pass on this half
+
+ sqrshrn v24.4h, v16.4s, #1 // narrow to 16 bit with a rounding shift by 1; results are parked in v24-v27
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+
+ blr x4 // first pass on the half starting at offset 0; this one always runs
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23
+
+ mov v20.16b, v24.16b // bring the parked half in as rows 4-7 for the second pass
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ mov v23.16b, v27.16b
+
+ blr x5 // second pass on v16-v23 (16-bit lanes)
+
+ load_add_store_8x8 x0, x7 // add to the destination at x0; x7 is used as the read pointer
+ ret x15
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half // emits one exported 8x8 entry point; eob_half is the threshold compared against eob in the shared body
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+ mov x15, x30 // the shared body returns with ret x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1 // DC-only shortcut when eob is zero
+.endif
+ movrel x5, X(inv_\txfm2\()_8h_x8_neon) // second pass: 16-bit-lane transform referenced through X(); defined outside this chunk
+ mov w13, #\eob_half
+ adr x4, inv_\txfm1\()_4s_x8_neon // first pass: 32-bit-lane transform defined in this file
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+function inv_txfm_add_8x4_neon // shared 8x4 body: x4 = first transform, x5 = second transform, return address in x15
+ movi v28.4s, #0 // four zero vectors for clearing 64 bytes per store
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // first 64 bytes of coefficients
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ movz w16, #2896*8, lsl #16 // sqrdmulh multiplier giving x*2896/4096
+ dup v0.2s, w16
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] // second 64 bytes of coefficients
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 // pre-scale all inputs; idct_dc applies the same extra factor to 2:1 shapes
+
+ blr x4 // first pass on v16-v23
+
+ sqxtn v16.4h, v16.4s // saturating narrow to 16 bit, no shift
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ sqxtn v20.4h, v20.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn v22.4h, v22.4s
+ sqxtn v23.4h, v23.4s
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0] // join the two transposed 4x4 tiles into four 8-lane rows
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5 // second pass on v16-v19 (16-bit lanes)
+
+ load_add_store_8x4 x0, x7 // add to the destination at x0; x7 is used as the read pointer
+ ret x15
+endfunc
+
+function inv_txfm_add_4x8_neon // shared 4x8 body: x4 = first transform, x5 = second transform, w13 = eob threshold, return address in x15
+ movz w16, #2896*8, lsl #16 // sqrdmulh multiplier giving x*2896/4096
+ movi v31.4s, #0 // zero vector used to clear coefficients as they are read
+ dup v30.2s, w16
+
+ cmp w3, w13 // eob against the threshold chosen by def_fn_48
+ mov x11, #32 // byte step between consecutive loads
+ b.lt 1f // eob below threshold: skip the first-pass call for this half
+
+ add x6, x2, #16 // second group of 4 coefficients within each 32-byte line
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4 // first pass on this half
+ sqxtn v20.4h, v16.4s // narrowed results are parked in v20-v23
+ sqxtn v21.4h, v17.4s
+ sqxtn v22.4h, v18.4s
+ sqxtn v23.4h, v19.4s
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+
+1:
+.irp i, v20, v21, v22, v23
+ movi \i\().4h, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4 // first pass on the half starting at offset 0; this one always runs
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5 // second pass on v16-v23 (16-bit lanes, low halves)
+
+ load_add_store_4x8 x0, x7 // add to the destination at x0; x7 is used as the read pointer
+ ret x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half // emits one exported 4x8 or 8x4 entry point
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+ mov x15, x30 // the shared bodies return with ret x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0 // DC-only shortcut when eob is zero
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon // first pass: 32-bit-lane transform defined in this file
+.if \w == 4
+ mov w13, #\eob_half // only the 4x8 body compares eob against w13
+.endif
+ movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) // second pass: 16-bit-lane transform referenced through X(); defined outside this chunk
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h // instantiates all 16 transform pairs for one block size
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
+function inv_dct_4s_x16_neon // 16-point inverse DCT on v16-v31 (32-bit lanes), in place
+ AARCH64_VALID_CALL_TARGET // macro from outside this chunk; presumably marks a valid indirect-call target - TODO confirm
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32 // idct4 and idct8 rows; x16 now points at the idct16 rows
+
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30 // even half: an 8-point idct on the even-indexed inputs
+
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ ld1 {v0.4s, v1.4s}, [x16] // the two idct16 coefficient rows
+ sub x16, x16, #32 // back to the start of idct_coeffs for the reload further down
+
+ mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
+ srshr v17.4s, v2.4s, #12 // t8a
+ srshr v31.4s, v3.4s, #12 // t15a
+ mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ srshr v23.4s, v6.4s, #12 // t9a
+ srshr v25.4s, v2.4s, #12 // t14a
+ mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
+ mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
+ srshr v21.4s, v3.4s, #12 // t10a
+ srshr v27.4s, v6.4s, #12 // t13a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ srshr v19.4s, v2.4s, #12 // t11a
+ srshr v29.4s, v3.4s, #12 // t12a
+
+ ld1 {v0.4s}, [x16] // idct4 row again: 2896, _, 1567, 3784
+
+ sqsub v2.4s, v17.4s, v23.4s // t9
+ sqadd v17.4s, v17.4s, v23.4s // t8
+ sqsub v3.4s, v31.4s, v25.4s // t14
+ sqadd v31.4s, v31.4s, v25.4s // t15
+ sqsub v23.4s, v19.4s, v21.4s // t10
+ sqadd v19.4s, v19.4s, v21.4s // t11
+ sqadd v25.4s, v29.4s, v27.4s // t12
+ sqsub v29.4s, v29.4s, v27.4s // t13
+
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
+ mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
+ srshr v21.4s, v7.4s, #12 // t9a
+ srshr v27.4s, v6.4s, #12 // t14a
+
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
+ srshr v29.4s, v7.4s, #12 // t13a
+ neg v6.4s, v6.4s // t10a needs the negated sum; negate before the rounding shift
+ srshr v23.4s, v6.4s, #12 // t10a
+
+ sqsub v2.4s, v17.4s, v19.4s // t11a
+ sqadd v17.4s, v17.4s, v19.4s // t8a
+ sqsub v3.4s, v31.4s, v25.4s // t12a
+ sqadd v31.4s, v31.4s, v25.4s // t15a
+ sqadd v19.4s, v21.4s, v23.4s // t9
+ sqsub v21.4s, v21.4s, v23.4s // t10
+ sqsub v25.4s, v27.4s, v29.4s // t13
+ sqadd v27.4s, v27.4s, v29.4s // t14
+
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
+ mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v2.4s, v2.4s, #12 // t10a
+ srshr v3.4s, v3.4s, #12 // t13a
+
+ sqadd v1.4s, v16.4s, v31.4s // out0
+ sqsub v31.4s, v16.4s, v31.4s // out15
+ mov v16.16b, v1.16b
+ sqadd v23.4s, v30.4s, v17.4s // out7
+ sqsub v1.4s, v30.4s, v17.4s // out8
+ sqadd v17.4s, v18.4s, v27.4s // out1
+ sqsub v30.4s, v18.4s, v27.4s // out14
+ sqadd v18.4s, v20.4s, v3.4s // out2
+ sqsub v29.4s, v20.4s, v3.4s // out13
+ sqadd v3.4s, v28.4s, v19.4s // out6
+ sqsub v25.4s, v28.4s, v19.4s // out9
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
+ sqadd v21.4s, v26.4s, v2.4s // out5
+ sqsub v26.4s, v26.4s, v2.4s // out10
+ mov v24.16b, v1.16b // out8 was held in v1 while v24 was still needed as an input
+ mov v22.16b, v3.16b // out6 was held in v3 while v22 was still needed as an input
+
+ ret
+endfunc
+
+ // 16-point inverse ADST over four 32-bit lanes. The 16 input rows are read
+ // from v16-v31; o0-o15 name the registers that receive out0-out15, so the
+ // caller selects normal or reversed (flipped) output order through them.
+ // v0/v1 hold coefficients, v5/v7 hold the clamp bounds, v2-v4/v6 are
+ // temporaries, and x16 is used as the coefficient table pointer.
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel x16, iadst16_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32 // first 8 coefficients; x16 advances to the next 8
+
+ mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0
+ mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2
+ srshr v16.4s, v2.4s, #12 // t0
+ srshr v31.4s, v4.4s, #12 // t1
+ mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3
+ mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4
+ srshr v18.4s, v6.4s, #12 // t2
+ srshr v29.4s, v2.4s, #12 // t3
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5
+ mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6
+ srshr v20.4s, v4.4s, #12 // t4
+ srshr v27.4s, v6.4s, #12 // t5
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7
+ ld1 {v0.4s, v1.4s}, [x16] // second 8 coefficients of iadst16_coeffs
+ movrel x16, idct_coeffs
+ mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8
+ srshr v22.4s, v2.4s, #12 // t6
+ srshr v25.4s, v4.4s, #12 // t7
+ mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9
+ mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10
+ srshr v23.4s, v6.4s, #12 // t8
+ srshr v24.4s, v2.4s, #12 // t9
+ mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11
+ mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12
+ srshr v21.4s, v4.4s, #12 // t10
+ srshr v26.4s, v6.4s, #12 // t11
+ mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13
+ mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14
+ srshr v19.4s, v2.4s, #12 // t12
+ srshr v28.4s, v4.4s, #12 // t13
+ mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15
+ srshr v17.4s, v6.4s, #12 // t14
+ srshr v30.4s, v2.4s, #12 // t15
+
+ ld1 {v0.4s, v1.4s}, [x16] // first 8 words of idct_coeffs, used by the remaining stages
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqsub v2.4s, v16.4s, v23.4s // t8a
+ sqadd v16.4s, v16.4s, v23.4s // t0a
+ sqsub v3.4s, v31.4s, v24.4s // t9a
+ sqadd v31.4s, v31.4s, v24.4s // t1a
+ sqadd v23.4s, v18.4s, v21.4s // t2a
+ sqsub v18.4s, v18.4s, v21.4s // t10a
+ sqadd v24.4s, v29.4s, v26.4s // t3a
+ sqsub v29.4s, v29.4s, v26.4s // t11a
+ sqadd v21.4s, v20.4s, v19.4s // t4a
+ sqsub v20.4s, v20.4s, v19.4s // t12a
+ sqadd v26.4s, v27.4s, v28.4s // t5a
+ sqsub v27.4s, v27.4s, v28.4s // t13a
+ sqadd v19.4s, v22.4s, v17.4s // t6a
+ sqsub v22.4s, v22.4s, v17.4s // t14a
+ sqadd v28.4s, v25.4s, v30.4s // t7a
+ sqsub v25.4s, v25.4s, v30.4s // t15a
+
+ // Clamp all 16 intermediates to [row_clip_min, row_clip_max].
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
+ mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
+ mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
+ mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
+ srshr v17.4s, v4.4s, #12 // t8
+ srshr v30.4s, v6.4s, #12 // t9
+ mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12
+ srshr v18.4s, v2.4s, #12 // t10
+ srshr v29.4s, v4.4s, #12 // t11
+ mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14
+ srshr v27.4s, v6.4s, #12 // t12
+ srshr v20.4s, v2.4s, #12 // t13
+ mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15
+ srshr v25.4s, v4.4s, #12 // t14
+ srshr v22.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t4
+ sqadd v16.4s, v16.4s, v21.4s // t0
+ sqsub v3.4s, v31.4s, v26.4s // t5
+ sqadd v31.4s, v31.4s, v26.4s // t1
+ sqadd v21.4s, v23.4s, v19.4s // t2
+ sqsub v23.4s, v23.4s, v19.4s // t6
+ sqadd v26.4s, v24.4s, v28.4s // t3
+ sqsub v24.4s, v24.4s, v28.4s // t7
+ sqadd v19.4s, v17.4s, v27.4s // t8a
+ sqsub v17.4s, v17.4s, v27.4s // t12a
+ sqadd v28.4s, v30.4s, v20.4s // t9a
+ sqsub v30.4s, v30.4s, v20.4s // t13a
+ sqadd v27.4s, v18.4s, v25.4s // t10a
+ sqsub v18.4s, v18.4s, v25.4s // t14a
+ sqadd v20.4s, v29.4s, v22.4s // t11a
+ sqsub v29.4s, v29.4s, v22.4s // t15a
+
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
+ mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
+ mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
+ mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
+ srshr v22.4s, v4.4s, #12 // t4a
+ srshr v25.4s, v6.4s, #12 // t5a
+ mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a
+ mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12
+ srshr v24.4s, v2.4s, #12 // t6a
+ srshr v23.4s, v4.4s, #12 // t7a
+ mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13
+ mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14
+ srshr v17.4s, v6.4s, #12 // t12
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15
+ srshr v29.4s, v2.4s, #12 // t13
+ srshr v30.4s, v4.4s, #12 // t14
+ srshr v18.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t2a
+ // When o0 is v16 the result can be written in place. Otherwise out0 is
+ // parked in v4 and moved to o0 only after v31 has been read for the last
+ // time (the flipped instantiation passes v31 as o0).
+.ifc \o0, v16
+ sqadd \o0\().4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+.else
+ sqadd v4.4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+ mov \o0\().16b, v4.16b
+.endif
+
+ sqsub v3.4s, v29.4s, v18.4s // t15a
+ sqadd \o13\().4s, v29.4s, v18.4s // out13
+ sqadd \o2\().4s, v17.4s, v30.4s // out2
+ sqsub v26.4s, v17.4s, v30.4s // t14a
+
+ sqadd \o1\().4s, v19.4s, v27.4s // out1
+ sqsub v27.4s, v19.4s, v27.4s // t10
+ sqadd \o14\().4s, v28.4s, v20.4s // out14
+ sqsub v20.4s, v28.4s, v20.4s // t11
+
+ sqadd \o3\().4s, v22.4s, v24.4s // out3
+ sqsub v22.4s, v22.4s, v24.4s // t6
+ sqadd \o12\().4s, v25.4s, v23.4s // out12
+ sqsub v23.4s, v25.4s, v23.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
+ sqneg \o3\().4s, \o3\().4s // out3
+
+ mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+ mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+ mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+ // No clamping happens past this point, so v5 and v7 (the clamp bounds)
+ // are reused as temporaries below.
+ srshr v24.4s, v24.4s, #12 // out8
+ srshr v4.4s, v4.4s, #12 // out7
+ srshr v5.4s, v6.4s, #12 // out5
+ mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+ mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+ srshr v26.4s, v6.4s, #12 // out10
+
+ mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+ mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+ mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+ srshr \o4\().4s, v2.4s, #12 // out4
+ srshr v6.4s, v6.4s, #12 // out11
+ srshr v7.4s, v21.4s, #12 // out9
+ srshr \o6\().4s, v22.4s, #12 // out6
+
+ // out8/out10 were computed into v24/v26, which are already o8/o10 in the
+ // normal order; only the flipped order (o8 == v23) needs explicit moves.
+.ifc \o8, v23
+ mov \o8\().16b, v24.16b
+ mov \o10\().16b, v26.16b
+.endif
+
+ sqneg \o7\().4s, v4.4s // out7
+ sqneg \o5\().4s, v5.4s // out5
+ sqneg \o11\().4s, v6.4s // out11
+ sqneg \o9\().4s, v7.4s // out9
+.endm
+
+ // 16-point inverse ADST on v16-v31 (4 x 32-bit lanes each), results left
+ // in v16-v31 in natural order (out0 in v16 ... out15 in v31).
+function inv_adst_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ ret
+endfunc
+
+ // Same transform as inv_adst_4s_x16_neon, but with the output registers
+ // passed in reverse so that out0 lands in v31 and out15 in v16.
+function inv_flipadst_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+ // 16-point identity transform on v16-v31 (4 x 32-bit lanes each), in place.
+ // Each element x becomes 2*x + sqrdmulh(x, c) with saturating adds, where
+ // c = 2*(5793-4096)*8 << 16, i.e. approximately x * 2*5793/4096.
+ // Clobbers v0, v2 and w16.
+function inv_identity_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ movz w16, #2*(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4s, v\i\().4s, v0.s[0] // fractional part of the scale
+ sqadd v\i\().4s, v\i\().4s, v\i\().4s // 2*x
+ sqadd v\i\().4s, v\i\().4s, v2.4s
+.endr
+ ret
+endfunc
+
+ // In-place update of v16-v31 (.4s): x += srshr(sqrdmulh(x, \c), 1).
+ // \c is the multiplier operand handed to sqrdmulh. Clobbers v3.
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ srshr v3.4s, v3.4s, #1
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+ // In-place update of v16-v31 (.4s): x = 2*x + sqrdmulh(x, \c), using
+ // saturating adds. \c is the multiplier operand handed to sqrdmulh.
+ // Clobbers v3.
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+ // Generates inv_txfm_horz\suffix\()_16x4_neon, the first (32-bit) pass
+ // for a strip of 16 coefficient vectors.
+ // scale: when non-zero, inputs are first run through scale_input with
+ // the constant 2896*8 << 16.
+ // shift: rounding right shift applied while narrowing to 16 bit.
+ // Register interface of the generated function, as used below:
+ // x4 = transform routine called on v16-v31
+ // x6 = output pointer, advanced by 8 * 16 bytes
+ // x7 = coefficient pointer, x8 = byte stride between loaded vectors;
+ // every vector read is overwritten with zero
+ // x14 = holds the return address (x30 is clobbered by blr)
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8 // clear the coefficients just consumed
+.endr
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ blr x4
+ // Narrow 16 x .4s to 8 x .8h: v16-v19 take outputs 0-7, v20-v23 take 8-15.
+ sqrshrn v16.4h, v16.4s, #\shift
+ sqrshrn v17.4h, v17.4s, #\shift
+ sqrshrn v18.4h, v18.4s, #\shift
+ sqrshrn v19.4h, v19.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+ sqrshrn2 v17.8h, v21.4s, #\shift
+ sqrshrn2 v18.8h, v22.4s, #\shift
+ sqrshrn2 v19.8h, v23.4s, #\shift
+ sqrshrn v20.4h, v24.4s, #\shift
+ sqrshrn v21.4h, v25.4s, #\shift
+ sqrshrn v22.4h, v26.4s, #\shift
+ sqrshrn v23.4h, v27.4s, #\shift
+ sqrshrn2 v20.8h, v28.4s, #\shift
+ sqrshrn2 v21.8h, v29.4s, #\shift
+ sqrshrn2 v22.8h, v30.4s, #\shift
+ sqrshrn2 v23.8h, v31.4s, #\shift
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
+
+ // Interleave the two halves so each pair of stores forms one 16-wide row.
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ ret x14
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
+ // Second (16-bit) pass for an 8-wide, 16-row column strip.
+ // Loads 16 vectors of 8 halfwords from [x7] with byte stride x8 into
+ // v16-v31, calls the transform routine in x5, then hands the result to
+ // load_add_store_8x16 with x6 and x7 as its arguments (x6 is set to the
+ // destination by the caller in this file). Return address is kept in x14.
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ ret x14
+endfunc
+
+ // Shared tail for the 16x16 inverse transform entry points.
+ // Inputs as used here: x0 = destination base, x2 = coefficient base,
+ // w3 = value compared against the thresholds (presumably eob - confirm
+ // against the callers), x4/x5 = first/second pass routines (consumed by
+ // the helpers called below), x13 = pointer to a table of 16-bit thresholds.
+ // Uses a 512-byte stack buffer for the intermediate 16x16 block of
+ // halfwords; return address is kept in x15.
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+ ldrh w12, [x13], #2 // first threshold
+ // First pass, four strips of four. For every strip after the first, w3 is
+ // checked against the current threshold; if it is lower, this strip and
+ // all later ones are skipped and zero-filled at label 1.
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*16*2)
+.if \i > 0
+ mov w8, #(16 - \i) // rows left to zero-fill if we bail out here
+ cmp w3, w12
+ b.lt 1f
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ // Each iteration clears 128 bytes (four 32-byte rows) at x6.
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+3:
+ // Second pass: two 8-column halves, 16 bytes apart in both the
+ // destination and the stack buffer (row stride 32 bytes in the buffer).
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+ // Threshold tables read through x13 by inv_txfm_add_16x16_neon, which
+ // loads only the first three entries of whichever table it is given.
+ // def_fn_16x16 selects eob_16x16_identity when exactly one of the two
+ // transforms is identity, and eob_16x16 otherwise.
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
+
+ // Emits the exported entry point inv_txfm_add_<txfm1>_<txfm2>_16x16_16bpc_neon.
+ // It sets x4 to the 32-bit 16-point routine for txfm1, x5 to the 16-bit
+ // 16-point routine for txfm2, x13 to the matching threshold table, and
+ // tail-branches to inv_txfm_add_16x16_neon. The dct_dct variant first
+ // expands idct_dc 16, 16, 2.
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ adr x4, inv_\txfm1\()_4s_x16_neon
+ movrel x5, X(inv_\txfm2\()_8h_x16_neon)
+ // Table choice: eob_16x16_identity if exactly one transform is identity,
+ // eob_16x16 if both or neither are.
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_16x16
+.else
+ movrel x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_16x16_identity
+.else
+ movrel x13, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+ // Shared tail for the 16x4 entry points.
+ // Loads 16 consecutive 16-byte coefficient vectors from [x2] into v16-v31
+ // (clearing them in memory), runs the first pass in x4, then for each
+ // 8-column half: narrows to 16 bit with a rounding shift of 1, transposes,
+ // runs the second pass in x5 and adds into the destination (x0 for the
+ // first half, x0 + 16 bytes for the second). Return address is kept in x15.
+function inv_txfm_add_16x4_neon
+ mov x15, x30
+ movi v4.4s, #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], #16
+.endr
+
+ blr x4
+
+ // First half: first-pass outputs 0-7 (v16-v23).
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ // Second half: first-pass outputs 8-15 (v24-v31), which this code reads
+ // after the blr x5 and load_add_store_8x4 above, so both must leave
+ // v24-v31 intact.
+ sqrshrn v16.4h, v24.4s, #1
+ sqrshrn v17.4h, v25.4s, #1
+ sqrshrn v18.4h, v26.4s, #1
+ sqrshrn v19.4h, v27.4s, #1
+ sqrshrn2 v16.8h, v28.4s, #1
+ sqrshrn2 v17.8h, v29.4s, #1
+ sqrshrn2 v18.8h, v30.4s, #1
+ sqrshrn2 v19.8h, v31.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #16
+ load_add_store_8x4 x6, x7
+
+ ret x15
+endfunc
+
+ // Shared tail for the 4x16 entry points.
+ // The coefficients are handled as four groups located at x2 + 48, + 32,
+ // + 16 and + 0, each made of four 16-byte vectors 64 bytes apart. Groups
+ // are processed from the highest offset down; a group is run through the
+ // first pass (x4) only if w3 is not below its threshold from the table at
+ // x13 (entries 2, 1, 0 in that order), otherwise its result registers are
+ // zeroed. The group at offset 0 is always processed.
+ // First-pass results are narrowed to 16 bit (rounding shift 1) and
+ // transposed into v28-v31, v24-v27, v20-v23 and v16-v19 respectively,
+ // then the second pass (x5) runs and load_add_store_4x16 adds to [x0].
+ // Memory that is read is cleared. Return address is kept in x15.
+function inv_txfm_add_4x16_neon
+ ldrh w12, [x13, #4]
+ mov x15, x30
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2] // preload the next threshold; flags are unaffected
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0] // preload the last threshold; flags are unaffected
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v20.4h, v16.4s, #1
+ sqrshrn v21.4h, v17.4s, #1
+ sqrshrn v22.4h, v18.4s, #1
+ sqrshrn v23.4h, v19.4s, #1
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ movi \i, #0
+.endr
+2:
+
+ // Group at offset 0: processed unconditionally.
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v2.4s}, [x2], x11
+.endr
+ blr x4
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ // NOTE(review): the other three groups use transpose_4x4h; this one uses
+ // transpose_4x8h on registers whose low halves were just written.
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ ret x15
+endfunc
+
+ // Threshold tables for the 4x16 path. inv_txfm_add_4x16_neon reads
+ // entries 2, 1 and 0 (in that order) through x13; the last entry of each
+ // table is not read there. Selection is done in def_fn_416.
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+ // Emits the exported entry point
+ // inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_16bpc_neon for 4x16 and 16x4.
+ // x4 is set to the 32-bit first-pass routine of length \w, x5 to the
+ // 16-bit second-pass routine of length \h (4h variant when \w == 4,
+ // 8h variant otherwise), then control branches to inv_txfm_add_<w>x<h>_neon.
+ // Only the \w == 4 case sets up a threshold table in x13.
+ // The dct_dct variant first expands idct_dc \w, \h, 1.
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon)
+ // Table choice: identity first pass only -> eob_4x16_identity1,
+ // identity second pass only -> eob_4x16_identity2, otherwise eob_4x16.
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_4x16
+.else
+ movrel x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_4x16_identity2
+.else
+ movrel x13, eob_4x16
+.endif
+.endif
+.else
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+ // Instantiates def_fn_416 for all 16 transform-type combinations.
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+ // Shared tail for the 16x8 entry points.
+ // Here w13 holds a single threshold value (def_fn_816 loads it with ldrh
+ // when \h == 8), not a table pointer. The coefficients are handled as two
+ // groups of 16 vectors with a 32-byte stride, starting at x2 + 16 and x2.
+ // The x2 + 16 group is processed only when w3 >= w13; otherwise its
+ // results (v8-v15) are zeroed. Inputs are scaled by 2896*8 << 16 via
+ // scale_input before the first pass (x4). d8-d15 are saved and restored
+ // around the body because v8-v15 are used as holding registers.
+ // Return address is kept in x15.
+function inv_txfm_add_16x8_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ // Second group: outputs 0-7 -> v8-v11, outputs 8-15 -> v12-v15.
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ sqrshrn v12.4h, v24.4s, #1
+ sqrshrn v13.4h, v25.4s, #1
+ sqrshrn v14.4h, v26.4s, #1
+ sqrshrn v15.4h, v27.4s, #1
+ sqrshrn2 v12.8h, v28.4s, #1
+ sqrshrn2 v13.8h, v29.4s, #1
+ sqrshrn2 v14.8h, v30.4s, #1
+ sqrshrn2 v15.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+ transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5
+
+ b 2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+ movi \i, #0
+.endr
+2:
+ // The scale constant is rebuilt here: this point is also reached via the
+ // skip path above, and the path through blr x4 reloads v0 regardless.
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ movi v4.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ // v16-v23 now form the left 8x8 input: rows from the first group in
+ // v16-v19 and the held-back rows of the second group in v20-v23.
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ // v8-v11 are free again; reuse them for outputs 8-15 of the first group.
+ sqrshrn v8.4h, v24.4s, #1
+ sqrshrn v9.4h, v25.4s, #1
+ sqrshrn v10.4h, v26.4s, #1
+ sqrshrn v11.4h, v27.4s, #1
+ sqrshrn2 v8.8h, v28.4s, #1
+ sqrshrn2 v9.8h, v29.4s, #1
+ sqrshrn2 v10.8h, v30.4s, #1
+ sqrshrn2 v11.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+ // Right 8x8 half: v8-v15 are read after the blr x5 and
+ // load_add_store_8x8 above, so both must leave them intact.
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+
+ blr x5
+
+ add x0, x0, #16
+ load_add_store_8x8 x0, x7
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x15
+endfunc
+
+ // Shared tail for the 8x16 entry points.
+ // The coefficients are handled as four groups at x2 + 48, + 32, + 16 and
+ // + 0, each made of eight 16-byte vectors 64 bytes apart. Groups are
+ // processed from the highest offset down; a group goes through the first
+ // pass (x4) only if w3 is not below its threshold from the table at x13
+ // (entries 2, 1, 0 in that order), otherwise its result registers are
+ // zeroed. The group at offset 0 is always processed. Inputs are scaled by
+ // 2896*8 << 16 via scale_input before the first pass.
+ // Results end up, narrowed and transposed, in v28-v31, v24-v27, v20-v23
+ // (via v8-v11) and v16-v19; the second pass (x5) then runs on v16-v31 and
+ // load_add_store_8x16 adds to [x0]. d8-d11 are saved and restored.
+ // Return address is kept in x15.
+function inv_txfm_add_8x16_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
+ ldrh w12, [x13, #4]
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2] // preload the next threshold; flags are unaffected
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ sqrshrn2 v28.8h, v20.4s, #1
+ sqrshrn2 v29.8h, v21.4s, #1
+ sqrshrn2 v30.8h, v22.4s, #1
+ sqrshrn2 v31.8h, v23.4s, #1
+ transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0] // preload the last threshold; flags are unaffected
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ // Held in v8-v11 rather than v20-v23, because v16-v23 are still needed
+ // as first-pass inputs for the final group below.
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+ movi \i, #0
+.endr
+
+2:
+ // Group at offset 0: processed unconditionally.
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ // Move the held-back group into its final slot for the second pass.
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x20
+
+ ret x15
+endfunc
+
+ // Threshold tables for the 8x16 and 16x8 paths, selected in def_fn_816.
+ // inv_txfm_add_8x16_neon reads entries 2, 1 and 0 through x13; for
+ // \h == 8, def_fn_816 loads only entry 0 into w13.
+const eob_8x16
+ .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+ .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+ .short 4, 8, 12, 128
+endconst
+
+ // Emits the exported entry point
+ // inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_16bpc_neon for 8x16 and 16x8.
+ // x4 is set to the 32-bit first-pass routine of length \w, x5 to the
+ // 16-bit (8h) second-pass routine of length \h and x13 to the threshold
+ // table. For \h == 8 the first table entry is loaded into w13, replacing
+ // the pointer. Control then branches to inv_txfm_add_<w>x<h>_neon.
+ // The dct_dct variant first expands idct_dc \w, \h, 1.
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+ // Table choice: identity first pass only -> eob_8x16_identity1,
+ // identity second pass only -> eob_8x16_identity2, otherwise eob_8x16.
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_8x16
+.else
+ movrel x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_8x16_identity2
+.else
+ movrel x13, eob_8x16
+.endif
+.endif
+.if \h == 8
+ ldrh w13, [x13] // inv_txfm_add_16x8_neon compares w3 against this value directly
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+ // Instantiates def_fn_816 for all 16 transform-type combinations.
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+ // Odd half of a 32-point inverse DCT over four 32-bit lanes.
+ // Takes its 16 inputs in v16-v31 and leaves t16..t31 ("out16".."out31"
+ // in the comments below) in v16..v31 respectively.
+ // Coefficients are read from idct_coeffs: first the 16 words starting at
+ // byte offset 4*16, then the first 8 words of the table.
+ // Uses v0/v1 for coefficients, v4/v5 for the clamp bounds, v2/v3/v6/v7
+ // as temporaries and x16 as the table pointer.
+function inv_dct32_odd_4s_x16_neon
+ movrel x16, idct_coeffs, 4*16
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a
+ mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a
+ mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a
+ srshr v16.4s, v2.4s, #12 // t16a
+ srshr v31.4s, v4.4s, #12 // t31a
+ mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a
+ mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a
+ srshr v24.4s, v6.4s, #12 // t17a
+ srshr v23.4s, v2.4s, #12 // t30a
+ mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a
+ mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a
+ srshr v20.4s, v4.4s, #12 // t18a
+ srshr v27.4s, v6.4s, #12 // t29a
+ mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #4*24 // back to the start of idct_coeffs (4*16 + 32 bytes)
+ mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a
+ srshr v28.4s, v2.4s, #12 // t19a
+ srshr v19.4s, v4.4s, #12 // t28a
+ mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a
+ mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a
+ srshr v18.4s, v6.4s, #12 // t20a
+ srshr v29.4s, v2.4s, #12 // t27a
+ mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a
+ mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a
+ srshr v26.4s, v4.4s, #12 // t21a
+ srshr v21.4s, v6.4s, #12 // t26a
+ mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a
+ mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a
+ srshr v22.4s, v2.4s, #12 // t22a
+ srshr v25.4s, v4.4s, #12 // t25a
+ mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a
+ srshr v30.4s, v6.4s, #12 // t23a
+ srshr v17.4s, v2.4s, #12 // t24a
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqsub v2.4s, v16.4s, v24.4s // t17
+ sqadd v16.4s, v16.4s, v24.4s // t16
+ sqsub v3.4s, v31.4s, v23.4s // t30
+ sqadd v31.4s, v31.4s, v23.4s // t31
+ sqsub v24.4s, v28.4s, v20.4s // t18
+ sqadd v28.4s, v28.4s, v20.4s // t19
+ sqadd v23.4s, v18.4s, v26.4s // t20
+ sqsub v18.4s, v18.4s, v26.4s // t21
+ sqsub v20.4s, v30.4s, v22.4s // t22
+ sqadd v30.4s, v30.4s, v22.4s // t23
+ sqadd v26.4s, v17.4s, v25.4s // t24
+ sqsub v17.4s, v17.4s, v25.4s // t25
+ sqsub v22.4s, v29.4s, v21.4s // t26
+ sqadd v29.4s, v29.4s, v21.4s // t27
+ sqadd v25.4s, v19.4s, v27.4s // t28
+ sqsub v19.4s, v19.4s, v27.4s // t29
+
+ // Clamp all 16 intermediates to [row_clip_min, row_clip_max].
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
+ mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
+ mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
+ srshr v21.4s, v7.4s, #12 // t17a
+ srshr v27.4s, v6.4s, #12 // t30a
+ neg v2.4s, v2.4s // -> t18a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
+ srshr v19.4s, v2.4s, #12 // t18a
+ srshr v24.4s, v7.4s, #12 // t29a
+ mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ srshr v22.4s, v6.4s, #12 // t21a
+ srshr v18.4s, v2.4s, #12 // t26a
+ neg v7.4s, v7.4s // -> t22a
+ mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
+ srshr v17.4s, v7.4s, #12 // t22a
+ srshr v20.4s, v6.4s, #12 // t25a
+
+ sqsub v2.4s, v27.4s, v24.4s // t29
+ sqadd v27.4s, v27.4s, v24.4s // t30
+ sqsub v3.4s, v21.4s, v19.4s // t18
+ sqadd v21.4s, v21.4s, v19.4s // t17
+ sqsub v24.4s, v16.4s, v28.4s // t19a
+ sqadd v16.4s, v16.4s, v28.4s // t16a
+ sqsub v19.4s, v30.4s, v23.4s // t20a
+ sqadd v30.4s, v30.4s, v23.4s // t23a
+ sqsub v28.4s, v17.4s, v22.4s // t21
+ sqadd v17.4s, v17.4s, v22.4s // t22
+ sqadd v23.4s, v26.4s, v29.4s // t24a
+ sqsub v26.4s, v26.4s, v29.4s // t27a
+ sqadd v22.4s, v20.4s, v18.4s // t25
+ sqsub v20.4s, v20.4s, v18.4s // t26
+ sqsub v29.4s, v31.4s, v25.4s // t28a
+ sqadd v31.4s, v31.4s, v25.4s // t31a
+
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
+ mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
+ mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
+ srshr v18.4s, v7.4s, #12 // t18a
+ srshr v25.4s, v6.4s, #12 // t29a
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
+ srshr v29.4s, v2.4s, #12 // t19
+ srshr v24.4s, v7.4s, #12 // t28
+ neg v6.4s, v6.4s // -> t20
+ mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ srshr v26.4s, v6.4s, #12 // t20
+ srshr v19.4s, v2.4s, #12 // t27
+ neg v7.4s, v7.4s // -> t21a
+ mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
+ srshr v20.4s, v7.4s, #12 // t21a
+ srshr v28.4s, v6.4s, #12 // t26a
+
+ sqsub v2.4s, v16.4s, v30.4s // t23
+ sqadd v16.4s, v16.4s, v30.4s // t16 = out16
+ sqsub v3.4s, v31.4s, v23.4s // t24
+ sqadd v31.4s, v31.4s, v23.4s // t31 = out31
+ sqsub v23.4s, v21.4s, v17.4s // t22a
+ sqadd v17.4s, v21.4s, v17.4s // t17a = out17
+ sqadd v30.4s, v27.4s, v22.4s // t30a = out30
+ sqsub v21.4s, v27.4s, v22.4s // t25a
+ sqsub v27.4s, v18.4s, v20.4s // t21
+ sqadd v18.4s, v18.4s, v20.4s // t18 = out18
+ // out19 is built in v7 first: v19 is still needed as an input below.
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
+ sqsub v26.4s, v29.4s, v26.4s // t20a
+ sqadd v29.4s, v25.4s, v28.4s // t29 = out29
+ sqsub v25.4s, v25.4s, v28.4s // t26
+ sqadd v28.4s, v24.4s, v19.4s // t28a = out28
+ sqsub v24.4s, v24.4s, v19.4s // t27a
+ mov v19.16b, v7.16b // out19
+
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
+ mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
+ srshr v20.4s, v7.4s, #12 // t20
+ srshr v22.4s, v6.4s, #12 // t27
+
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
+ mov v27.16b, v22.16b // t27
+ srshr v26.4s, v7.4s, #12 // t26a
+
+ mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
+ srshr v21.4s, v6.4s, #12 // t21a
+ srshr v22.4s, v24.4s, #12 // t22
+ srshr v25.4s, v7.4s, #12 // t25
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
+ srshr v23.4s, v7.4s, #12 // t23a
+ srshr v24.4s, v6.4s, #12 // t24a
+
+ ret
+endfunc
+
+ // Generates inv_txfm_horz\suffix\()_dct_32x4_neon, the first (32-bit) pass
+ // of a 32-point DCT for a strip of 32 coefficient vectors.
+ // scale: when non-zero, inputs are first run through scale_input.
+ // shift: rounding right shift applied while narrowing to 16 bit.
+ // Register interface of the generated function, as used below:
+ // x6 = output pointer; 256 bytes are written and x6 ends up advanced
+ // by that amount
+ // x7 = coefficient pointer, x8 = byte stride between coefficient
+ // vectors (doubled internally); every vector read is zeroed
+ // x14 = holds the return address
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+ lsl x8, x8, #1 // step over every other vector: even ones first
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+ // Rewind the 16 doubled steps and advance by one original stride, so the
+ // second load loop below picks up the odd-indexed vectors.
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
+ transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5
+
+ // Park the even-half result (still 32-bit) in the output buffer; store2
+ // reads it back once the odd half is available.
+.macro store1 r0, r1, r2, r3
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ st1 {\r2}, [x6], #16
+ st1 {\r3}, [x6], #16
+.endm
+ store1 v16.4s, v20.4s, v24.4s, v28.4s
+ store1 v17.4s, v21.4s, v25.4s, v29.4s
+ store1 v18.4s, v22.4s, v26.4s, v30.4s
+ store1 v19.4s, v23.4s, v27.4s, v31.4s
+.purgem store1
+ sub x6, x6, #64*4 // back to the start of the 256 bytes just written
+
+ movi v7.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
+ scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_4s_x16_neon
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+ transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5
+ transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5
+ // Combine one 64-byte row of the parked even half with the odd half:
+ // sums (even + odd) are narrowed into v0/v1 and written first, then the
+ // differences (even - odd) are narrowed into v2/v3 in reverse register
+ // order and element-reversed with rev64 before being written. The row is
+ // overwritten in place with the 16-bit result.
+.macro store2 r0, r1, r2, r3, shift
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+ sqsub v4.4s, v0.4s, \r0
+ sqadd v0.4s, v0.4s, \r0
+ sqsub v5.4s, v1.4s, \r1
+ sqadd v1.4s, v1.4s, \r1
+ sqsub v6.4s, v2.4s, \r2
+ sqadd v2.4s, v2.4s, \r2
+ sqsub v7.4s, v3.4s, \r3
+ sqadd v3.4s, v3.4s, \r3
+ sqrshrn v0.4h, v0.4s, #\shift
+ sqrshrn2 v0.8h, v1.4s, #\shift
+ sqrshrn v1.4h, v2.4s, #\shift
+ sqrshrn2 v1.8h, v3.4s, #\shift
+ sqrshrn v2.4h, v7.4s, #\shift
+ sqrshrn2 v2.8h, v6.4s, #\shift
+ sqrshrn v3.4h, v5.4s, #\shift
+ sqrshrn2 v3.8h, v4.4s, #\shift
+ st1 {v0.8h, v1.8h}, [x6], #32
+ rev64 v2.8h, v2.8h
+ rev64 v3.8h, v3.8h
+ st1 {v2.8h, v3.8h}, [x6], #32
+.endm
+
+ store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift
+ store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift
+ store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift
+ store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift
+.purgem store2
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+ // Second (16-bit) pass of a 32-point DCT for an 8-wide column strip,
+ // followed by the add to the destination.
+ // Register interface, as used below:
+ // x6 = destination write pointer, x1 = destination stride
+ // x7 = intermediate buffer, x8 = its row stride in bytes (doubled here)
+ // x14 = holds the return address
+ // The even rows go through inv_dct_8h_x16_neon and are written back to
+ // the buffer; the odd rows go through inv_dct32_odd_8h_x16_neon and stay
+ // in v16-v31. The combine macro then forms even +/- odd, rounds by 4 bits,
+ // adds to the destination pixels and limits the result to v1 (0x3ff).
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl X(inv_dct_8h_x16_neon)
+
+ // Write the even-half result back over the rows it was loaded from.
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1 // one original row further: the odd rows
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1 // back to the first even row
+ bl X(inv_dct32_odd_8h_x16_neon)
+
+ neg x9, x8 // negative stride for the backward walk below
+ mov x10, x6 // separate read pointer into the destination
+ mvni v1.8h, #0xfc, lsl #8 // 0x3ff
+ // Processes four output rows: loads four even-half rows from [x7] (step
+ // \stride) and four destination rows from [x10], applies \op with the
+ // odd-half registers \r0-\r3, rounds by 4 bits, accumulates into the
+ // pixels with unsigned saturation, limits to v1 and stores to [x6].
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8h}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8h}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8h}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ usqadd v2.8h, v5.8h
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ ld1 {v5.8h}, [x7], \stride
+ usqadd v3.8h, v6.8h
+ smin v2.8h, v2.8h, v1.8h
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8h}, [x6], x1
+ ld1 {v2.8h}, [x10], x1
+ usqadd v4.8h, v7.8h
+ smin v3.8h, v3.8h, v1.8h
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8h}, [x6], x1
+ usqadd v2.8h, v5.8h
+ smin v4.8h, v4.8h, v1.8h
+ st1 {v4.8h}, [x6], x1
+ smin v2.8h, v2.8h, v1.8h
+ st1 {v2.8h}, [x6], x1
+.endm
+ // Output rows 0-15: even rows walked forward, odd half taken from v31
+ // down to v16, combined with sqadd.
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ sub x7, x7, x8 // step back onto the last even row
+ // Output rows 16-31: even rows walked backward, odd half taken from v16
+ // up to v31, combined with sqsub.
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ ret x14
+endfunc
+
+const eob_32x32 // thresholds compared against w3 (presumably the eob argument) to decide whether further coefficient strips need processing
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32 // same kind of thresholds, used by the 16x32/32x16/64x16/16x64 functions below
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside // 4-entry variant selected through the wshort/hshort arguments of def_identity_1632
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32 // thresholds used by the 8x32 and 32x8 functions
+ .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 // identity/identity 32x32: works in 8x8 blocks; x0 = dst, x1 = dst stride, x2 = coefficients, w3 = value compared against eob_32x32 (presumably eob)
+ movi v0.8h, #0 // v0/v1 = zero, stored over each coefficient row after it is loaded
+ movi v1.8h, #0
+ movrel x13, eob_32x32, 2 // outer-loop thresholds: start at byte offset 2 and step by 4, i.e. every other table entry
+
+ mov x8, #4*32 // byte step between consecutive loads from x2 (32 entries of 4 bytes)
+1: // outer loop
+ mov w9, #0 // w9 = number of pixels already advanced horizontally in this pass
+ movrel x12, eob_32x32, 2 // inner-loop thresholds restart for every pass
+2: // inner loop: one 8x8 block
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2] // load 8 coefficients ...
+ st1 {v0.4s, v1.4s}, [x2], x8 // ... zero them in memory and step by x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ sqxtn v16.4h, v16.4s // saturating narrow 32 -> 16 bit, packing each pair into one .8h register
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2 // macro defined elsewhere; presumably adds the shifted 8x8 block to the pixels at x0
+ ldrh w11, [x12], #4 // next inner threshold
+ sub x0, x0, x1, lsl #3 // move dst back up 8 rows
+ add x0, x0, #2*8 // and 8 pixels (2 bytes each) to the right
+ cmp w3, w11
+ b.ge 2b // continue with the next block while w3 >= threshold
+
+ ldrh w11, [x13], #4 // next outer threshold
+ cmp w3, w11
+ b.lt 9f // done once w3 is below it
+
+ sub x0, x0, w9, uxtw #1 // dst back to the left edge (w9 pixels of 2 bytes)
+ add x0, x0, x1, lsl #3 // dst down 8 rows
+ msub x2, x8, x9, x2 // rewind the coefficient pointer by the w9 steps taken in this pass
+ add x2, x2, #4*8 // advance it by 8 coefficients (4 bytes each)
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_16_regs op, shift // apply the shift instruction op with immediate shift, in place, to each of v16-v31 (.4s)
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort // emits the identity/identity function for w x h (16x32 or 32x16); wshort/hshort are suffixes choosing the eob table per direction
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movz w16, #2896*8, lsl #16 // constant handed to scale_input below (2896/4096 is about 1/sqrt(2))
+ movz w17, #2*(5793-4096)*8, lsl #16 // constant handed to the identity_4x16 macros below
+ movi v0.4s, #0 // v0/v1 = zero, used to clear coefficients after loading
+ movi v1.4s, #0
+ movrel x13, eob_16x32\hshort, 2 // outer-loop thresholds, read from byte offset 2 in steps of 4
+
+ mov x8, #4*\h // byte step between consecutive loads from x2
+1: // outer loop
+ mov w9, #0 // w9 = pixels advanced horizontally in this pass
+ movrel x12, eob_16x32\wshort, 2 // inner-loop thresholds restart for every pass
+2: // inner loop: one 8x8 block
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2] // load 8 coefficients, then zero them and step by x8
+ st1 {v0.4s, v1.4s}, [x2], x8
+ dup v2.2s, w16 // v2.s[0] = scale constant (rebuilt each iteration)
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ mov v2.s[1], w17 // v2.s[1] = identity constant
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 // macro defined elsewhere; scales all 16 registers by v2.s[0]
+ scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+ // 16x32
+ identity_4x16_shift1 v2.s[1]
+.else
+ // 32x16
+ shift_16_regs sqshl, 1 // saturating left shift by 1 of v16-v31 before the identity step
+ identity_4x16 v2.s[1]
+.endif
+ sqxtn v16.4h, v16.4s // saturating narrow 32 -> 16 bit, two source registers per .8h result
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2 // macro defined elsewhere; presumably adds the block to the pixels at x0
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #4 // next inner threshold
+ sub x0, x0, x1, lsl #3 // dst back up 8 rows
+ add x0, x0, #16 // and 16 bytes to the right
+ cmp w3, w11
+ b.ge 2b // next block while w3 >= threshold
+
+ ldrh w11, [x13], #4 // next outer threshold
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1 // dst back to the left edge
+ add x0, x0, x1, lsl #3 // dst down 8 rows
+ msub x2, x8, x9, x2 // rewind the coefficient pointer by the w9 steps of this pass
+ add x2, x2, #4*8 // advance by 8 coefficients
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside, // 16x32: inner loop uses eob_16x32_shortside, outer loop eob_16x32
+def_identity_1632 32, 16, , _shortside // 32x16: inner loop uses eob_16x32, outer loop eob_16x32_shortside
+
+.macro def_identity_832 w, h // emits the identity/identity function for w x h (8x32 or 32x8), processed as a single chain of 8x8 blocks
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movi v0.4s, #0 // v0/v1 = zero, used to clear coefficients after loading
+ movi v1.4s, #0
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ movrel x13, eob_8x32, 2
+
+ mov w8, #4*\h // byte step between consecutive loads from x2 (used as x8 below)
+1:
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ ldrh w12, [x13], #4
+ ld1 {v16.4s, v17.4s}, [x2] // load 8 coefficients, then zero them and step by x8
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+
+.if \w == 8
+ sqrshrn v16.4h, v16.4s, #1 // 8x32: rounding shift right by 1 while narrowing to 16 bit
+ sqrshrn2 v16.8h, v17.4s, #1
+ sqrshrn v17.4h, v18.4s, #1
+ sqrshrn2 v17.8h, v19.4s, #1
+ sqrshrn v18.4h, v20.4s, #1
+ sqrshrn2 v18.8h, v21.4s, #1
+ sqrshrn v19.4h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ sqrshrn v20.4h, v24.4s, #1
+ sqrshrn2 v20.8h, v25.4s, #1
+ sqrshrn v21.4h, v26.4s, #1
+ sqrshrn2 v21.8h, v27.4s, #1
+ sqrshrn v22.4h, v28.4s, #1
+ sqrshrn2 v22.8h, v29.4s, #1
+ sqrshrn v23.4h, v30.4s, #1
+ sqrshrn2 v23.8h, v31.4s, #1
+.else
+ sqxtn v16.4h, v16.4s // 32x8: plain saturating narrow, no shift
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+
+ cmp w3, w12 // flags are consumed by the b.lt after the macro below, so the macro must leave them intact
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2 // macro defined elsewhere; presumably adds the block to the pixels at x0
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f // stop once w3 is below the threshold loaded at the top of the loop
+.if \w == 8
+ sub x2, x2, x8, lsl #3 // 8x32: rewind the 8 load steps ...
+ add x2, x2, #4*8 // ... and advance by 8 coefficients; x0 is left where the macro put it
+.else
+ sub x0, x0, x1, lsl #3 // 32x8: dst back up 8 rows ...
+ add x0, x0, #2*8 // ... and 8 pixels to the right; x2 simply continues
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 // 32x32 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_32x32 (presumably eob)
+ idct_dc 32, 32, 2 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #2048 // 32*32 16-bit intermediates on the stack
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2 // first threshold
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, one strip of 4 per iteration
+ add x6, sp, #(\i*32*2) // output position of this strip in the stack buffer
+.if \i > 0
+ mov w8, #(32 - \i) // remaining count, consumed by the zero-fill loop at 1: if we bail out
+ cmp w3, w12
+ b.lt 1f // w3 below threshold: remaining strips are zero-filled instead
+.if \i < 28
+ ldrh w12, [x13], #2 // threshold for the next strip
+.endif
+.endif
+ add x7, x2, #(\i*4) // input position of this strip (4-byte coefficients)
+ mov x8, #32*4 // input stride in bytes
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 4*64 bytes and counts w8 down by 4
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24 // second pass, 8 columns per iteration
+ add x6, x0, #(\i*2) // destination pixels (2 bytes each)
+ add x7, sp, #(\i*2) // matching column in the intermediate buffer
+ mov x8, #32*2 // intermediate row stride in bytes
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 // 16x32 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_16x32 (presumably eob)
+ idct_dc 16, 32, 1 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #1024 // 16*32 16-bit intermediates on the stack
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2 // first threshold
+ adr x4, inv_dct_4s_x16_neon // transform routine address; presumably called through x4 by inv_txfm_horz_scale_16x4_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, one strip of 4 per iteration
+ add x6, sp, #(\i*16*2) // output position of this strip
+ add x7, x2, #(\i*4) // input position of this strip
+.if \i > 0
+ mov w8, #(32 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12
+ b.lt 1f // w3 below threshold: zero-fill the rest
+.if \i < 28
+ ldrh w12, [x13], #2 // threshold for the next strip
+.endif
+.endif
+ mov x8, #4*32 // input stride in bytes
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 2*64 bytes and counts w8 down by 4
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8 // second pass, 8 columns per iteration
+ add x6, x0, #(\i*2) // destination pixels
+ add x7, sp, #(\i*2) // matching column in the intermediate buffer
+ mov x8, #16*2 // intermediate row stride in bytes
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 // 32x16 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_16x32 (presumably eob)
+ idct_dc 32, 16, 1 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #1024 // 32*16 16-bit intermediates on the stack
+
+ movrel x13, eob_16x32
+ movrel x5, X(inv_dct_8h_x16_neon) // transform routine address; presumably called through x5 by inv_txfm_add_vert_8x16_neon
+ ldrh w12, [x13], #2 // first threshold
+
+.irp i, 0, 4, 8, 12 // first pass, one strip of 4 per iteration
+ add x6, sp, #(\i*32*2) // output position of this strip
+ add x7, x2, #(\i*4) // input position of this strip
+.if \i > 0
+ mov w8, #(16 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12
+ b.lt 1f // w3 below threshold: zero-fill the rest
+.if \i < 12
+ ldrh w12, [x13], #2 // threshold for the next strip
+.endif
+.endif
+ mov x8, #4*16 // input stride in bytes
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 4*64 bytes and counts w8 down by 4
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24 // second pass, 8 columns per iteration
+ add x6, x0, #(\i*2) // destination pixels
+ add x7, sp, #(\i*2) // matching column in the intermediate buffer
+ mov x8, #32*2 // intermediate row stride in bytes
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 // 8x32 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_8x32 (presumably eob)
+ idct_dc 8, 32, 2 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #512 // 8*32 16-bit intermediates on the stack
+
+ movrel x13, eob_8x32
+
+ movi v28.4s, #0 // zero, used to clear coefficients and later for the zero-fill
+ mov x8, #4*32 // byte step between consecutive loads
+ mov w9, #32 // remaining count, decremented by 4 per strip
+ mov x6, sp // intermediate output pointer
+ mov x7, x2 // coefficient input pointer
+1: // first pass, one strip of 4 per iteration
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().4s}, [x7] // load 4 coefficients ...
+ st1 {v28.4s}, [x7], x8 // ... zero them and step by x8
+.endr
+ ldrh w12, [x13], #2 // threshold deciding whether another strip follows
+ sub w9, w9, #4
+ sub x7, x7, x8, lsl #3 // rewind the 8 load steps
+ add x7, x7, #4*4 // advance by 4 coefficients
+
+ bl inv_dct_4s_x8_neon
+
+ sqrshrn v16.4h, v16.4s, #2 // rounding shift right by 2 while narrowing; v16-v19 go to the low halves
+ sqrshrn v17.4h, v17.4s, #2
+ sqrshrn v18.4h, v18.4s, #2
+ sqrshrn v19.4h, v19.4s, #2
+ sqrshrn2 v16.8h, v20.4s, #2 // v20-v23 go to the high halves
+ sqrshrn2 v17.8h, v21.4s, #2
+ sqrshrn2 v18.8h, v22.4s, #2
+ sqrshrn2 v19.8h, v23.4s, #2
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ cmp w3, w12
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 // store does not touch the flags set by cmp
+
+ b.ge 1b // another strip while w3 >= threshold
+ cbz w9, 3f // nothing left to zero-fill
+
+ movi v29.8h, #0 // v28 is reused as zero here; this relies on the transform call above not clobbering it
+ movi v30.8h, #0
+ movi v31.8h, #0
+2: // zero-fill: each iteration writes 64 bytes and counts w9 down by 4
+ subs w9, w9, #4
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+ b.gt 2b
+
+3: // second pass: a single call covers all 8 columns
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2 // intermediate row stride in bytes
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 // 32x8 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against 10 (presumably eob)
+ idct_dc 32, 8, 2 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+ sub sp, sp, #512 // 32*8 16-bit intermediates on the stack
+
+.irp i, 0, 4 // first pass: two strips of 4
+ add x6, sp, #(\i*32*2) // output position of this strip
+ add x7, x2, #(\i*4) // input position of this strip
+.if \i > 0
+ cmp w3, #10
+ b.lt 1f // w3 < 10: the second strip is zero-filled instead
+.endif
+ mov x8, #8*4 // input stride in bytes
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1: // zero-fill the second strip (4*64 bytes)
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+ mov x8, #2*32 // intermediate row stride in bytes
+ mov w9, #0 // column counter in pixels
+1: // second pass, 8 columns per iteration
+ add x6, x0, x9, lsl #1 // destination pixels for this column group
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23 // load the 8 intermediate rows of this column group
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp w9, #32 // flags are consumed by the b.lt after the macro below, so the macro must leave them intact
+
+ load_add_store_8x8 x6, x7 // macro defined elsewhere; presumably adds the block to the pixels at x6
+
+ b.lt 1b // loop until all 32 columns are done
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_dct64_step1_neon // one group of four inputs (v16-v19) -> eight intermediates stored at x6; x17 walks the coefficient table; v4/v5 = clip min/max set by the caller
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.4s, v1.4s}, [x17], #32 // eight multipliers for this group; x17 advances
+
+ sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a
+ sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a
+ sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a
+ sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a
+ sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a
+ sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a
+ sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a
+ sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a
+
+ ld1 {v0.4s}, [x17], #16 // four more coefficients for the rotations below
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t33
+ sqsub v26.4s, v19.4s, v18.4s // t34
+ sqadd v27.4s, v19.4s, v18.4s // t35
+ sqadd v28.4s, v20.4s, v21.4s // t60
+ sqsub v29.4s, v20.4s, v21.4s // t61
+ sqsub v30.4s, v23.4s, v22.4s // t62
+ sqadd v31.4s, v23.4s, v22.4s // t63
+
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31 // clamp to the maximum in v5
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31 // clamp to the minimum in v4
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ neg v2.4s, v2.4s // t34a
+ mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
+ srshr v26.4s, v2.4s, #12 // t34a
+ mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
+ srshr v29.4s, v7.4s, #12 // t61a
+ srshr v25.4s, v6.4s, #12 // t33a
+ srshr v30.4s, v2.4s, #12 // t62a
+
+ sqadd v16.4s, v24.4s, v27.4s // t32a
+ sqsub v19.4s, v24.4s, v27.4s // t35a
+ sqadd v17.4s, v25.4s, v26.4s // t33
+ sqsub v18.4s, v25.4s, v26.4s // t34
+ sqsub v20.4s, v31.4s, v28.4s // t60a
+ sqadd v23.4s, v31.4s, v28.4s // t63a
+ sqsub v21.4s, v30.4s, v29.4s // t61
+ sqadd v22.4s, v30.4s, v29.4s // t62
+
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22 // clamp to the maximum in v5
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22 // clamp to the minimum in v4
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
+ srshr v21.4s, v2.4s, #12 // t61a
+ srshr v18.4s, v7.4s, #12 // t34a
+ mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
+ srshr v20.4s, v6.4s, #12 // t60
+ srshr v19.4s, v2.4s, #12 // t35
+
+ st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 // write the eight results; x6 advances by 128 bytes in total
+ st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon // in-place second stage over the step1 output: x6 walks upwards, x9 downwards, until they meet; v4/v5 = clip min/max
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16] // first four idct coefficients
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #4*4*0] // t32a
+ ldr q17, [x9, #4*4*8] // t39a
+ ldr q18, [x9, #4*4*0] // t63a
+ ldr q19, [x6, #4*4*8] // t56a
+ ldr q20, [x6, #4*4*16] // t40a
+ ldr q21, [x9, #4*4*24] // t47a
+ ldr q22, [x9, #4*4*16] // t55a
+ ldr q23, [x6, #4*4*24] // t48a
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t39
+ sqadd v26.4s, v18.4s, v19.4s // t63
+ sqsub v27.4s, v18.4s, v19.4s // t56
+ sqsub v28.4s, v21.4s, v20.4s // t40
+ sqadd v29.4s, v21.4s, v20.4s // t47
+ sqadd v30.4s, v23.4s, v22.4s // t48
+ sqsub v31.4s, v23.4s, v22.4s // t55
+
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31 // clamp to the maximum in v5
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31 // clamp to the minimum in v4
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
+ srshr v25.4s, v2.4s, #12 // t56a
+ srshr v27.4s, v7.4s, #12 // t39a
+ neg v6.4s, v6.4s // t40a
+ mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
+ srshr v31.4s, v6.4s, #12 // t40a
+ srshr v28.4s, v2.4s, #12 // t55a
+
+ sqadd v16.4s, v24.4s, v29.4s // t32a
+ sqsub v19.4s, v24.4s, v29.4s // t47a
+ sqadd v17.4s, v27.4s, v31.4s // t39
+ sqsub v18.4s, v27.4s, v31.4s // t40
+ sqsub v20.4s, v26.4s, v30.4s // t48a
+ sqadd v23.4s, v26.4s, v30.4s // t63a
+ sqsub v21.4s, v25.4s, v28.4s // t55
+ sqadd v22.4s, v25.4s, v28.4s // t56
+
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22 // clamp to the maximum in v5
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22 // clamp to the minimum in v4
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
+ srshr v18.4s, v2.4s, #12 // t40a
+ srshr v21.4s, v7.4s, #12 // t55a
+ mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
+ srshr v19.4s, v6.4s, #12 // t47
+ srshr v20.4s, v2.4s, #12 // t48
+
+ str q16, [x6, #4*4*0] // t32a
+ str q17, [x9, #4*4*0] // t39
+ str q18, [x6, #4*4*8] // t40a
+ str q19, [x9, #4*4*8] // t47
+ str q20, [x6, #4*4*16] // t48
+ str q21, [x9, #4*4*16] // t55a
+ str q22, [x6, #4*4*24] // t56
+ str q23, [x9, #4*4*24] // t63a
+
+ add x6, x6, #4*4 // move the lower pointer up by one vector
+ sub x9, x9, #4*4 // and the upper pointer down by one vector
+ cmp x6, x9
+ b.lt 1b // repeat until the pointers meet
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear // load v16-v23 (.4s) from src with post-increment strd; if clear is set, also overwrite each source row with the zero register
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst // store v16-v31 (.4s) contiguously at dst, advancing dst by 16 bytes each
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8 // zero v24-v31
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ movi \i, #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond // emit movi only when cond is non-zero at assembly time
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond // when cond is set: gpr = val << 16, then broadcast gpr into reg
+.if \cond
+ movz \gpr, \val, lsl #16
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond // emit st1 only when cond is non-zero
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond // emit str only when cond is non-zero
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond // like str_if, for an address written as two comma-separated parts (base, offset)
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 // invoke scale_input on the given registers only when cond is non-zero
+.if \cond
+ scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0 // emits inv_txfm_dct<suffix>_4s_x64_neon; clear = zero the input after reading, scale = pre-scale the input by 2896/4096
+function inv_txfm_dct\suffix\()_4s_x64_neon // 64-point DCT on 4 lanes: input at x7 with stride x8, output written to the scratch area at sp
+ mov x14, x30 // keep the return address across the bl calls
+ mov x6, sp // output/scratch pointer
+ lsl x8, x8, #2 // stride * 4: the first load takes every fourth input row
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8 // the upper eight inputs of the 16-point transform are zero
+ sub x7, x7, x8, lsl #3 // rewind the 8 load steps
+ add x7, x7, x8, lsr #1 // advance by half a step (two input rows)
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ store16 x6 // park the 16 results at the start of the scratch area; x6 advances by 256 bytes
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3 // rewind the 8 load steps
+ lsr x8, x8, #1 // x8 is now twice the original stride
+ sub x7, x7, x8, lsr #1 // back by one original row
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_4s_x16_neon
+
+ add x10, x6, #16*15 // x10 = last of the 32 vector slots, written backwards
+ sub x6, x6, #16*16 // x6 = back at the first slot
+
+ mov x9, #-16 // negative step for the x10 stores
+
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+.macro store_addsub r0, r1, r2, r3 // for four slots: stored value + r goes back to the front (x6), stored value - r goes to the back (x10); both clamped to v0/v1
+ ld1 {v2.4s}, [x6], #16
+ ld1 {v3.4s}, [x6], #16
+ sqadd v6.4s, v2.4s, \r0
+ sqsub \r0, v2.4s, \r0
+ ld1 {v4.4s}, [x6], #16
+ sqadd v7.4s, v3.4s, \r1
+ sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
+ ld1 {v5.4s}, [x6], #16
+ sqadd v2.4s, v4.4s, \r2
+ sub x6, x6, #16*4 // rewind to overwrite the four slots just read
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
+ sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
+ st1 {v6.4s}, [x6], #16
+ st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
+ sqadd v3.4s, v5.4s, \r3
+ sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
+ st1 {v7.4s}, [x6], #16
+ st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
+ st1 {v2.4s}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.4s}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.4s, v30.4s, v29.4s, v28.4s
+ store_addsub v27.4s, v26.4s, v25.4s, v24.4s
+ store_addsub v23.4s, v22.4s, v21.4s, v20.4s
+ store_addsub v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+ add x6, x6, #4*4*16 // skip to just past the 32 slots written above: output area for the step1 calls
+
+ movrel x17, idct64_coeffs // coefficient table consumed by inv_dct64_step1_neon
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.4s}, [x7] // in1 (offset 0)
+ ld1 {v17.4s}, [x9] // in31 (offset 15)
+ ld1 {v18.4s}, [x10] // in17 (offset 8)
+ ld1 {v19.4s}, [x11] // in15 (offset 7)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.4s}, [x10] // in7 (offset 3)
+ ld1 {v17.4s}, [x11] // in25 (offset 12)
+ ld1 {v18.4s}, [x9] // in23 (offset 11)
+ ld1 {v19.4s}, [x7] // in9 (offset 4)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #4*4*32 // rewind over the 4 * 128 bytes written by the step1 calls
+ add x9, x6, #4*4*7 // upper pointer for step2, 7 vectors above x6
+
+ bl inv_dct64_step2_neon
+
+ ret x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1 // inv_txfm_dct_clear_4s_x64_neon
+def_dct64_func _clear_scale, clear=1, scale=1 // inv_txfm_dct_clear_scale_4s_x64_neon
+
+
+function inv_txfm_horz_dct_64x4_neon // final add/sub stage for a 64x4 strip: reads the 32-bit scratch at sp from both ends, shifts by w12, writes 16-bit rows at x6
+ mov x14, x30
+
+ mov x7, sp // front pointer into the scratch buffer
+ add x8, sp, #4*4*(64 - 4) // back pointer: last four vectors of the 64
+ add x9, x6, #2*56 // right-hand output pointer, 56 pixels into the row
+ mov x10, #2*64 // output row stride in bytes
+ mov x11, #-4*4*4 // back pointer step: four vectors backwards
+
+ dup v7.4s, w12 // per-lane shift for srshl; callers pass a negative value, i.e. a rounding right shift
+1:
+ ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+ ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+
+.macro store_addsub src0, src1, src2, src3 // one output row: sums go to the left part (x6), reversed differences to the right part (x9)
+ sqsub v1.4s, \src0, \src1
+ sqadd v0.4s, \src0, \src1
+ sqsub v3.4s, \src2, \src3
+ srshl v1.4s, v1.4s, v7.4s
+ sqadd v2.4s, \src2, \src3
+ srshl v3.4s, v3.4s, v7.4s
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v2.4s, v2.4s, v7.4s
+ sqxtn v3.4h, v3.4s // saturating narrow to 16 bit
+ sqxtn2 v3.8h, v1.4s
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v2.4s
+ rev64 v3.8h, v3.8h // reverse the element order within each 64-bit half
+ st1 {v0.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.4s, v31.4s, v20.4s, v27.4s
+ store_addsub v17.4s, v30.4s, v21.4s, v26.4s
+ store_addsub v18.4s, v29.4s, v22.4s, v25.4s
+ store_addsub v19.4s, v28.4s, v23.4s, v24.4s
+.purgem store_addsub
+ sub x6, x6, x10, lsl #2 // back up the four rows just written
+ sub x9, x9, x10, lsl #2
+ add x6, x6, #16 // left pointer moves 8 pixels right
+ sub x9, x9, #16 // right pointer moves 8 pixels left
+
+ cmp x7, x8
+ b.lt 1b // loop until the scratch pointers meet
+ ret x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon // final add/sub stage for 8 columns of a 64-point column DCT: reads the 16-bit scratch at sp, adds to the pixels at x6 (stride x1)
+ mov x14, x30
+ lsl x8, x8, #1 // NOTE(review): x8 is overwritten two instructions below before being read, so this shift appears to have no effect
+
+ mov x7, sp // front pointer into the scratch buffer
+ add x8, sp, #2*8*(64 - 4) // back pointer: last four rows of the 64
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1 // x9 = destination row 63
+ neg x10, x1 // negative destination stride for walking upwards
+ mov x11, #-2*8*4 // back pointer step: four rows backwards
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3 // two rows from the top (sums, via x6) and two from the bottom (differences, via x9): rounding >>4, add to pixels, clamp to v7
+ ld1 {v0.8h}, [x6], x1
+ ld1 {v1.8h}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8h}, [x6]
+ sqsub \src0, \src0, \src1
+ ld1 {v3.8h}, [x9]
+ sqadd v5.8h, \src2, \src3
+ sqsub \src2, \src2, \src3
+ sub x6, x6, x1 // back to the first of the two rows for the stores below
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr \src0, \src0, #4
+ usqadd v0.8h, v4.8h // unsigned pixel + signed residual, saturating
+ srshr \src2, \src2, #4
+ usqadd v1.8h, \src0
+ usqadd v2.8h, v5.8h
+ smin v0.8h, v0.8h, v7.8h // clamp to the maximum in v7
+ usqadd v3.8h, \src2
+ smin v1.8h, v1.8h, v7.8h
+ st1 {v0.8h}, [x6], x1
+ smin v2.8h, v2.8h, v7.8h
+ st1 {v1.8h}, [x9], x10
+ smin v3.8h, v3.8h, v7.8h
+ st1 {v2.8h}, [x6], x1
+ st1 {v3.8h}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b // loop until the scratch pointers meet
+
+ ret x14
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 // 64x64 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_32x32 (presumably eob)
+ idct_dc 64, 64, 2 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 64*32*2+64*4*4 // 16-bit intermediate buffer plus a scratch area at the bottom of the stack
+ add x5, sp, #64*4*4 // x5 = intermediate buffer, above the scratch area that stays at sp
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, one strip of 4 per iteration
+ add x6, x5, #(\i*64*2) // output position of this strip
+.if \i > 0
+ mov w8, #(32 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12 // w12 was loaded at the end of the previous iteration
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4) // input position of this strip
+ mov x8, #32*4 // input stride in bytes
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x5, #(\i*64*2) // x6 was used as scratch pointer by the call above; restore it
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2 // x12 doubled as the shift argument, so the next threshold is loaded only now
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 4*64 bytes and counts w8 down by 2
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56 // second pass, 8 columns per iteration
+ add x7, x5, #(\i*2) // column in the intermediate buffer
+ mov x8, #64*2 // intermediate row stride in bytes
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2) // destination pixels
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2 // release buffer and scratch in one step
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 // 64x32 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_32x32 (presumably eob)
+ idct_dc 64, 32, 1 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 64*32*2+64*4*4 // 16-bit intermediate buffer plus a scratch area at the bottom of the stack
+ add x5, sp, #64*4*4 // x5 = intermediate buffer, above the scratch area that stays at sp
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, one strip of 4 per iteration
+ add x6, x5, #(\i*64*2) // output position of this strip
+.if \i > 0
+ mov w8, #(32 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12 // w12 was loaded at the end of the previous iteration
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4) // input position of this strip
+ mov x8, #32*4 // input stride in bytes
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_4s_x64_neon
+ add x6, x5, #(\i*64*2) // x6 was used as scratch pointer by the call above; restore it
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2 // x12 doubled as the shift argument, so the next threshold is loaded only now
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 4*64 bytes and counts w8 down by 2
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56 // second pass, 8 columns per iteration
+ add x6, x0, #(\i*2) // destination pixels
+ add x7, x5, #(\i*2) // column in the intermediate buffer
+ mov x8, #64*2 // intermediate row stride in bytes
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2 // release buffer and scratch in one step
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 // 32x64 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_32x32 (presumably eob)
+ idct_dc 32, 64, 1 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 32*32*2+64*8*2 // 16-bit intermediate buffer plus a scratch area at the bottom of the stack
+ add x5, sp, #64*8*2 // x5 = intermediate buffer, above the scratch area that stays at sp
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2 // first threshold
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, one strip of 4 per iteration
+ add x6, x5, #(\i*32*2) // output position of this strip
+.if \i > 0
+ mov w8, #(32 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12
+ b.lt 1f // w3 below threshold: zero-fill the rest
+ ldrh w12, [x13], #2 // threshold for the next strip
+.endif
+ add x7, x2, #(\i*4) // input position of this strip
+ mov x8, #32*4 // input stride in bytes
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 4*64 bytes and counts w8 down by 4
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24 // second pass, 8 columns per iteration
+ add x7, x5, #(\i*2) // column in the intermediate buffer
+ mov x8, #32*2 // intermediate row stride in bytes
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2) // destination pixels
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2 // release buffer and scratch in one step
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 // 64x16 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_16x32 (presumably eob)
+ idct_dc 64, 16, 2 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 64*16*2+64*4*4 // 16-bit intermediate buffer plus a scratch area at the bottom of the stack
+ add x4, sp, #64*4*4 // x4 = intermediate buffer (x5 is needed for a routine address below)
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 4, 8, 12 // first pass, one strip of 4 per iteration
+ add x6, x4, #(\i*64*2) // output position of this strip
+.if \i > 0
+ mov w8, #(16 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12 // w12 was loaded at the end of the previous iteration
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4) // input position of this strip
+ mov x8, #16*4 // input stride in bytes
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x4, #(\i*64*2) // x6 was used as scratch pointer by the call above; restore it
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh w12, [x13], #2 // x12 doubled as the shift argument, so the next threshold is loaded only now
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 4*64 bytes and counts w8 down by 2
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ movrel x5, X(inv_dct_8h_x16_neon) // transform routine address; presumably called through x5 by inv_txfm_add_vert_8x16_neon
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56 // second pass, 8 columns per iteration
+ add x6, x0, #(\i*2) // destination pixels
+ add x7, x4, #(\i*2) // column in the intermediate buffer
+ mov x8, #64*2 // intermediate row stride in bytes
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2 // release buffer and scratch in one step
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 // 16x64 DCT/DCT: x0 = dst, x2 = coefficients, w3 = value compared against eob_16x32 (presumably eob)
+ idct_dc 16, 64, 2 // macro defined elsewhere; presumably handles the DC-only case
+
+ mov x15, x30 // keep the return address across the bl calls
+
+ sub_sp 16*32*2+64*8*2 // 16-bit intermediate buffer plus a scratch area at the bottom of the stack
+ add x5, sp, #64*8*2 // x5 = intermediate buffer, above the scratch area that stays at sp
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2 // first threshold
+
+ adr x4, inv_dct_4s_x16_neon // transform routine address; presumably called through x4 by inv_txfm_horz_16x4_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, one strip of 4 per iteration
+ add x6, x5, #(\i*16*2) // output position of this strip
+.if \i > 0
+ mov w8, #(32 - \i) // remaining count for the zero-fill loop at 1:
+ cmp w3, w12
+ b.lt 1f // w3 below threshold: zero-fill the rest
+.if \i < 28
+ ldrh w12, [x13], #2 // threshold for the next strip
+.endif
+.endif
+ add x7, x2, #(\i*4) // input position of this strip
+ mov x8, #32*4 // input stride in bytes
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2: // zero-fill: each iteration writes 2*64 bytes and counts w8 down by 4
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8 // second pass, 8 columns per iteration
+ add x7, x5, #(\i*2) // column in the intermediate buffer
+ mov x8, #16*2 // intermediate row stride in bytes
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2) // destination pixels
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2 // release buffer and scratch in one step
+ ret x15
+endfunc
diff --git a/third_party/dav1d/src/arm/64/loopfilter.S b/third_party/dav1d/src/arm/64/loopfilter.S
new file mode 100644
index 0000000000..63d5de10ad
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/loopfilter.S
@@ -0,0 +1,1129 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
+.macro loop_filter wd // emits lpf_16_wd<wd>_neon (wd = 4, 6, 8 or 16): filters 16 pixels along an edge, one pixel per byte lane
+function lpf_16_wd\wd\()_neon // in: v17-v30 = p6..q6 (v22-v25 = p1..q1), v10 = E, v11 = I, v12 = H, v13/v14/v15 = lane masks for wd >= 4 / wd > 4 / wd == 16; out: x14 status code
+ uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
+ uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0)
+ uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0)
+ uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1)
+.if \wd >= 6
+ uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
+ uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
+ uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ umax v4.16b, v4.16b, v5.16b
+.endif
+ uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2
+.if \wd >= 8
+ umax v6.16b, v6.16b, v7.16b
+.endif
+ ushr v3.16b, v3.16b, #1
+.if \wd >= 8
+ umax v4.16b, v4.16b, v6.16b
+.endif
+.if \wd >= 6
+ and v4.16b, v4.16b, v14.16b // outer differences only count in lanes with wd > 4
+.endif
+ umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0))
+ uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ umax v4.16b, v0.16b, v4.16b
+ cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+ cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ and v1.16b, v1.16b, v2.16b // fm
+ and v1.16b, v1.16b, v13.16b // fm && wd >= 4
+.if \wd >= 6
+ and v14.16b, v14.16b, v1.16b // fm && wd > 4
+.endif
+.if \wd >= 16
+ and v15.16b, v15.16b, v1.16b // fm && wd == 16
+.endif
+
+ mov x16, v1.d[0] // fold the 128-bit mask into a scalar test:
+ mov x17, v1.d[1] // mask bytes are 0x00/0xff, so the sum of the halves is zero only when no lane is set
+ adds x16, x16, x17
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0) // nothing was modified: the caller stores 0 pixels
+ ret
+9:
+.if \wd >= 6
+ movi v10.16b, #1 // flatness threshold; v10 is free because E was last used above
+ uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
+ uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0)
+ uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0)
+ uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0)
+.if \wd >= 8
+ uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0)
+ uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0)
+.endif
+ umax v2.16b, v2.16b, v3.16b
+ umax v4.16b, v4.16b, v5.16b
+.if \wd >= 8
+ umax v6.16b, v6.16b, v7.16b
+.endif
+ umax v2.16b, v2.16b, v4.16b
+.if \wd >= 8
+ umax v2.16b, v2.16b, v6.16b
+.endif
+
+.if \wd == 16
+ uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0)
+ uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0)
+ uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0)
+.endif
+ cmhs v2.16b, v10.16b, v2.16b // flat8in
+.if \wd == 16
+ uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0)
+ uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0)
+ uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0)
+.endif
+ and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
+ bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ umax v3.16b, v3.16b, v4.16b
+ umax v5.16b, v5.16b, v6.16b
+.endif
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+.if \wd == 16
+ umax v7.16b, v7.16b, v8.16b
+ umax v3.16b, v3.16b, v5.16b
+ umax v3.16b, v3.16b, v7.16b
+ cmhs v3.16b, v10.16b, v3.16b // flat8out
+.endif
+ adds x16, x16, x17 // Z set when no lane needs the narrow (wd == 4) filter; the vector ops below leave the flags intact
+.if \wd == 16
+ and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
+ and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+ bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd > 4 && !flat8out
+.endif
+ b.eq 1f // skip wd == 4 case
+.endif
+ movi v3.16b, #128
+ eor v2.16b, v22.16b, v3.16b // p1 - 128
+ eor v3.16b, v25.16b, v3.16b // q1 - 128
+ cmhi v0.16b, v0.16b, v12.16b // hev
+ sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1)
+ and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
+ bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
+ usubl v2.8h, v24.8b, v23.8b // q0 - p0, widened to 16 bits
+ movi v5.8h, #3
+ usubl2 v3.8h, v24.16b, v23.16b
+ mul v2.8h, v2.8h, v5.8h // 3 * (q0 - p0)
+ mul v3.8h, v3.8h, v5.8h
+ movi v6.16b, #4
+ saddw v2.8h, v2.8h, v4.8b // + (hev ? iclip_diff(p1 - q1) : 0)
+ saddw2 v3.8h, v3.8h, v4.16b
+ movi v7.16b, #3
+ sqxtn v2.8b, v2.8h // f
+ sqxtn2 v2.16b, v3.8h
+ sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
+ sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
+ sshr v4.16b, v4.16b, #3 // f1
+ sshr v5.16b, v5.16b, #3 // f2
+ mov v2.16b, v23.16b // p0
+ mov v3.16b, v24.16b // q0
+ neg v6.16b, v4.16b // -f1
+ srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
+ // p0 + f2, q0 - f1
+ usqadd v2.16b, v5.16b // out p0
+ usqadd v3.16b, v6.16b // out q0
+ neg v6.16b, v4.16b // -((f1 + 1) >> 1)
+ bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
+ bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4)
+ mov v2.16b, v22.16b // p1
+ mov v3.16b, v25.16b // q1
+ // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
+ usqadd v2.16b, v4.16b // out p1
+ usqadd v3.16b, v6.16b // out q1
+ bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
+ bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 2f // skip if there's no flat8in
+ // 6-tap flat filter: v8/v9 hold a running 16-bit sum (low/high 8 lanes) that slides one tap per output
+ uaddl v0.8h, v21.8b, v21.8b // p2 * 2
+ uaddl2 v1.8h, v21.16b, v21.16b
+ uaddl v2.8h, v21.8b, v22.8b // p2 + p1
+ uaddl2 v3.8h, v21.16b, v22.16b
+ uaddl v4.8h, v22.8b, v23.8b // p1 + p0
+ uaddl2 v5.8h, v22.16b, v23.16b
+ uaddl v6.8h, v23.8b, v24.8b // p0 + q0
+ uaddl2 v7.8h, v23.16b, v24.16b
+ add v8.8h, v0.8h, v2.8h
+ add v9.8h, v1.8h, v3.8h
+ add v10.8h, v4.8h, v6.8h
+ add v11.8h, v5.8h, v7.8h
+ uaddl v12.8h, v24.8b, v25.8b // q0 + q1
+ uaddl2 v13.8h, v24.16b, v25.16b
+ add v8.8h, v8.8h, v10.8h
+ add v9.8h, v9.8h, v11.8h
+ sub v12.8h, v12.8h, v0.8h
+ sub v13.8h, v13.8h, v1.8h
+ uaddl v10.8h, v25.8b, v26.8b // q1 + q2
+ uaddl2 v11.8h, v25.16b, v26.16b
+ rshrn v0.8b, v8.8h, #3 // out p1
+ rshrn2 v0.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v12.8h
+ add v9.8h, v9.8h, v13.8h
+ sub v10.8h, v10.8h, v2.8h
+ sub v11.8h, v11.8h, v3.8h
+ uaddl v12.8h, v26.8b, v26.8b // q2 + q2
+ uaddl2 v13.8h, v26.16b, v26.16b
+ rshrn v1.8b, v8.8h, #3 // out p0
+ rshrn2 v1.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v10.8h
+ add v9.8h, v9.8h, v11.8h
+ sub v12.8h, v12.8h, v4.8h
+ sub v13.8h, v13.8h, v5.8h
+ rshrn v2.8b, v8.8h, #3 // out q0
+ rshrn2 v2.16b, v9.8h, #3
+
+ bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
+ add v8.8h, v8.8h, v12.8h
+ add v9.8h, v9.8h, v13.8h
+ bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
+ rshrn v3.8b, v8.8h, #3 // out q1
+ rshrn2 v3.16b, v9.8h, #3
+ bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+.if \wd == 8
+ b.eq 8f // skip if there's no flat8in
+.else
+ b.eq 2f // skip if there's no flat8in
+.endif
+ // 8-tap flat filter: v8/v9 hold a running 16-bit sum (low/high 8 lanes) that slides one tap per output
+ uaddl v0.8h, v20.8b, v21.8b // p3 + p2
+ uaddl2 v1.8h, v20.16b, v21.16b
+ uaddl v2.8h, v22.8b, v25.8b // p1 + q1
+ uaddl2 v3.8h, v22.16b, v25.16b
+ uaddl v4.8h, v20.8b, v22.8b // p3 + p1
+ uaddl2 v5.8h, v20.16b, v22.16b
+ uaddl v6.8h, v23.8b, v26.8b // p0 + q2
+ uaddl2 v7.8h, v23.16b, v26.16b
+ add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
+ add v9.8h, v1.8h, v1.8h
+ uaddw v8.8h, v8.8h, v23.8b // + p0
+ uaddw2 v9.8h, v9.8h, v23.16b
+ uaddw v8.8h, v8.8h, v24.8b // + q0
+ uaddw2 v9.8h, v9.8h, v24.16b
+ add v8.8h, v8.8h, v4.8h
+ add v9.8h, v9.8h, v5.8h // + p3 + p1
+ sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
+ sub v3.8h, v3.8h, v1.8h
+ sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
+ sub v7.8h, v7.8h, v5.8h
+ rshrn v10.8b, v8.8h, #3 // out p2
+ rshrn2 v10.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h, v9.8h, v3.8h
+ uaddl v0.8h, v20.8b, v23.8b // p3 + p0
+ uaddl2 v1.8h, v20.16b, v23.16b
+ uaddl v2.8h, v24.8b, v27.8b // q0 + q3
+ uaddl2 v3.8h, v24.16b, v27.16b
+ rshrn v11.8b, v8.8h, #3 // out p1
+ rshrn2 v11.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v6.8h
+ add v9.8h, v9.8h, v7.8h
+ sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
+ sub v3.8h, v3.8h, v1.8h
+ uaddl v4.8h, v21.8b, v24.8b // p2 + q0
+ uaddl2 v5.8h, v21.16b, v24.16b
+ uaddl v6.8h, v25.8b, v27.8b // q1 + q3
+ uaddl2 v7.8h, v25.16b, v27.16b
+ rshrn v12.8b, v8.8h, #3 // out p0
+ rshrn2 v12.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h, v9.8h, v3.8h
+ sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
+ sub v7.8h, v7.8h, v5.8h
+ uaddl v0.8h, v22.8b, v25.8b // p1 + q1
+ uaddl2 v1.8h, v22.16b, v25.16b
+ uaddl v2.8h, v26.8b, v27.8b // q2 + q3
+ uaddl2 v3.8h, v26.16b, v27.16b
+ rshrn v13.8b, v8.8h, #3 // out q0
+ rshrn2 v13.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v6.8h
+ add v9.8h, v9.8h, v7.8h
+ sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
+ sub v3.8h, v3.8h, v1.8h
+ rshrn v0.8b, v8.8h, #3 // out q1
+ rshrn2 v0.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h , v9.8h, v3.8h
+
+ bit v21.16b, v10.16b, v14.16b // p2..q2 are replaced only in the lanes selected by v14 (flat8in)
+ bit v22.16b, v11.16b, v14.16b
+ bit v23.16b, v12.16b, v14.16b
+ rshrn v1.8b, v8.8h, #3 // out q2
+ rshrn2 v1.16b, v9.8h, #3
+ bit v24.16b, v13.16b, v14.16b
+ bit v25.16b, v0.16b, v14.16b
+ bit v26.16b, v1.16b, v14.16b
+.endif
+2:
+.if \wd == 16
+ mov x16, v15.d[0]
+ mov x17, v15.d[1]
+ adds x16, x16, x17
+ b.ne 1f // check if flat8out is needed
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+ // 16-tap flat filter: v12/v13 hold a running 16-bit sum (low/high 8 lanes); results land in v0-v11 = p5..q5
+ uaddl v2.8h, v17.8b, v17.8b // p6 + p6
+ uaddl2 v3.8h, v17.16b, v17.16b
+ uaddl v4.8h, v17.8b, v18.8b // p6 + p5
+ uaddl2 v5.8h, v17.16b, v18.16b
+ uaddl v6.8h, v17.8b, v19.8b // p6 + p4
+ uaddl2 v7.8h, v17.16b, v19.16b
+ uaddl v8.8h, v17.8b, v20.8b // p6 + p3
+ uaddl2 v9.8h, v17.16b, v20.16b
+ add v12.8h, v2.8h, v4.8h
+ add v13.8h, v3.8h, v5.8h
+ add v10.8h, v6.8h, v8.8h
+ add v11.8h, v7.8h, v9.8h
+ uaddl v6.8h, v17.8b, v21.8b // p6 + p2
+ uaddl2 v7.8h, v17.16b, v21.16b
+ add v12.8h, v12.8h, v10.8h
+ add v13.8h, v13.8h, v11.8h
+ uaddl v8.8h, v17.8b, v22.8b // p6 + p1
+ uaddl2 v9.8h, v17.16b, v22.16b
+ uaddl v10.8h, v18.8b, v23.8b // p5 + p0
+ uaddl2 v11.8h, v18.16b, v23.16b
+ add v6.8h, v6.8h, v8.8h
+ add v7.8h, v7.8h, v9.8h
+ uaddl v8.8h, v19.8b, v24.8b // p4 + q0
+ uaddl2 v9.8h, v19.16b, v24.16b
+ add v12.8h, v12.8h, v6.8h
+ add v13.8h, v13.8h, v7.8h
+ add v10.8h, v10.8h, v8.8h
+ add v11.8h, v11.8h, v9.8h
+ uaddl v6.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v7.8h, v20.16b, v25.16b
+ add v12.8h, v12.8h, v10.8h // sum = 7*p6 + 2*p5 + 2*p4 + p3 + p2 + p1 + p0 + q0
+ add v13.8h, v13.8h, v11.8h
+ sub v6.8h, v6.8h, v2.8h
+ sub v7.8h, v7.8h, v3.8h
+ uaddl v2.8h, v21.8b, v26.8b // p2 + q2
+ uaddl2 v3.8h, v21.16b, v26.16b
+ rshrn v0.8b, v12.8h, #4 // out p5
+ rshrn2 v0.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
+ add v13.8h, v13.8h, v7.8h
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v5.8h
+ uaddl v4.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v5.8h, v22.16b, v27.16b
+ uaddl v6.8h, v17.8b, v19.8b // p6 + p4
+ uaddl2 v7.8h, v17.16b, v19.16b
+ rshrn v1.8b, v12.8h, #4 // out p4
+ rshrn2 v1.16b, v13.8h, #4
+ add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
+ add v13.8h, v13.8h, v3.8h
+ sub v4.8h, v4.8h, v6.8h
+ sub v5.8h, v5.8h, v7.8h
+ uaddl v6.8h, v23.8b, v28.8b // p0 + q4
+ uaddl2 v7.8h, v23.16b, v28.16b
+ uaddl v8.8h, v17.8b, v20.8b // p6 + p3
+ uaddl2 v9.8h, v17.16b, v20.16b
+ rshrn v2.8b, v12.8h, #4 // out p3
+ rshrn2 v2.16b, v13.8h, #4
+ add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
+ add v13.8h, v13.8h, v5.8h
+ sub v6.8h, v6.8h, v8.8h
+ sub v7.8h, v7.8h, v9.8h
+ uaddl v8.8h, v24.8b, v29.8b // q0 + q5
+ uaddl2 v9.8h, v24.16b, v29.16b
+ uaddl v4.8h, v17.8b, v21.8b // p6 + p2
+ uaddl2 v5.8h, v17.16b, v21.16b
+ rshrn v3.8b, v12.8h, #4 // out p2
+ rshrn2 v3.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
+ add v13.8h, v13.8h, v7.8h
+ sub v8.8h, v8.8h, v4.8h
+ sub v9.8h, v9.8h, v5.8h
+ uaddl v6.8h, v25.8b, v30.8b // q1 + q6
+ uaddl2 v7.8h, v25.16b, v30.16b
+ uaddl v10.8h, v17.8b, v22.8b // p6 + p1
+ uaddl2 v11.8h, v17.16b, v22.16b
+ rshrn v4.8b, v12.8h, #4 // out p1
+ rshrn2 v4.16b, v13.8h, #4
+ add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
+ add v13.8h, v13.8h, v9.8h
+ sub v6.8h, v6.8h, v10.8h
+ sub v7.8h, v7.8h, v11.8h
+ uaddl v8.8h, v26.8b, v30.8b // q2 + q6
+ uaddl2 v9.8h, v26.16b, v30.16b
+ bif v0.16b, v18.16b, v15.16b // out p5
+ uaddl v10.8h, v18.8b, v23.8b // p5 + p0
+ uaddl2 v11.8h, v18.16b, v23.16b
+ rshrn v5.8b, v12.8h, #4 // out p0
+ rshrn2 v5.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
+ add v13.8h, v13.8h, v7.8h
+ sub v8.8h, v8.8h, v10.8h
+ sub v9.8h, v9.8h, v11.8h
+ uaddl v10.8h, v27.8b, v30.8b // q3 + q6
+ uaddl2 v11.8h, v27.16b, v30.16b
+ bif v1.16b, v19.16b, v15.16b // out p4
+ uaddl v18.8h, v19.8b, v24.8b // p4 + q0 (p5/p4 are no longer needed, so v18/v19 become temporaries)
+ uaddl2 v19.8h, v19.16b, v24.16b
+ rshrn v6.8b, v12.8h, #4 // out q0
+ rshrn2 v6.16b, v13.8h, #4
+ add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
+ add v13.8h, v13.8h, v9.8h
+ sub v10.8h, v10.8h, v18.8h
+ sub v11.8h, v11.8h, v19.8h
+ uaddl v8.8h, v28.8b, v30.8b // q4 + q6
+ uaddl2 v9.8h, v28.16b, v30.16b
+ bif v2.16b, v20.16b, v15.16b // out p3
+ uaddl v18.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v19.8h, v20.16b, v25.16b
+ rshrn v7.8b, v12.8h, #4 // out q1
+ rshrn2 v7.16b, v13.8h, #4
+ add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
+ add v13.8h, v13.8h, v11.8h
+ sub v18.8h, v8.8h, v18.8h
+ sub v19.8h, v9.8h, v19.8h
+ uaddl v10.8h, v29.8b, v30.8b // q5 + q6
+ uaddl2 v11.8h, v29.16b, v30.16b
+ bif v3.16b, v21.16b, v15.16b // out p2
+ uaddl v20.8h, v21.8b, v26.8b // p2 + q2 (p3/p2 are no longer needed, so v20/v21 become temporaries)
+ uaddl2 v21.8h, v21.16b, v26.16b
+ rshrn v8.8b, v12.8h, #4 // out q2
+ rshrn2 v8.16b, v13.8h, #4
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ add v13.8h, v13.8h, v19.8h
+ sub v10.8h, v10.8h, v20.8h
+ sub v11.8h, v11.8h, v21.8h
+ uaddl v18.8h, v30.8b, v30.8b // q6 + q6
+ uaddl2 v19.8h, v30.16b, v30.16b
+ bif v4.16b, v22.16b, v15.16b // out p1
+ uaddl v20.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v21.8h, v22.16b, v27.16b
+ rshrn v9.8b, v12.8h, #4 // out q3
+ rshrn2 v9.16b, v13.8h, #4
+ add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
+ add v13.8h, v13.8h, v11.8h
+ sub v18.8h, v18.8h, v20.8h
+ sub v19.8h, v19.8h, v21.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
+ rshrn v10.8b, v12.8h, #4 // out q4
+ rshrn2 v10.16b, v13.8h, #4
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ add v13.8h, v13.8h, v19.8h
+ rshrn v11.8b, v12.8h, #4 // out q5
+ rshrn2 v11.16b, v13.8h, #4
+ bif v6.16b, v24.16b, v15.16b // out q0
+ bif v7.16b, v25.16b, v15.16b // out q1
+ bif v8.16b, v26.16b, v15.16b // out q2
+ bif v9.16b, v27.16b, v15.16b // out q3
+ bif v10.16b, v28.16b, v15.16b // out q4
+ bif v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+ mov x14, #0 // widest path taken: the caller stores all pixels
+ ret
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ mov x14, #(1 << 6)
+ ret
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ mov x14, #(1 << 4)
+ ret
+.endif
+endfunc
+.endm
+
+loop_filter 16 // defines lpf_16_wd16_neon
+loop_filter 8 // defines lpf_16_wd8_neon
+loop_filter 6 // defines lpf_16_wd6_neon
+loop_filter 4 // defines lpf_16_wd4_neon
+
+.macro lpf_16_wd16 // run the wd16 filter, then dispatch on x14 to the store paths of the enclosing function
+ bl lpf_16_wd16_neon
+ cbz x14, 1f // x14 == 0: continue with the full store that follows the macro
+ tbnz x14, #6, 7f // 1 << 6: enclosing function's 7: label (inner 6 pixels)
+ tbnz x14, #4, 8f // 1 << 4: enclosing function's 8: label (inner 4 pixels)
+ ret x15 // otherwise nothing to store: return from the enclosing function, whose x30 was saved in x15
+1:
+.endm
+
+.macro lpf_16_wd8 // run the wd8 filter, then dispatch on x14 to the store paths of the enclosing function
+ bl lpf_16_wd8_neon
+ cbz x14, 1f // x14 == 0: continue with the full store that follows the macro
+ tbnz x14, #4, 8f // 1 << 4: enclosing function's 8: label (inner 4 pixels)
+ ret x15 // otherwise nothing to store: return from the enclosing function, whose x30 was saved in x15
+1:
+.endm
+
+.macro lpf_16_wd6 // run the wd6 filter; fall through to the store only if something was filtered
+ bl lpf_16_wd6_neon
+ cbz x14, 1f // x14 == 0: continue with the store that follows the macro
+ ret x15 // nothing to store: return from the enclosing function, whose x30 was saved in x15
+1:
+.endm
+
+.macro lpf_16_wd4 // run the wd4 filter; fall through to the store only if something was filtered
+ bl lpf_16_wd4_neon
+ cbz x14, 1f // x14 == 0: continue with the store that follows the macro
+ ret x15 // nothing to store: return from the enclosing function, whose x30 was saved in x15
+1:
+.endm
+
+function lpf_v_4_16_neon // wd4 filter with p/q in successive rows, 16 pixels wide; x0 = q0 row, x1 = stride; x0 is unchanged on return
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd4
+ sub x16, x0, x1, lsl #1 // x16 = two rows before q0 (p1)
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+
+ lpf_16_wd4
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_4_16_neon // wd4 filter with p/q in successive columns, 16 rows tall; x0 = q0 column, x1 = stride; returns x0 advanced by 16 rows
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd4
+ sub x16, x0, #2 // x16 = rows 0-7, starting 2 pixels left of q0
+ add x0, x16, x1, lsl #3 // x0 = rows 8-15
+ ld1 {v22.s}[0], [x16], x1 // 4 bytes per row; the lane order is what transpose_4x16b (defined elsewhere) expects
+ ld1 {v22.s}[2], [x0], x1
+ ld1 {v23.s}[0], [x16], x1
+ ld1 {v23.s}[2], [x0], x1
+ ld1 {v24.s}[0], [x16], x1
+ ld1 {v24.s}[2], [x0], x1
+ ld1 {v25.s}[0], [x16], x1
+ ld1 {v25.s}[2], [x0], x1
+ ld1 {v22.s}[1], [x16], x1
+ ld1 {v22.s}[3], [x0], x1
+ ld1 {v23.s}[1], [x16], x1
+ ld1 {v23.s}[3], [x0], x1
+ ld1 {v24.s}[1], [x16], x1
+ ld1 {v24.s}[3], [x0], x1
+ ld1 {v25.s}[1], [x16], x1
+ ld1 {v25.s}[3], [x0], x1
+ add x0, x0, #2 // x0 = original x0 + 16 rows; this is the value an early return hands back
+
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 // the filter then uses v22-v25 as p1, p0, q0, q1; v26-v29 are passed as extra registers (presumably temporaries)
+
+ lpf_16_wd4
+
+ sub x16, x0, x1, lsl #4 // rewind 16 rows
+ sub x16, x16, #2 // and 2 pixels left, back to p1 of row 0
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2 // x0 = original x0 + 16 rows
+ ret x15
+endfunc
+
+function lpf_v_6_16_neon // wd6 filter with p/q in successive rows, 16 pixels wide; x0 = q0 row, x1 = stride; x0 is unchanged on return
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd6
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1 // x16 = three rows before q0 (p2)
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1 // back to the q0 row
+
+ lpf_16_wd6
+
+ sub x16, x0, x1, lsl #1 // only p1..q1 are written back; the wd6 filter reads p2/q2 but never changes them
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_6_16_neon // wd6 filter with p/q in successive columns, 16 rows tall; x0 = q0 column, x1 = stride; returns x0 advanced by 16 rows
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd6
+ sub x16, x0, #4 // x16 = rows 0-7, starting 4 pixels left of q0; 8 bytes per row are loaded although wd6 only uses p2..q2
+ add x0, x16, x1, lsl #3 // x0 = rows 8-15
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.d}[0], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.d}[0], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original x0 + 16 rows; this is the value an early return hands back
+
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 // the filter then uses v21-v26 as p2..q2; v28/v29 are passed as extra registers (presumably temporaries)
+
+ lpf_16_wd6
+
+ sub x16, x0, x1, lsl #4 // rewind 16 rows
+ sub x16, x16, #2 // only p1..q1 are written back, so start 2 pixels left of q0
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2 // x0 = original x0 + 16 rows
+ ret x15
+endfunc
+
+function lpf_v_8_16_neon // wd8 filter with p/q in successive rows, 16 pixels wide; x0 = q0 row, x1 = stride; x0 is unchanged on return
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd8
+ sub x16, x0, x1, lsl #2 // x16 = four rows before q0 (p3)
+ ld1 {v20.16b}, [x16], x1 // p3
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v27.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2 // back to the q0 row
+
+ lpf_16_wd8
+
+ sub x16, x0, x1, lsl #1 // full path: write p2..q2 (p3/q3 are read but never changed)
+ sub x16, x16, x1
+ st1 {v21.16b}, [x16], x1 // p2
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v25.16b}, [x0], x1 // q1
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1 // back to the q0 row
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1 // short path (x14 = 1 << 4): only p1..q1 changed
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_8_16_neon // wd8 filter with p/q in successive columns, 16 rows tall; x0 = q0 column, x1 = stride; returns x0 advanced by 16 rows
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd8
+ sub x16, x0, #4 // x16 = rows 0-7, starting 4 pixels left of q0 (p3)
+ add x0, x16, x1, lsl #3 // x0 = rows 8-15
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.d}[0], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.d}[0], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original x0 + 16 rows; this is the value an early return hands back
+
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 // the filter then uses v20-v27 as p3..q3; v28/v29 are passed as extra registers (presumably temporaries)
+
+ lpf_16_wd8
+
+ sub x16, x0, x1, lsl #4 // full path: rewind 16 rows
+ sub x16, x16, #4 // and 4 pixels left; all 8 columns are written, p3/q3 with their original values
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v20.d}[0], [x16], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.d}[0], [x16], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.d}[0], [x16], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.d}[0], [x16], x1
+ st1 {v27.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original x0 + 16 rows
+ ret x15
+8:
+ sub x16, x0, x1, lsl #4 // short path (x14 = 1 << 4): only p1..q1 changed
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2 // x0 = original x0 + 16 rows
+ ret x15
+endfunc
+
+function lpf_v_16_16_neon // wd16 filter with p/q in successive rows, 16 pixels wide; x0 = q0 row, x1 = stride; x0 is unchanged on return
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd16
+
+ sub x16, x0, x1, lsl #3
+ add x16, x16, x1 // x16 = seven rows before q0 (p6)
+ ld1 {v17.16b}, [x16], x1 // p6
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v18.16b}, [x16], x1 // p5
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v19.16b}, [x16], x1 // p4
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v20.16b}, [x16], x1 // p3
+ ld1 {v27.16b}, [x0], x1 // q3
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v28.16b}, [x0], x1 // q4
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v29.16b}, [x0], x1 // q5
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v30.16b}, [x0], x1 // q6
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1 // back to the q0 row
+
+ lpf_16_wd16
+
+ sub x16, x0, x1, lsl #2 // full path: the filter left p5..q5 in v0-v11; p6/q6 are not rewritten
+ sub x16, x16, x1, lsl #1 // x16 = six rows before q0 (p5)
+ st1 {v0.16b}, [x16], x1 // p5
+ st1 {v6.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x16], x1 // p4
+ st1 {v7.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x16], x1 // p3
+ st1 {v8.16b}, [x0], x1 // q2
+ st1 {v3.16b}, [x16], x1 // p2
+ st1 {v9.16b}, [x0], x1 // q3
+ st1 {v4.16b}, [x16], x1 // p1
+ st1 {v10.16b}, [x0], x1 // q4
+ st1 {v5.16b}, [x16], x1 // p0
+ st1 {v11.16b}, [x0], x1 // q5
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+7:
+ sub x16, x0, x1 // x14 = 1 << 6: p2..q2 were changed in place in v21-v26
+ sub x16, x16, x1, lsl #1
+ st1 {v21.16b}, [x16], x1 // p2
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v25.16b}, [x0], x1 // q1
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1 // back to the q0 row
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1 // x14 = 1 << 4: only p1..q1 changed
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_16_16_neon // wd16 filter with p/q in successive columns, 16 rows tall; x0 = q0 column, x1 = stride; returns x0 advanced by 16 rows
+ mov x15, x30 // return address; also used by the early "ret x15" inside lpf_16_wd16
+ sub x16, x0, #8 // x16 reads the 8 pixels left of the edge, x0 the 8 pixels from q0 on; both walk all 16 rows
+ ld1 {v16.d}[0], [x16], x1
+ ld1 {v24.d}[0], [x0], x1
+ ld1 {v17.d}[0], [x16], x1
+ ld1 {v25.d}[0], [x0], x1
+ ld1 {v18.d}[0], [x16], x1
+ ld1 {v26.d}[0], [x0], x1
+ ld1 {v19.d}[0], [x16], x1
+ ld1 {v27.d}[0], [x0], x1
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v28.d}[0], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v29.d}[0], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v30.d}[0], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v31.d}[0], [x0], x1
+ ld1 {v16.d}[1], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v17.d}[1], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v18.d}[1], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v19.d}[1], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ ld1 {v20.d}[1], [x16], x1
+ ld1 {v28.d}[1], [x0], x1
+ ld1 {v21.d}[1], [x16], x1
+ ld1 {v29.d}[1], [x0], x1
+ ld1 {v22.d}[1], [x16], x1
+ ld1 {v30.d}[1], [x0], x1
+ ld1 {v23.d}[1], [x16], x1
+ ld1 {v31.d}[1], [x0], x1
+ // x0 is now the original x0 + 16 rows; an early return from lpf_16_wd16 hands that value back
+ transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 // the filter then uses v17-v23 as p6..p0 (v16 holds the column left of p6)
+ transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 // and v24-v30 as q0..q6 (v31 holds the column right of q6)
+
+ lpf_16_wd16
+
+ sub x0, x0, x1, lsl #4 // full path: rewind the 16 rows advanced by the loads
+ sub x16, x0, #8
+
+ transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 // v16/v17 unchanged, v0-v5 = filtered p5..p0; v18/v19 are free to use as extra registers here
+ transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 // v6-v11 = filtered q0..q5, v30/v31 unchanged
+
+ st1 {v16.d}[0], [x16], x1
+ st1 {v6.d}[0], [x0], x1
+ st1 {v17.d}[0], [x16], x1
+ st1 {v7.d}[0], [x0], x1
+ st1 {v0.d}[0], [x16], x1
+ st1 {v8.d}[0], [x0], x1
+ st1 {v1.d}[0], [x16], x1
+ st1 {v9.d}[0], [x0], x1
+ st1 {v2.d}[0], [x16], x1
+ st1 {v10.d}[0], [x0], x1
+ st1 {v3.d}[0], [x16], x1
+ st1 {v11.d}[0], [x0], x1
+ st1 {v4.d}[0], [x16], x1
+ st1 {v30.d}[0], [x0], x1
+ st1 {v5.d}[0], [x16], x1
+ st1 {v31.d}[0], [x0], x1
+ st1 {v16.d}[1], [x16], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v17.d}[1], [x16], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x16], x1
+ st1 {v8.d}[1], [x0], x1
+ st1 {v1.d}[1], [x16], x1
+ st1 {v9.d}[1], [x0], x1
+ st1 {v2.d}[1], [x16], x1
+ st1 {v10.d}[1], [x0], x1
+ st1 {v3.d}[1], [x16], x1
+ st1 {v11.d}[1], [x0], x1
+ st1 {v4.d}[1], [x16], x1
+ st1 {v30.d}[1], [x0], x1
+ st1 {v5.d}[1], [x16], x1
+ st1 {v31.d}[1], [x0], x1
+ ret x15
+
+7:
+ sub x16, x0, x1, lsl #4 // x14 = 1 << 6: p2..q2 changed in v21-v26; 8 columns are written, p3/q3 with their original values
+ sub x16, x16, #4
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v20.d}[0], [x16], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.d}[0], [x16], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.d}[0], [x16], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.d}[0], [x16], x1
+ st1 {v27.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original x0 + 16 rows
+ ret x15
+8:
+ sub x16, x0, x1, lsl #4 // x14 = 1 << 4: only p1..q1 changed
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2 // x0 = original x0 + 16 rows
+ ret x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w)
+
+.macro lpf_func dir, type // dir = v or h, type = y or uv; emits the exported lpf_<dir>_sb_<type>_8bpc_neon entry point
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 // walks the edge 16 pixels (four 4-pixel units) per iteration and picks the wd16/wd8/wd6/wd4 helper from the vmask bits
+ mov x11, x30 // return address kept in x11; the lpf_*_16_neon helpers use x15 for their own
+ stp d8, d9, [sp, #-0x40]! // save d8-d15: the filter code overwrites v8-v15
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ ldp w6, w7, [x2] // vmask[0], vmask[1] (x6 is overwritten before any read, so whatever it held on entry is ignored)
+.ifc \type, y
+ ldr w2, [x2, #8] // vmask[2]
+.endif
+ add x5, x5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr w7, w7, w2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub x4, x3, x4, lsl #2 // x4 = l - b4_stride entries (4 bytes each): the fallback levels read when l[0][0] is zero
+.else
+ sub x3, x3, #4 // step back one 4-byte l[] entry so ld2 below fetches the previous and the current entry together
+ lsl x4, x4, #2 // b4_stride in bytes
+.endif
+ orr w6, w6, w7 // vmask[0] |= vmask[1]
+
+1:
+ tst w6, #0x0f // any of the 4 units active? consumed by the b.eq below; the loads in between leave the flags intact
+.ifc \dir, v
+ ld1 {v0.16b}, [x4], #16 // four fallback entries
+ ld1 {v1.16b}, [x3], #16 // four current entries
+.else
+ ld2 {v0.s,v1.s}[0], [x3], x4 // per unit: v0 lane = previous entry (fallback), v1 lane = current entry
+ ld2 {v0.s,v1.s}[1], [x3], x4
+ ld2 {v0.s,v1.s}[2], [x3], x4
+ ld2 {v0.s,v1.s}[3], [x3], x4
+.endif
+ b.eq 7f // if (!(vm & bits)) continue;
+
+ ld1r {v5.16b}, [x5] // sharp[0]
+ add x5, x5, #8
+ movi v2.4s, #0xff
+ dup v13.4s, w6 // vmask[0]
+
+ and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word
+ and v1.16b, v1.16b, v2.16b
+ cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0]
+ movi v4.16b, #1
+ ld1r {v6.16b}, [x5] // sharp[1]
+ sub x5, x5, #8
+ bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.4s, v1.4s, v2.4s // L != 0
+ mul v1.4s, v1.4s, v4.4s // L (times 0x01010101: the low byte is copied into all 4 bytes of each word)
+.ifc \type, y
+ dup v15.4s, w2 // vmask[2]
+.endif
+ dup v14.4s, w7 // vmask[1]
+ mov x16, v2.d[0]
+ mov x17, v2.d[1]
+ adds x16, x16, x17
+ b.eq 7f // if (!L) continue;
+ neg v5.16b, v5.16b // -sharp[0]
+ movrel x16, word_1248
+ ushr v12.16b, v1.16b, #4 // H
+ ld1 {v16.4s}, [x16] // {1, 2, 4, 8}: one bit per 4-pixel unit
+ sshl v3.16b, v1.16b, v5.16b // L >> sharp[0] (sshl by a negative count shifts right)
+.ifc \type, y
+ cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits)
+.endif
+ movi v7.16b, #2
+ umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1])
+ add v0.16b, v1.16b, v7.16b // L + 2
+ umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I
+ add v0.16b, v0.16b, v0.16b // 2*(L + 2)
+ cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits)
+ add v10.16b, v0.16b, v11.16b // 2*(L + 2) + limit = E
+ cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits)
+ and v13.16b, v13.16b, v2.16b // vmask[0] &= L != 0
+
+.ifc \type, y
+ tst w2, #0x0f // any unit in this group asking for wd16?
+ b.eq 2f
+ // wd16
+ bl lpf_\dir\()_16_16_neon
+ b 8f
+2:
+.endif
+ tst w7, #0x0f // any unit in this group asking for more than wd4?
+ b.eq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_16_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_16_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_16_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment x0.
+ // If the whole function is skipped, increment it here instead.
+ add x0, x0, x1, lsl #4
+.else
+7:
+.endif
+8:
+ lsr w6, w6, #4 // vmask[0] >>= 4
+ lsr w7, w7, #4 // vmask[1] >>= 4
+.ifc \type, y
+ lsr w2, w2, #4 // vmask[2] >>= 4
+.endif
+.ifc \dir, v
+ add x0, x0, #16 // next 16 pixels along the edge
+.else
+ // For dir h, x0 is returned incremented
+.endif
+ cbnz w6, 1b // loop until no mask bits remain (w6 holds the OR of all vmask words)
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x11
+endfunc
+.endm
+
+lpf_func v, y // lpf_v_sb_y_8bpc_neon (wd16/wd8/wd4 helpers)
+lpf_func h, y // lpf_h_sb_y_8bpc_neon
+lpf_func v, uv // lpf_v_sb_uv_8bpc_neon (wd6/wd4 helpers)
+lpf_func h, uv // lpf_h_sb_uv_8bpc_neon
+
+const word_1248
+ .word 1, 2, 4, 8 // one bit per 4-pixel unit; cmtst against a broadcast vmask word turns it into four 32-bit lane masks
+endconst
diff --git a/third_party/dav1d/src/arm/64/loopfilter16.S b/third_party/dav1d/src/arm/64/loopfilter16.S
new file mode 100644
index 0000000000..d181a3e623
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/loopfilter16.S
@@ -0,0 +1,925 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// Filter core on 8 lanes of 16 bit pixels; depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
+.macro loop_filter wd // wd = filter width: 4, 6, 8 or 16
+function lpf_8_wd\wd\()_neon // in: v17-v30 = p6..q6 (only the taps needed for wd are read), v10 = E, v11 = I, v12 = H, v13-v15 = per-lane masks, w8 = bitdepth_max, w9 = bitdepth_min_8
+ uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
+ uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
+.if \wd >= 6
+ uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
+ uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
+ uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ umax v4.8h, v4.8h, v5.8h // max(abs(p2 - p1), abs(q2 - q1))
+.endif
+ uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
+.if \wd >= 8
+ umax v6.8h, v6.8h, v7.8h // max(abs(p3 - p2), abs(q3 - q2))
+.endif
+ ushr v3.8h, v3.8h, #1 // abs(p1 - q1) >> 1
+.if \wd >= 8
+ umax v4.8h, v4.8h, v6.8h // max over all the outer tap differences
+.endif
+.if \wd >= 6
+ and v4.16b, v4.16b, v14.16b // only count the outer taps in lanes where wd > 4
+.endif
+ umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
+ uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ umax v4.8h, v0.8h, v4.8h
+ cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+ cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ and v1.16b, v1.16b, v2.16b // fm
+ and v1.16b, v1.16b, v13.16b // fm && wd >= 4
+.if \wd >= 6
+ and v14.16b, v14.16b, v1.16b // fm && wd > 4
+.endif
+.if \wd >= 16
+ and v15.16b, v15.16b, v1.16b // fm && wd == 16
+.endif
+
+ mov x16, v1.d[0] // reduce the lane mask to a scalar: nonzero if any lane is set
+ mov x17, v1.d[1]
+ adds x16, x16, x17
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0) // no lane is filtered: nothing to store
+ ret
+9:
+.if \wd >= 6
+ movi v10.8h, #1 // E is no longer needed, v10 becomes the flatness threshold F
+ uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
+ uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
+ dup v9.8h, w9 // bitdepth_min_8
+.if \wd >= 8
+ uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
+ uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
+.endif
+ umax v2.8h, v2.8h, v3.8h // max(abs(p2 - p0), abs(p1 - p0))
+ umax v4.8h, v4.8h, v5.8h // max(abs(q1 - q0), abs(q2 - q0))
+.if \wd >= 8
+ umax v6.8h, v6.8h, v7.8h // max(abs(p3 - p0), abs(q3 - q0))
+.endif
+ umax v2.8h, v2.8h, v4.8h
+ ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ umax v2.8h, v2.8h, v6.8h
+.endif
+
+.if \wd == 16
+ uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
+ uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
+ uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
+.endif
+ cmhs v2.8h, v10.8h, v2.8h // flat8in
+.if \wd == 16
+ uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
+ uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
+ uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
+.endif
+ and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
+ bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ umax v3.8h, v3.8h, v4.8h
+ umax v5.8h, v5.8h, v6.8h
+.endif
+ mov x16, v1.d[0] // does any lane still need the narrow (wd == 4 style) filter?
+ mov x17, v1.d[1]
+.if \wd == 16
+ umax v7.8h, v7.8h, v8.8h
+ umax v3.8h, v3.8h, v5.8h
+ umax v3.8h, v3.8h, v7.8h
+ cmhs v3.8h, v10.8h, v3.8h // flat8out
+.endif
+ adds x16, x16, x17 // sets the flags for the b.eq below; the vector ops in between leave them alone
+.if \wd == 16
+ and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
+ and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+ bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ b.eq 1f // skip wd == 4 case
+.endif
+
+ dup v3.8h, w8 // bitdepth_max
+ sub v2.8h, v22.8h, v25.8h // p1 - q1
+ ushr v3.8h, v3.8h, #1 // (128 << bitdepth_min_8) - 1
+ cmhi v0.8h, v0.8h, v12.8h // hev
+ not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
+ smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
+ smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
+ and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
+ sub v2.8h, v24.8h, v23.8h // q0 - p0
+ movi v5.8h, #3
+ bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
+ mul v2.8h, v2.8h, v5.8h // 3 * (q0 - p0)
+ movi v6.8h, #4
+ add v2.8h, v2.8h, v4.8h // 3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0)
+ smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
+ smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
+ sqadd v4.8h, v6.8h, v2.8h // f + 4
+ sqadd v5.8h, v5.8h, v2.8h // f + 3
+ smin v4.8h, v4.8h, v3.8h // imin(f + 4, (128 << bitdepth_min_8) - 1)
+ smin v5.8h, v5.8h, v3.8h // imin(f + 3, (128 << bitdepth_min_8) - 1)
+ sshr v4.8h, v4.8h, #3 // f1
+ sshr v5.8h, v5.8h, #3 // f2
+ movi v9.8h, #0 // lower bound for iclip_pixel()
+ dup v3.8h, w8 // bitdepth_max
+ sqadd v2.8h, v23.8h, v5.8h // p0 + f2
+ sqsub v6.8h, v24.8h, v4.8h // q0 - f1
+ srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
+ smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
+ smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
+ smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
+ smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
+ bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
+ bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
+ sqadd v2.8h, v22.8h, v4.8h // p1 + f
+ sqsub v6.8h, v25.8h, v4.8h // q1 - f
+ smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
+ smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
+ smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
+ smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
+ bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
+ bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 2f // skip if there's no flat8in
+
+ add v0.8h, v21.8h, v21.8h // p2 * 2
+ add v2.8h, v21.8h, v22.8h // p2 + p1
+ add v4.8h, v22.8h, v23.8h // p1 + p0
+ add v6.8h, v23.8h, v24.8h // p0 + q0
+ add v8.8h, v0.8h, v2.8h // 3 * p2 + p1
+ add v10.8h, v4.8h, v6.8h // p1 + 2 * p0 + q0
+ add v12.8h, v24.8h, v25.8h // q0 + q1
+ add v8.8h, v8.8h, v10.8h // 3 * p2 + 2 * p1 + 2 * p0 + q0
+ sub v12.8h, v12.8h, v0.8h // q0 + q1 - 2 * p2
+ add v10.8h, v25.8h, v26.8h // q1 + q2
+ urshr v0.8h, v8.8h, #3 // out p1
+
+ add v8.8h, v8.8h, v12.8h
+ sub v10.8h, v10.8h, v2.8h // q1 + q2 - p2 - p1
+ add v12.8h, v26.8h, v26.8h // q2 + q2
+ urshr v1.8h, v8.8h, #3 // out p0
+
+ add v8.8h, v8.8h, v10.8h
+ sub v12.8h, v12.8h, v4.8h // 2 * q2 - p1 - p0
+ urshr v2.8h, v8.8h, #3 // out q0
+
+ bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
+ add v8.8h, v8.8h, v12.8h
+ bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
+ urshr v3.8h, v8.8h, #3 // out q1
+ bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+.if \wd == 8
+ b.eq 8f // skip if there's no flat8in
+.else
+ b.eq 2f // skip if there's no flat8in
+.endif
+
+ add v0.8h, v20.8h, v21.8h // p3 + p2
+ add v2.8h, v22.8h, v25.8h // p1 + q1
+ add v4.8h, v20.8h, v22.8h // p3 + p1
+ add v6.8h, v23.8h, v26.8h // p0 + q2
+ add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
+ add v9.8h, v23.8h, v24.8h // p0 + q0
+ add v8.8h, v8.8h, v4.8h // + p3 + p1
+ sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
+ add v8.8h, v8.8h, v9.8h // + p0 + q0
+ sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
+ urshr v10.8h, v8.8h, #3 // out p2
+
+ add v8.8h, v8.8h, v2.8h
+ add v0.8h, v20.8h, v23.8h // p3 + p0
+ add v2.8h, v24.8h, v27.8h // q0 + q3
+ urshr v11.8h, v8.8h, #3 // out p1
+
+ add v8.8h, v8.8h, v6.8h
+ sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
+ add v4.8h, v21.8h, v24.8h // p2 + q0
+ add v6.8h, v25.8h, v27.8h // q1 + q3
+ urshr v12.8h, v8.8h, #3 // out p0
+
+ add v8.8h, v8.8h, v2.8h
+ sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
+ add v0.8h, v22.8h, v25.8h // p1 + q1
+ add v2.8h, v26.8h, v27.8h // q2 + q3
+ urshr v13.8h, v8.8h, #3 // out q0
+
+ add v8.8h, v8.8h, v6.8h
+ sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
+ urshr v0.8h, v8.8h, #3 // out q1
+
+ add v8.8h, v8.8h, v2.8h
+
+ bit v21.16b, v10.16b, v14.16b // p2 if (flat8in)
+ bit v22.16b, v11.16b, v14.16b // p1 if (flat8in)
+ bit v23.16b, v12.16b, v14.16b // p0 if (flat8in)
+ urshr v1.8h, v8.8h, #3 // out q2
+ bit v24.16b, v13.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v0.16b, v14.16b // q1 if (flat8in)
+ bit v26.16b, v1.16b, v14.16b // q2 if (flat8in)
+.endif
+2:
+.if \wd == 16
+ mov x16, v15.d[0]
+ mov x17, v15.d[1]
+ adds x16, x16, x17
+ b.ne 1f // check if flat8out is needed
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ add v2.8h, v17.8h, v17.8h // p6 + p6
+ add v4.8h, v17.8h, v18.8h // p6 + p5
+ add v6.8h, v17.8h, v19.8h // p6 + p4
+ add v8.8h, v17.8h, v20.8h // p6 + p3
+ add v12.8h, v2.8h, v4.8h
+ add v10.8h, v6.8h, v8.8h
+ add v6.8h, v17.8h, v21.8h // p6 + p2
+ add v12.8h, v12.8h, v10.8h
+ add v8.8h, v17.8h, v22.8h // p6 + p1
+ add v10.8h, v18.8h, v23.8h // p5 + p0
+ add v6.8h, v6.8h, v8.8h
+ add v8.8h, v19.8h, v24.8h // p4 + q0
+ add v12.8h, v12.8h, v6.8h
+ add v10.8h, v10.8h, v8.8h
+ add v6.8h, v20.8h, v25.8h // p3 + q1
+ add v12.8h, v12.8h, v10.8h // v12 = running sum, updated incrementally for each output below
+ sub v6.8h, v6.8h, v2.8h
+ add v2.8h, v21.8h, v26.8h // p2 + q2
+ urshr v0.8h, v12.8h, #4 // out p5
+ add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
+ sub v2.8h, v2.8h, v4.8h
+ add v4.8h, v22.8h, v27.8h // p1 + q3
+ add v6.8h, v17.8h, v19.8h // p6 + p4
+ urshr v1.8h, v12.8h, #4 // out p4
+ add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
+ sub v4.8h, v4.8h, v6.8h
+ add v6.8h, v23.8h, v28.8h // p0 + q4
+ add v8.8h, v17.8h, v20.8h // p6 + p3
+ urshr v2.8h, v12.8h, #4 // out p3
+ add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
+ sub v6.8h, v6.8h, v8.8h
+ add v8.8h, v24.8h, v29.8h // q0 + q5
+ add v4.8h, v17.8h, v21.8h // p6 + p2
+ urshr v3.8h, v12.8h, #4 // out p2
+ add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
+ sub v8.8h, v8.8h, v4.8h
+ add v6.8h, v25.8h, v30.8h // q1 + q6
+ add v10.8h, v17.8h, v22.8h // p6 + p1
+ urshr v4.8h, v12.8h, #4 // out p1
+ add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
+ sub v6.8h, v6.8h, v10.8h
+ add v8.8h, v26.8h, v30.8h // q2 + q6
+ bif v0.16b, v18.16b, v15.16b // out p5
+ add v10.8h, v18.8h, v23.8h // p5 + p0
+ urshr v5.8h, v12.8h, #4 // out p0
+ add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
+ sub v8.8h, v8.8h, v10.8h
+ add v10.8h, v27.8h, v30.8h // q3 + q6
+ bif v1.16b, v19.16b, v15.16b // out p4
+ add v18.8h, v19.8h, v24.8h // p4 + q0
+ urshr v6.8h, v12.8h, #4 // out q0
+ add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
+ sub v10.8h, v10.8h, v18.8h
+ add v8.8h, v28.8h, v30.8h // q4 + q6
+ bif v2.16b, v20.16b, v15.16b // out p3
+ add v18.8h, v20.8h, v25.8h // p3 + q1
+ urshr v7.8h, v12.8h, #4 // out q1
+ add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
+ sub v18.8h, v8.8h, v18.8h
+ add v10.8h, v29.8h, v30.8h // q5 + q6
+ bif v3.16b, v21.16b, v15.16b // out p2
+ add v20.8h, v21.8h, v26.8h // p2 + q2
+ urshr v8.8h, v12.8h, #4 // out q2
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ sub v10.8h, v10.8h, v20.8h
+ add v18.8h, v30.8h, v30.8h // q6 + q6
+ bif v4.16b, v22.16b, v15.16b // out p1
+ add v20.8h, v22.8h, v27.8h // p1 + q3
+ urshr v9.8h, v12.8h, #4 // out q3
+ add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
+ sub v18.8h, v18.8h, v20.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
+ urshr v10.8h, v12.8h, #4 // out q4
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ urshr v11.8h, v12.8h, #4 // out q5
+ bif v6.16b, v24.16b, v15.16b // out q0
+ bif v7.16b, v25.16b, v15.16b // out q1
+ bif v8.16b, v26.16b, v15.16b // out q2
+ bif v9.16b, v27.16b, v15.16b // out q3
+ bif v10.16b, v28.16b, v15.16b // out q4
+ bif v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+ mov x14, #0 // all pixels need to be stored
+ ret
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ mov x14, #(1 << 6)
+ ret
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ mov x14, #(1 << 4)
+ ret
+.endif
+endfunc
+.endm
+
+loop_filter 16 // defines lpf_8_wd16_neon
+loop_filter 8 // defines lpf_8_wd8_neon
+loop_filter 6 // defines lpf_8_wd6_neon
+loop_filter 4 // defines lpf_8_wd4_neon
+
+.macro lpf_8_wd16 // run the wd16 core, then dispatch on x14 to the store path of the enclosing function
+ bl lpf_8_wd16_neon
+ cbz x14, 1f // x14 == 0: continue after the macro, where all pixels are stored
+ tbnz x14, #6, 7f // (1 << 6): label 7 of the enclosing function stores the inner 6 pixels
+ tbnz x14, #4, 8f // (1 << 4): label 8 of the enclosing function stores the inner 4 pixels
+ ret x15 // (1 << 0): nothing to store, return through the address the enclosing function saved in x15
+1:
+.endm
+
+.macro lpf_8_wd8 // run the wd8 core, then dispatch on x14 to the store path of the enclosing function
+ bl lpf_8_wd8_neon
+ cbz x14, 1f // x14 == 0: continue after the macro, where all pixels are stored
+ tbnz x14, #4, 8f // (1 << 4): label 8 of the enclosing function stores the inner 4 pixels
+ ret x15 // (1 << 0): nothing to store, return through the address the enclosing function saved in x15
+1:
+.endm
+
+.macro lpf_8_wd6 // run the wd6 core; skip the stores of the enclosing function if nothing was filtered
+ bl lpf_8_wd6_neon
+ cbz x14, 1f // x14 == 0: continue after the macro and store
+ ret x15 // (1 << 0): nothing to store, return through the address the enclosing function saved in x15
+1:
+.endm
+
+.macro lpf_8_wd4 // run the wd4 core; skip the stores of the enclosing function if nothing was filtered
+ bl lpf_8_wd4_neon
+ cbz x14, 1f // x14 == 0: continue after the macro and store
+ ret x15 // (1 << 0): nothing to store, return through the address the enclosing function saved in x15
+1:
+.endm
+
+function lpf_v_4_8_neon // wd4 filter on 8 pixels of the rows around x0; x0 = q0 row, x1 = stride in bytes; x0 is unchanged on return
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd4 overwrites x30
+ sub x16, x0, x1, lsl #1 // x16 = two rows above x0
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+
+ lpf_8_wd4
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_4_8_neon // wd4 filter on 8 rows around the column at x0; x1 = stride in bytes; x0 is returned advanced by 8 rows
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd4 overwrites x30
+ sub x16, x0, #4 // two 16 bit pixels to the left of x0
+ add x0, x16, x1, lsl #2 // x0 = same column, 4 rows further down
+ ld1 {v22.d}[0], [x16], x1 // rows 0-3 go into the low halves, rows 4-7 into the high halves
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original column, 8 rows down
+
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd4
+
+ sub x16, x0, x1, lsl #3 // back up 8 rows
+ sub x16, x16, #4 // and two pixels to the left
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original column, 8 rows down
+ ret x15
+endfunc
+
+function lpf_v_6_8_neon // wd6 filter on 8 pixels of the rows around x0; x0 = q0 row, x1 = stride in bytes; x0 is unchanged on return
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd6 overwrites x30
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1 // x16 = three rows above x0
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1 // back to the q0 row
+
+ lpf_8_wd6
+
+ sub x16, x0, x1, lsl #1 // only p1..q1 are written back
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_6_8_neon // wd6 filter on 8 rows around the column at x0; x1 = stride in bytes; x0 is returned advanced by 8 rows
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd6 overwrites x30
+ sub x16, x0, #8 // four 16 bit pixels to the left of x0
+ add x0, x16, x1, lsl #2 // x0 = same column, 4 rows further down
+ ld1 {v20.8h}, [x16], x1 // rows 0-3 via x16, rows 4-7 via x0, 8 pixels per row
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ add x0, x0, #8 // x0 = original column, 8 rows down
+
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd6
+
+ sub x16, x0, x1, lsl #3 // back up 8 rows
+ sub x16, x16, #4 // and two pixels to the left: only p1..q1 are written back
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original column, 8 rows down
+ ret x15
+endfunc
+
+function lpf_v_8_8_neon // wd8 filter on 8 pixels of the rows around x0; x0 = q0 row, x1 = stride in bytes; x0 is unchanged on return
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd8 overwrites x30
+ sub x16, x0, x1, lsl #2 // x16 = four rows above x0
+ ld1 {v20.8h}, [x16], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2 // back to the q0 row
+
+ lpf_8_wd8
+
+ sub x16, x0, x1, lsl #1 // full store path: p2..q2
+ sub x16, x16, x1
+ st1 {v21.8h}, [x16], x1 // p2
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v25.8h}, [x0], x1 // q1
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1 // back to the q0 row
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1 // short store path, reached from lpf_8_wd8: p1..q1 only
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_8_8_neon // wd8 filter on 8 rows around the column at x0; x1 = stride in bytes; x0 is returned advanced by 8 rows
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd8 overwrites x30
+ sub x16, x0, #8 // four 16 bit pixels to the left of x0
+ add x0, x16, x1, lsl #2 // x0 = same column, 4 rows further down
+ ld1 {v20.8h}, [x16], x1 // rows 0-3 via x16, rows 4-7 via x0, 8 pixels per row
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ add x0, x0, #8 // x0 = original column, 8 rows down
+
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd8
+
+ sub x16, x0, x1, lsl #3 // full store path: back up 8 rows
+ sub x16, x16, #8 // and four pixels to the left
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v20.8h}, [x16], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x16], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x16], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x16], x1
+ st1 {v27.8h}, [x0], x1
+ add x0, x0, #8 // x0 = original column, 8 rows down
+ ret x15
+8:
+ sub x16, x0, x1, lsl #3 // short store path, reached from lpf_8_wd8: back up 8 rows
+ sub x16, x16, #4 // and two pixels to the left: only p1..q1 are written back
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original column, 8 rows down
+ ret x15
+endfunc
+
+function lpf_v_16_8_neon // wd16 filter on 8 pixels of the rows around x0; x0 = q0 row, x1 = stride in bytes; x0 is unchanged on return
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd16 overwrites x30
+
+ sub x16, x0, x1, lsl #3
+ add x16, x16, x1 // x16 = seven rows above x0
+ ld1 {v17.8h}, [x16], x1 // p6
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v18.8h}, [x16], x1 // p5
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v19.8h}, [x16], x1 // p4
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v20.8h}, [x16], x1 // p3
+ ld1 {v27.8h}, [x0], x1 // q3
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v28.8h}, [x0], x1 // q4
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v29.8h}, [x0], x1 // q5
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v30.8h}, [x0], x1 // q6
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1 // back to the q0 row
+
+ lpf_8_wd16
+
+ sub x16, x0, x1, lsl #2 // full store path: the wd16 outputs live in v0-v11
+ sub x16, x16, x1, lsl #1 // x16 = six rows above x0
+ st1 {v0.8h}, [x16], x1 // p5
+ st1 {v6.8h}, [x0], x1 // q0
+ st1 {v1.8h}, [x16], x1 // p4
+ st1 {v7.8h}, [x0], x1 // q1
+ st1 {v2.8h}, [x16], x1 // p3
+ st1 {v8.8h}, [x0], x1 // q2
+ st1 {v3.8h}, [x16], x1 // p2
+ st1 {v9.8h}, [x0], x1 // q3
+ st1 {v4.8h}, [x16], x1 // p1
+ st1 {v10.8h}, [x0], x1 // q4
+ st1 {v5.8h}, [x16], x1 // p0
+ st1 {v11.8h}, [x0], x1 // q5
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+7:
+ sub x16, x0, x1 // inner 6 store path, reached from lpf_8_wd16
+ sub x16, x16, x1, lsl #1 // x16 = three rows above x0
+ st1 {v21.8h}, [x16], x1 // p2
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v25.8h}, [x0], x1 // q1
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1 // back to the q0 row
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1 // inner 4 store path, reached from lpf_8_wd16
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1 // back to the q0 row
+ ret x15
+endfunc
+
+function lpf_h_16_8_neon // wd16 filter on 8 rows around the column at x0; x1 = stride in bytes; x0 is returned advanced by 8 rows
+ mov x15, x30 // keep the return address, bl inside lpf_8_wd16 overwrites x30
+ sub x16, x0, #16 // eight 16 bit pixels to the left of x0
+ ld1 {v16.8h}, [x16], x1 // per row: 8 pixels left of x0 into v16-v23, 8 pixels from x0 on into v24-v31
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v17.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v18.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v19.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v28.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v29.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v30.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v31.8h}, [x0], x1
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ lpf_8_wd16
+
+ sub x0, x0, x1, lsl #3 // full store path: back up 8 rows
+ sub x16, x0, #16
+
+ transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 // unfiltered v16, v17 plus outputs p5..p0 in v0-v5
+ transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 // outputs q0..q5 in v6-v11 plus unfiltered v30, v31
+
+ st1 {v16.8h}, [x16], x1
+ st1 {v6.8h}, [x0], x1
+ st1 {v17.8h}, [x16], x1
+ st1 {v7.8h}, [x0], x1
+ st1 {v0.8h}, [x16], x1
+ st1 {v8.8h}, [x0], x1
+ st1 {v1.8h}, [x16], x1
+ st1 {v9.8h}, [x0], x1
+ st1 {v2.8h}, [x16], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v3.8h}, [x16], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v4.8h}, [x16], x1
+ st1 {v30.8h}, [x0], x1
+ st1 {v5.8h}, [x16], x1
+ st1 {v31.8h}, [x0], x1
+ ret x15
+
+7:
+ sub x16, x0, x1, lsl #3 // inner 6 store path, reached from lpf_8_wd16: back up 8 rows
+ sub x16, x16, #8 // and four pixels to the left
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v20.8h}, [x16], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x16], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x16], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x16], x1
+ st1 {v27.8h}, [x0], x1
+ add x0, x0, #8 // x0 = original column, 8 rows down
+ ret x15
+8:
+ sub x16, x0, x1, lsl #3 // inner 4 store path, reached from lpf_8_wd16: back up 8 rows
+ sub x16, x16, #4 // and two pixels to the left
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4 // x0 = original column, 8 rows down
+ ret x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
+// The arguments are read from x0-x5 and w7 in the order above; w (w6) is overwritten without being read.
+.macro lpf_func dir, type // dir = v or h, type = y or uv
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+ mov x11, x30 // keep the return address, the bl calls below overwrite x30
+ mov w8, w7 // bitdepth_max
+ clz w9, w8
+ mov w10, #24
+ sub w9, w10, w9 // bitdepth_min_8
+ stp d8, d9, [sp, #-0x40]! // save d8-d15, which the filter code overwrites
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ ldp w6, w7, [x2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr w2, [x2, #8] // vmask[2]
+.endif
+ add x5, x5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr w7, w7, w2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub x4, x3, x4, lsl #2 // x4 = &l[-b4_stride]
+.else
+ sub x3, x3, #4 // x3 = &l[-1]
+ lsl x4, x4, #2 // b4_stride in bytes
+.endif
+ orr w6, w6, w7 // vmask[0] |= vmask[1]
+
+1:
+ tst w6, #0x03 // two mask bits per iteration; the flags are used by the b.eq after the loads
+.ifc \dir, v
+ ld1 {v0.8b}, [x4], #8 // two entries of l[-b4_stride]
+ ld1 {v1.8b}, [x3], #8 // two entries of l[0]
+.else
+ ld2 {v0.s,v1.s}[0], [x3], x4 // v0 = l[-1], v1 = l[0] for the first entry
+ ld2 {v0.s,v1.s}[1], [x3], x4 // same for the entry one b4_stride further
+.endif
+ b.eq 7f // if (!(vm & bits)) continue;
+
+ ld1r {v5.8b}, [x5] // sharp[0]
+ add x5, x5, #8
+ movi v2.2s, #0xff
+ dup v13.2s, w6 // vmask[0]
+ dup v31.8h, w9 // bitdepth_min_8
+
+ and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
+ and v1.8b, v1.8b, v2.8b
+ cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
+ movi v4.8b, #1
+ ld1r {v6.8b}, [x5] // sharp[1]
+ sub x5, x5, #8 // restore x5 for the next iteration
+ bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.2s, v1.2s, v2.2s // L != 0
+ mul v1.2s, v1.2s, v4.2s // L
+.ifc \type, y
+ dup v15.2s, w2 // vmask[2]
+.endif
+ dup v14.2s, w7 // vmask[1]
+ mov x16, v2.d[0]
+ cmp x16, #0
+ b.eq 7f // if (!L) continue;
+ neg v5.8b, v5.8b // -sharp[0]
+ movrel x16, word_12
+ ushr v12.8b, v1.8b, #4 // H
+ ld1 {v16.2s}, [x16] // bits = { 1, 2 }, one mask bit per 32 bit lane
+ sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
+.ifc \type, y
+ cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
+.endif
+ movi v7.8b, #2
+ umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
+ add v0.8b, v1.8b, v7.8b // L + 2
+ umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
+ add v0.8b, v0.8b, v0.8b // 2*(L + 2)
+ cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
+ uxtl v12.8h, v12.8b // widen H to 16 bit lanes
+ add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
+ cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
+ uxtl v11.8h, v11.8b // widen I to 16 bit lanes
+ uxtl v10.8h, v10.8b // widen E to 16 bit lanes
+ and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
+ sxtl v14.8h, v14.8b // sign extend the byte masks to 16 bit lane masks
+ sxtl v13.8h, v13.8b
+.ifc \type, y
+ sxtl v15.8h, v15.8b
+.endif
+ ushl v12.8h, v12.8h, v31.8h // H << bitdepth_min_8
+ ushl v11.8h, v11.8h, v31.8h // I << bitdepth_min_8
+ ushl v10.8h, v10.8h, v31.8h // E << bitdepth_min_8
+
+.ifc \type, y
+ tst w2, #0x03
+ b.eq 2f
+ // wd16
+ bl lpf_\dir\()_16_8_neon
+ b 8f
+2:
+.endif
+ tst w7, #0x03
+ b.eq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_8_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_8_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_8_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment x0.
+ // If the whole function is skipped, increment it here instead.
+ add x0, x0, x1, lsl #3
+.else
+7:
+.endif
+8:
+ lsr w6, w6, #2 // vmask[0] >>= 2
+ lsr w7, w7, #2 // vmask[1] >>= 2
+.ifc \type, y
+ lsr w2, w2, #2 // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+ add x0, x0, #16 // advance by 8 pixels of 16 bit
+.else
+ // For dir h, x0 is returned incremented
+.endif
+ cbnz w6, 1b // loop while any mask bit is left
+
+ ldp d14, d15, [sp, #0x30] // restore d8-d15
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x11
+endfunc
+.endm
+
+lpf_func v, y // defines lpf_v_sb_y_16bpc_neon: picks between the wd16, wd8 and wd4 filters
+lpf_func h, y // defines lpf_h_sb_y_16bpc_neon
+lpf_func v, uv // defines lpf_v_sb_uv_16bpc_neon: picks between the wd6 and wd4 filters
+lpf_func h, uv // defines lpf_h_sb_uv_16bpc_neon
+
+const word_12
+ .word 1, 2 // one bit per 32 bit lane; loaded into v16.2s and tested (cmtst) against the broadcast vmask words
+endconst
diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S
new file mode 100644
index 0000000000..f8dc0df4d8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -0,0 +1,1303 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // 48 zero bytes: lanes under these keep their loaded pixel
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask: // users load from (right_ext_mask - offset) to get a mask that is 0 for valid lanes and 0xff for lanes to pad
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff // 48 0xff bytes: lanes under these are overwritten by the bit instruction
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
+function wiener_filter7_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp // x29 keeps the original sp so the scratch area can be dropped on exit
+ ld1 {v0.8h, v1.8h}, [x6] // v0 = filter[0] (used by the h pass), v1 = filter[1] (used by the v pass)
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17 // v30: constant subtracted from (center pixel << 7) in the h pass
+ movi v31.8h, #8, lsl #8 // v31 = 2048, added after the >> 3 in the h pass
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_7) // !LR_HAVE_TOP (flags from the tst above)
+
+ mov x16, x2 // backup left
+ mov x2, #0 // left = NULL while filtering the two lpf rows
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_8bpc_neon
+ bl wiener_filter7_hv_8bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_8bpc_neon
+
+ mov sp, x29 // drop the stack scratch area
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_8bpc_neon // falls through: three v-only rows remain from here
+L(v2_7):
+ bl wiener_filter7_v_8bpc_neon
+ b L(v1_7) // last v-only row, then the common epilogue
+endfunc
+
+
+function wiener_filter7_h_8bpc_neon
+ stp x3, x4, [sp, #-32]! // Horizontal 7-tap pass over one row: reads pixels at x3, writes 16-bit results at x14; x3, x4 and x14 are preserved
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b // widen the 24 loaded pixels to 16 bit in v2, v3, v4
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b // replace lanes selected by the mask with the padding pixel
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6 // v18 = center pixels (offset 3 of 7)
+ add v19.8h, v19.8h, v17.8h // pair up the pixels that share a tap: offsets 4+2
+ add v20.8h, v20.8h, v16.8h // offsets 5+1
+ add v21.8h, v21.8h, v2.8h // offsets 6+0
+ shl v22.8h, v18.8h, #7 // center pixel << 7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4 // same computation for the second group of 8 outputs
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h // saturating add of the tap sum and the offset center term
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16 // w -= 16
+
+ st1 {v6.8h, v7.8h}, [x14], #32 // store 16 results to the row buffer
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter7_v_8bpc_neon
+ // Vertical 7-tap pass: combines the row buffers at x9-x14 into one row of
+ // 8-bit output pixels at x0. Registers are backed up/restored shifted, so that x9 gets the value of x10, etc, afterwards.
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32] // x14 stored twice, so both x13 and x14 reload as the old x14
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ add v24.8h, v24.8h, v20.8h
+ ld1 {v26.8h, v27.8h}, [x14], #32
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v28.8h, v26.8h, v18.8h
+ ld1 {v22.8h, v23.8h}, [x12], #32
+
+ add v16.8h, v26.8h, v16.8h // the x14 row (v26/v27) is used twice: once with x10, once with x9
+ add v25.8h, v25.8h, v21.8h
+
+ smull v2.4s, v22.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v28.4h, v1.h[5]
+ smlal v2.4s, v16.4h, v1.h[6]
+ add v29.8h, v27.8h, v19.8h
+ smull2 v3.4s, v22.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v28.8h, v1.h[5]
+ smlal2 v3.4s, v16.8h, v1.h[6]
+ add v17.8h, v27.8h, v17.8h
+ smull v4.4s, v23.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v29.4h, v1.h[5]
+ smlal v4.4s, v17.4h, v1.h[6]
+ smull2 v5.4s, v23.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v29.8h, v1.h[5]
+ smlal2 v5.4s, v17.8h, v1.h[6]
+ sqrshrun v2.4h, v2.4s, #11 // rounding shift by 11 with unsigned saturation to 16 bit
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h // saturate to 8-bit pixels
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16 // w -= 16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1 // dst += stride
+ ret
+endfunc
+
+function wiener_filter7_hv_8bpc_neon
+ // Combined pass: horizontally filters one new row from x3 into the buffer at x15, and writes one vertically filtered output row at x0.
+ // Backing up/restoring registers shifted, so that x9 gets the value of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48] // old x10 is reloaded into x15 on exit
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ ld1 {v20.8h, v21.8h}, [x11], #32 // from here the row-buffer loads for the vertical pass are interleaved with the end of the horizontal pass
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v26.8h, v27.8h}, [x13], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v28.8h, v29.8h}, [x14], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v26.8h, v20.8h, v26.8h
+
+ ld1 {v24.8h, v25.8h}, [x12], #32
+ add v28.8h, v18.8h, v28.8h
+
+ add v16.8h, v16.8h, v6.8h // the freshly filtered row (v6/v7) is paired with the x9 row
+ add v27.8h, v21.8h, v27.8h
+
+ smull v18.4s, v24.4h, v1.h[3]
+ smlal v18.4s, v26.4h, v1.h[4]
+ smlal v18.4s, v28.4h, v1.h[5]
+ smlal v18.4s, v16.4h, v1.h[6]
+ add v29.8h, v19.8h, v29.8h
+ smull2 v19.4s, v24.8h, v1.h[3]
+ smlal2 v19.4s, v26.8h, v1.h[4]
+ smlal2 v19.4s, v28.8h, v1.h[5]
+ smlal2 v19.4s, v16.8h, v1.h[6]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v25.4h, v1.h[3]
+ smlal v20.4s, v27.4h, v1.h[4]
+ smlal v20.4s, v29.4h, v1.h[5]
+ smlal v20.4s, v17.4h, v1.h[6]
+ smull2 v21.4s, v25.8h, v1.h[3]
+ smlal2 v21.4s, v27.8h, v1.h[4]
+ smlal2 v21.4s, v29.8h, v1.h[5]
+ smlal2 v21.4s, v17.8h, v1.h[6]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32 // store the new horizontally filtered row to the x15 buffer
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16 // w -= 16
+
+ st1 {v18.16b}, [x0], #16 // store 16 output pixels
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1 // src += stride
+ add x0, x0, x1 // dst += stride
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
+function wiener_filter5_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp // x29 keeps the original sp so the scratch area can be dropped on exit
+ ld1 {v0.8h, v1.8h}, [x6] // v0 = filter[0] (used by the h pass), v1 = filter[1] (used by the v pass)
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17 // v30: constant subtracted from (center pixel << 7) in the h pass
+ movi v31.8h, #8, lsl #8 // v31 = 2048, added after the >> 3 in the h pass
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5) // !LR_HAVE_TOP (flags from the tst above)
+
+ mov x16, x2 // backup left
+ mov x2, #0 // left = NULL while filtering the two lpf rows
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_8bpc_neon
+ bl wiener_filter5_hv_8bpc_neon
+L(end_5):
+
+ mov sp, x29 // drop the stack scratch area
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_8bpc_neon
+ add x0, x0, x1 // dst += stride (the v helper leaves x0 unchanged)
+ mov x11, x12 // rotate the row buffers by hand: t4 = t3
+ mov x12, x13 // t3 = t2
+ mov x13, x14 // t2 = t1
+L(v1_5):
+ bl wiener_filter5_v_8bpc_neon
+ b L(end_5)
+endfunc
+
+
+function wiener_filter5_h_8bpc_neon
+ stp x3, x4, [sp, #-32]! // Horizontal 5-tap pass over one row: reads pixels at x3, writes 16-bit results at x14; x3, x4 and x14 are preserved
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b // widen the 24 loaded pixels to 16 bit in v2, v3, v4
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b // replace lanes selected by the mask with the padding pixel
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4 // v17 = center pixels (offset 2 of 5)
+ add v18.8h, v18.8h, v16.8h // pair up the pixels that share a tap: offsets 3+1
+ add v19.8h, v19.8h, v2.8h // offsets 4+0
+ shl v22.8h, v17.8h, #7 // center pixel << 7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2 // same computation for the second group of 8 outputs
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h // saturating add of the tap sum and the offset center term
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16 // w -= 16
+
+ st1 {v6.8h, v7.8h}, [x14], #32 // store 16 results to the row buffer
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter5_v_8bpc_neon
+ stp x11, x12, [sp, #-48]! // Vertical 5-tap pass: combines the row buffers at x11-x14 into one row of 8-bit pixels at x0
+ stp x13, x14, [sp, #16] // x11-x14, x0 and x4 are all restored unchanged on exit (no rotation, no dst advance here)
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ ld1 {v16.8h, v17.8h}, [x11], #32
+
+ add v24.8h, v22.8h, v18.8h // the x14 row (v22/v23) is used twice: once with x12, once with x11
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v16.8h, v22.8h, v16.8h
+ add v25.8h, v23.8h, v19.8h
+
+ smull v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v16.4h, v1.h[5]
+ add v17.8h, v23.8h, v17.8h
+ smull2 v3.4s, v20.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v16.8h, v1.h[5]
+ smull v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v17.4h, v1.h[5]
+ smull2 v5.4s, v21.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v17.8h, v1.h[5]
+ sqrshrun v2.4h, v2.4s, #11 // rounding shift by 11 with unsigned saturation to 16 bit
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h // saturate to 8-bit pixels
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16 // w -= 16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+function wiener_filter5_hv_8bpc_neon
+ // Combined pass: horizontally filters one new row from x3 into the buffer at x15, and writes one vertically filtered output row at x0.
+ // Backing up/restoring registers shifted, so that x11 gets the value of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32] // old x12 is reloaded into x15 on exit
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ ld1 {v18.8h, v19.8h}, [x12], #32 // from here the row-buffer loads for the vertical pass are interleaved with the end of the horizontal pass
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v24.8h, v25.8h}, [x14], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ add v24.8h, v24.8h, v18.8h
+ add v16.8h, v16.8h, v6.8h // the freshly filtered row (v6/v7) is paired with the x11 row
+
+ smull v18.4s, v20.4h, v1.h[3]
+ smlal v18.4s, v24.4h, v1.h[4]
+ smlal v18.4s, v16.4h, v1.h[5]
+ add v25.8h, v25.8h, v19.8h
+ smull2 v19.4s, v20.8h, v1.h[3]
+ smlal2 v19.4s, v24.8h, v1.h[4]
+ smlal2 v19.4s, v16.8h, v1.h[5]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v21.4h, v1.h[3]
+ smlal v20.4s, v25.4h, v1.h[4]
+ smlal v20.4s, v17.4h, v1.h[5]
+ smull2 v21.4s, v21.8h, v1.h[3]
+ smlal2 v21.4s, v25.8h, v1.h[4]
+ smlal2 v21.4s, v17.8h, v1.h[5]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32 // store the new horizontally filtered row to the x15 buffer
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16 // w -= 16
+
+ st1 {v18.16b}, [x0], #16 // store 16 output pixels
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1 // src += stride
+ add x0, x0, x1 // dst += stride
+
+ ret
+endfunc
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box3_row_h_8bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #2
+ ld1 {v0.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v1.s}[3], [x2]
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 2x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+
+2:
+ umull v1.8h, v0.8b, v0.8b // v1/v2 = squares of the 16 pixels in v0
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 2 + 1)
+ ldr b30, [x3, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w4, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ uaddl v3.8h, v0.8b, v16.8b // v3 = sum of 3 neighbouring pixels (offsets 0, 1, 2)
+ ext v20.16b, v1.16b, v2.16b, #2
+ uaddw v3.8h, v3.8h, v17.8b
+
+ ext v21.16b, v1.16b, v2.16b, #4
+
+ uaddl v26.4s, v1.4h, v20.4h // v26/v27 = sum of the 3 matching squares, widened to 32 bit
+ uaddl2 v27.4s, v1.8h, v20.8h
+ uaddw v26.4s, v26.4s, v21.4h
+ uaddw2 v27.4s, v27.4s, v21.8h
+
+ subs w4, w4, #8 // w -= 8
+
+ st1 {v3.8h}, [x1], #16 // sum
+ st1 {v26.4s,v27.4s}, [x0], #32 // sumsq
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box5_row_h_8bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #3
+ ld1 {v0.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b // v1/v2 = squares of the 16 pixels in v0
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 3 + 1)
+ ldr b30, [x3, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w4, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v0.16b, v0.16b, #3
+ ext v19.16b, v0.16b, v0.16b, #4
+ uaddl v3.8h, v0.8b, v16.8b // v3 = sum of 5 neighbouring pixels (offsets 0..4)
+ uaddl v24.8h, v17.8b, v18.8b
+ uaddw v3.8h, v3.8h, v19.8b
+ add v3.8h, v3.8h, v24.8h
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v18.16b, v1.16b, v2.16b, #6
+ ext v19.16b, v1.16b, v2.16b, #8
+
+ uaddl v26.4s, v1.4h, v16.4h // v26/v27 = sum of the 5 matching squares, widened to 32 bit
+ uaddl2 v27.4s, v1.8h, v16.8h
+ uaddl v16.4s, v17.4h, v18.4h
+ uaddl2 v17.4s, v17.8h, v18.8h
+ uaddw v26.4s, v26.4s, v19.4h
+ uaddw2 v27.4s, v27.4s, v19.8h
+ add v26.4s, v26.4s, v16.4s
+ add v27.4s, v27.4s, v17.4s
+
+ subs w4, w4, #8 // w -= 8
+
+ st1 {v3.8h}, [x1], #16 // sum
+ st1 {v26.4s,v27.4s}, [x0], #32 // sumsq
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
+// int32_t *sumsq5, int16_t *sum5,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box35_row_h_8bpc_neon, export=1
+ add w6, w6, #2 // w += 2
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x4, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x5, x5, #3
+ ld1 {v0.16b}, [x5], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.16b}, [x5], #16
+ ld1 {v1.s}[3], [x4], #4
+ // Move x5 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x5, x5, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ b 2f
+
+1:
+ ld1 {v0.16b}, [x5], #16
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ // Move x5 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x5, x5, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b // v1/v2 = squares of the 16 pixels in v0
+ umull2 v2.8h, v0.16b, v0.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w6, #(2 + 16 - 3 + 1)
+ ldr b30, [x5, w13, sxtw]
+ // Fill v30 with the right padding pixel
+ dup v30.16b, v30.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w6, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w6, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v19.16b, v0.16b, v0.16b, #4
+ ext v18.16b, v0.16b, v0.16b, #3
+ uaddl v3.8h, v16.8b, v17.8b // v3 = 3-pixel sum over offsets 1, 2, 3
+ uaddl v24.8h, v0.8b, v19.8b // v24 = the two outer pixels (offsets 0 and 4)
+ uaddw v3.8h, v3.8h, v18.8b
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v19.16b, v1.16b, v2.16b, #8
+ ext v18.16b, v1.16b, v2.16b, #6
+
+ st1 {v3.8h}, [x1], #16 // sum3
+ add v3.8h, v3.8h, v24.8h // extend the 3-pixel sum to the 5-pixel sum
+
+ uaddl v26.4s, v16.4h, v17.4h
+ uaddl2 v27.4s, v16.8h, v17.8h
+ uaddl v16.4s, v1.4h, v19.4h
+ uaddl2 v17.4s, v1.8h, v19.8h
+ uaddw v26.4s, v26.4s, v18.4h
+ uaddw2 v27.4s, v27.4s, v18.8h
+
+ st1 {v26.4s,v27.4s}, [x0], #32 // sumsq3
+ add v26.4s, v26.4s, v16.4s // extend the 3-square sum to the 5-square sum
+ add v27.4s, v27.4s, v17.4s
+
+ subs w6, w6, #8 // w -= 8
+
+ st1 {v3.8h}, [x3], #16 // sum5
+ st1 {v26.4s,v27.4s}, [x2], #32 // sumsq5
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x5], #8
+ mov v1.16b, v2.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+sgr_funcs 8
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
new file mode 100644
index 0000000000..3b76b1ee2a
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -0,0 +1,1388 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// Mask table used for right-edge padding: 48 bytes of 0x00 followed by
+// 48 bytes of 0xff. The filter functions below load from right_ext_mask
+// minus a byte offset derived from the remaining width, which yields a
+// mask that is zero for the leading (valid) lanes and all ones for the
+// trailing lanes; "bit" then inserts the padding pixel only into the
+// all-ones lanes.
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
+//
+// Top-level driver for the 7-tap Wiener filter. It reserves room for six
+// rows of 384 16-bit intermediate values on the stack and keeps pointers to
+// those rows in x9-x15 (t6-t0, listed below). Rows are filtered
+// horizontally by wiener_filter7_h_16bpc_neon, combined horizontal and
+// vertical passes are done by wiener_filter7_hv_16bpc_neon, and
+// wiener_filter7_v_16bpc_neon produces the final output rows from the rows
+// that are already buffered.
+// Registers as used here: x0 = p, x1 = p_stride, x2 = left, x3 = lpf,
+// w4 = w, w5 = h, x6 = filter, w7 = edges; bitdepth_max is read from the
+// stack.
+function wiener_filter7_16bpc_neon, export=1
+ ldr w8, [sp] // bitdepth_max (passed on the stack)
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16] // v8/v9 are used by the hv helper
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6] // v0 = filter[0], v1 = filter[1]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8 // 32 - bitdepth
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // Row pointers into the stack buffer:
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_7) // relies on the flags from the LR_HAVE_TOP test above
+
+ // LR_HAVE_TOP: filter two rows from lpf first, without left pixels.
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ // One input row in, one output row out; the helper rotates x9-x15
+ // and advances x0 and x3 by the stride.
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ // LR_HAVE_BOTTOM: feed two more rows from the saved lpf pointer.
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_16bpc_neon
+ bl wiener_filter7_hv_16bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_16bpc_neon
+
+ mov sp, x29 // drop the row buffer
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ // !LR_HAVE_TOP: the first row of p stands in for all rows above it.
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += p_stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+ // Vertical-only passes for the remaining output rows: three calls when
+ // entering at v3_7, two at v2_7 and one at v1_7.
+L(v3_7):
+ bl wiener_filter7_v_16bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_16bpc_neon
+ b L(v1_7)
+endfunc
+
+
+// Horizontal pass of the 7-tap Wiener filter for one row.
+// In: x3 = source row, x2 = left edge pixels or NULL, w4 = width,
+// w7 = edge flags, x14 = output row (t1), v0.h[0-3] = horizontal
+// coefficients, v29 = -round_bits_h, v30 = 1 << (bitdepth + 6),
+// v31 = 8192.
+// Writes 16 intermediate 16-bit values per loop iteration to [x14].
+// x3, x4 and x14 are saved and restored. x2 is advanced by 8 bytes when
+// LR_HAVE_LEFT is set and x2 is non-NULL. x6 and x17 are overwritten when
+// right-edge padding is applied.
+function wiener_filter7_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6 // start 3 pixels to the left of src
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8 // 4 left pixels into the upper half of v4
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v2/v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ // Replace the lanes selected by the mask with the padding pixel.
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ // First 8 outputs: the filter is symmetric, so the pixel pairs at
+ // equal distance from the centre tap are summed before multiplying.
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ // Next 8 outputs, same scheme on v3/v4.
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v16.4s, v18.4h, v0.h[3]
+ smlal v16.4s, v19.4h, v0.h[2]
+ smlal v16.4s, v20.4h, v0.h[1]
+ smlal v16.4s, v21.4h, v0.h[0]
+ smull2 v17.4s, v18.8h, v0.h[3]
+ smlal2 v17.4s, v19.8h, v0.h[2]
+ smlal2 v17.4s, v20.8h, v0.h[1]
+ smlal2 v17.4s, v21.8h, v0.h[0]
+
+ // Add the 1 << (bitdepth + 6) offset, round-shift by round_bits_h,
+ // saturate to unsigned 16 bit, clamp to 0x7fff and subtract 8192.
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f // flags from the subs above
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+// Vertical-only pass of the 7-tap Wiener filter for one output row.
+// In: x9-x14 = intermediate rows t6-t1, x0 = destination row, w4 = width,
+// x1 = destination stride, v0.h[4-7] = vertical coefficients,
+// v27 = -round_bits_v, v28 = bitdepth_max.
+// x14 (t1) is used for both of the last two taps, in the place where the
+// hv variant uses the freshly filtered row.
+// On return x9-x13 have moved on by one row (x9 = old x10, ...,
+// x13 = old x14), x14 and w4 are unchanged, and x0 has advanced by x1.
+function wiener_filter7_v_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32] // x14 twice: restored into both x13 and x14
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ ld1 {v6.8h, v7.8h}, [x14], #32
+
+ // 16 outputs per iteration, as four accumulators of 4 lanes each.
+ smull v2.4s, v16.4h, v0.h[4]
+ smlal v2.4s, v18.4h, v0.h[5]
+ smlal v2.4s, v20.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[7]
+ smlal v2.4s, v24.4h, v0.h[6]
+ smlal v2.4s, v6.4h, v0.h[5]
+ smlal v2.4s, v6.4h, v0.h[4]
+ smull2 v3.4s, v16.8h, v0.h[4]
+ smlal2 v3.4s, v18.8h, v0.h[5]
+ smlal2 v3.4s, v20.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[7]
+ smlal2 v3.4s, v24.8h, v0.h[6]
+ smlal2 v3.4s, v6.8h, v0.h[5]
+ smlal2 v3.4s, v6.8h, v0.h[4]
+ smull v4.4s, v17.4h, v0.h[4]
+ smlal v4.4s, v19.4h, v0.h[5]
+ smlal v4.4s, v21.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[7]
+ smlal v4.4s, v25.4h, v0.h[6]
+ smlal v4.4s, v7.4h, v0.h[5]
+ smlal v4.4s, v7.4h, v0.h[4]
+ smull2 v5.4s, v17.8h, v0.h[4]
+ smlal2 v5.4s, v19.8h, v0.h[5]
+ smlal2 v5.4s, v21.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[7]
+ smlal2 v5.4s, v25.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ smlal2 v5.4s, v7.8h, v0.h[4]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b // flags from the subs above
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1 // dst += stride
+ ret
+endfunc
+
+// Combined horizontal and vertical pass of the 7-tap Wiener filter.
+// Filters one new source row horizontally, stores the intermediate result
+// to t0 (x15), and produces one output row at x0 from rows t6-t1 (x9-x14)
+// plus the freshly filtered row.
+// In: x0 = destination row, x1 = stride, x2 = left edge pixels or NULL,
+// x3 = source row, w4 = width, w7 = edge flags, v0 = coefficients,
+// v27 = -round_bits_v, v28 = bitdepth_max, v29 = -round_bits_h,
+// v30 = 1 << (bitdepth + 6), v31 = 8192.
+// On return the row pointers are rotated (x9 = old x10, ..., x14 = old x15,
+// x15 = old x10), w4 is unchanged and both x0 and x3 have advanced by x1.
+// Uses v8/v9; x6 and x17 are overwritten when right-edge padding is applied.
+function wiener_filter7_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48] // x10 again: restored into x15
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6 // start 3 pixels to the left of src
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8 // 4 left pixels into the upper half of v4
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v2/v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ // Replace the lanes selected by the mask with the padding pixel.
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Horizontal filter, first 8 outputs (accumulated in v6/v7).
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ // Horizontal filter, next 8 outputs (accumulated in v24/v25).
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v24.4s, v18.4h, v0.h[3]
+ smlal v24.4s, v19.4h, v0.h[2]
+ smlal v24.4s, v20.4h, v0.h[1]
+ smlal v24.4s, v21.4h, v0.h[0]
+ smull2 v25.4s, v18.8h, v0.h[3]
+ smlal2 v25.4s, v19.8h, v0.h[2]
+ smlal2 v25.4s, v20.8h, v0.h[1]
+ smlal2 v25.4s, v21.8h, v0.h[0]
+
+ // Finish the horizontal result while loading the older rows
+ // (t6-t1) for the vertical filter.
+ ld1 {v16.8h, v17.8h}, [x9], #32
+
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ ld1 {v8.8h, v9.8h}, [x14], #32
+
+ // Vertical filter; v6/v7 (the new row) take the last tap.
+ smull v1.4s, v16.4h, v0.h[4]
+ smlal v1.4s, v18.4h, v0.h[5]
+ smlal v1.4s, v20.4h, v0.h[6]
+ smlal v1.4s, v22.4h, v0.h[7]
+ smlal v1.4s, v24.4h, v0.h[6]
+ smlal v1.4s, v8.4h, v0.h[5]
+ smlal v1.4s, v6.4h, v0.h[4]
+ smull2 v5.4s, v16.8h, v0.h[4]
+ smlal2 v5.4s, v18.8h, v0.h[5]
+ smlal2 v5.4s, v20.8h, v0.h[6]
+ smlal2 v5.4s, v22.8h, v0.h[7]
+ smlal2 v5.4s, v24.8h, v0.h[6]
+ smlal2 v5.4s, v8.8h, v0.h[5]
+ smlal2 v5.4s, v6.8h, v0.h[4]
+ smull v26.4s, v17.4h, v0.h[4]
+ smlal v26.4s, v19.4h, v0.h[5]
+ smlal v26.4s, v21.4h, v0.h[6]
+ smlal v26.4s, v23.4h, v0.h[7]
+ smlal v26.4s, v25.4h, v0.h[6]
+ smlal v26.4s, v9.4h, v0.h[5]
+ smlal v26.4s, v7.4h, v0.h[4]
+ smull2 v16.4s, v17.8h, v0.h[4]
+ smlal2 v16.4s, v19.8h, v0.h[5]
+ smlal2 v16.4s, v21.8h, v0.h[6]
+ smlal2 v16.4s, v23.8h, v0.h[7]
+ smlal2 v16.4s, v25.8h, v0.h[6]
+ smlal2 v16.4s, v9.8h, v0.h[5]
+ smlal2 v16.4s, v7.8h, v0.h[4]
+ srshl v1.4s, v1.4s, v27.4s // -round_bits_v
+ srshl v5.4s, v5.4s, v27.4s
+ srshl v26.4s, v26.4s, v27.4s
+ srshl v16.4s, v16.4s, v27.4s
+ sqxtun v18.4h, v1.4s
+ sqxtun2 v18.8h, v5.4s
+ sqxtun v19.4h, v26.4s
+ sqxtun2 v19.8h, v16.4s
+ st1 {v6.8h, v7.8h}, [x15], #32 // keep the new intermediate row in t0
+ umin v18.8h, v18.8h, v28.8h // bitdepth_max
+ umin v19.8h, v19.8h, v28.8h
+ subs w4, w4, #16
+
+ st1 {v18.8h, v19.8h}, [x0], #32
+
+ b.le 0f // flags from the subs above
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1 // src += stride
+ add x0, x0, x1 // dst += stride
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
+//
+// Top-level driver for the 5-tap Wiener filter. It reserves room for four
+// rows of 384 16-bit intermediate values on the stack and keeps pointers to
+// those rows in x11-x15 (t4-t0, listed below). Rows are filtered
+// horizontally by wiener_filter5_h_16bpc_neon, combined horizontal and
+// vertical passes are done by wiener_filter5_hv_16bpc_neon, and
+// wiener_filter5_v_16bpc_neon produces the final output rows from the rows
+// that are already buffered.
+// Registers as used here: x0 = p, x1 = p_stride, x2 = left, x3 = lpf,
+// w4 = w, w5 = h, x6 = filter, w7 = edges; bitdepth_max is read from the
+// stack.
+function wiener_filter5_16bpc_neon, export=1
+ ldr w8, [sp] // bitdepth_max (passed on the stack)
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16] // v8/v9 are used by the hv helper
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6] // v0 = filter[0], v1 = filter[1]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8 // 32 - bitdepth
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // Row pointers into the stack buffer:
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5) // relies on the flags from the LR_HAVE_TOP test above
+
+ // LR_HAVE_TOP: filter two rows from lpf first, without left pixels.
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ // One input row in, one output row out; the helper rotates x11-x15
+ // and advances x0 and x3 by the stride.
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ // LR_HAVE_BOTTOM: feed two more rows from the saved lpf pointer.
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_16bpc_neon
+ bl wiener_filter5_hv_16bpc_neon
+L(end_5):
+
+ mov sp, x29 // drop the row buffer
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ // !LR_HAVE_TOP: the first row of p stands in for all rows above it.
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ // Two vertical-only rows left. The v helper returns its registers
+ // unchanged, so advance dst and shift the row pointers here.
+ bl wiener_filter5_v_16bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_16bpc_neon
+ b L(end_5)
+endfunc
+
+
+// Horizontal pass of the 5-tap Wiener filter for one row.
+// In: x3 = source row, x2 = left edge pixels or NULL, w4 = width,
+// w7 = edge flags, x14 = output row (t1), v0.h[1-3] = horizontal
+// coefficients, v29 = -round_bits_h, v30 = 1 << (bitdepth + 6),
+// v31 = 8192.
+// Writes 16 intermediate 16-bit values per loop iteration to [x14].
+// x3, x4 and x14 are saved and restored. x2 is advanced by 8 bytes when
+// LR_HAVE_LEFT is set and x2 is non-NULL. x6 and x17 are overwritten when
+// right-edge padding is applied.
+function wiener_filter5_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4 // start 2 pixels to the left of src
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8 // 4 left pixels into the upper half of v4
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v2/v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ // Replace the lanes selected by the mask with the padding pixel.
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ // First 8 outputs: the filter is symmetric, so the pixel pairs at
+ // equal distance from the centre tap are summed before multiplying.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ // Next 8 outputs, same scheme on v3/v4.
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v16.4s, v17.4h, v0.h[3]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[1]
+ smull2 v17.4s, v17.8h, v0.h[3]
+ smlal2 v17.4s, v18.8h, v0.h[2]
+ smlal2 v17.4s, v19.8h, v0.h[1]
+
+ // Add the 1 << (bitdepth + 6) offset, round-shift by round_bits_h,
+ // saturate to unsigned 16 bit, clamp to 0x7fff and subtract 8192.
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f // flags from the subs above
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+// Vertical-only pass of the 5-tap Wiener filter for one output row.
+// In: x11-x14 = intermediate rows t4-t1, x0 = destination row, w4 = width,
+// v0.h[5-7] = vertical coefficients, v27 = -round_bits_v,
+// v28 = bitdepth_max.
+// x14 (t1) is used for both of the last two taps, in the place where the
+// hv variant uses the freshly filtered row.
+// Unlike the 7-tap version, x0, x4 and x11-x14 are all restored to their
+// entry values; advancing dst and shifting the rows is left to the caller.
+function wiener_filter5_v_16bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+
+ // 16 outputs per iteration, as four accumulators of 4 lanes each.
+ smull v2.4s, v16.4h, v0.h[5]
+ smlal v2.4s, v18.4h, v0.h[6]
+ smlal v2.4s, v20.4h, v0.h[7]
+ smlal v2.4s, v22.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[5]
+ smull2 v3.4s, v16.8h, v0.h[5]
+ smlal2 v3.4s, v18.8h, v0.h[6]
+ smlal2 v3.4s, v20.8h, v0.h[7]
+ smlal2 v3.4s, v22.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[5]
+ smull v4.4s, v17.4h, v0.h[5]
+ smlal v4.4s, v19.4h, v0.h[6]
+ smlal v4.4s, v21.4h, v0.h[7]
+ smlal v4.4s, v23.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[5]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b // flags from the subs above
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+// Combined horizontal and vertical pass of the 5-tap Wiener filter.
+// Filters one new source row horizontally, stores the intermediate result
+// to t0 (x15), and produces one output row at x0 from rows t4-t1 (x11-x14)
+// plus the freshly filtered row.
+// In: x0 = destination row, x1 = stride, x2 = left edge pixels or NULL,
+// x3 = source row, w4 = width, w7 = edge flags, v0 = coefficients,
+// v27 = -round_bits_v, v28 = bitdepth_max, v29 = -round_bits_h,
+// v30 = 1 << (bitdepth + 6), v31 = 8192.
+// On return the row pointers are rotated (x11 = old x12, ...,
+// x14 = old x15, x15 = old x12), w4 is unchanged and both x0 and x3 have
+// advanced by x1.
+// Uses v8/v9; x6 and x17 are overwritten when right-edge padding is applied.
+function wiener_filter5_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32] // x12 again: restored into x15
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4 // start 2 pixels to the left of src
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8 // 4 left pixels into the upper half of v4
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v2/v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ // Replace the lanes selected by the mask with the padding pixel.
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Horizontal filter, first 8 outputs (accumulated in v6/v7).
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ // Horizontal filter, next 8 outputs (accumulated in v24/v25).
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v24.4s, v17.4h, v0.h[3]
+ smlal v24.4s, v18.4h, v0.h[2]
+ smlal v24.4s, v19.4h, v0.h[1]
+ smull2 v25.4s, v17.8h, v0.h[3]
+ smlal2 v25.4s, v18.8h, v0.h[2]
+ smlal2 v25.4s, v19.8h, v0.h[1]
+
+ // Finish the horizontal result while loading the older rows
+ // (t4-t1) for the vertical filter.
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ // Vertical filter; v6/v7 (the new row) take the last tap.
+ smull v8.4s, v16.4h, v0.h[5]
+ smlal v8.4s, v18.4h, v0.h[6]
+ smlal v8.4s, v20.4h, v0.h[7]
+ smlal v8.4s, v22.4h, v0.h[6]
+ smlal v8.4s, v6.4h, v0.h[5]
+ smull2 v9.4s, v16.8h, v0.h[5]
+ smlal2 v9.4s, v18.8h, v0.h[6]
+ smlal2 v9.4s, v20.8h, v0.h[7]
+ smlal2 v9.4s, v22.8h, v0.h[6]
+ smlal2 v9.4s, v6.8h, v0.h[5]
+ smull v1.4s, v17.4h, v0.h[5]
+ smlal v1.4s, v19.4h, v0.h[6]
+ smlal v1.4s, v21.4h, v0.h[7]
+ smlal v1.4s, v23.4h, v0.h[6]
+ smlal v1.4s, v7.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ srshl v8.4s, v8.4s, v27.4s // -round_bits_v
+ srshl v9.4s, v9.4s, v27.4s
+ srshl v1.4s, v1.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v8.4h, v8.4s
+ sqxtun2 v8.8h, v9.4s
+ sqxtun v9.4h, v1.4s
+ sqxtun2 v9.8h, v5.4s
+ st1 {v6.8h, v7.8h}, [x15], #32 // keep the new intermediate row in t0
+ umin v8.8h, v8.8h, v28.8h // bitdepth_max
+ umin v9.8h, v9.8h, v28.8h
+
+ subs w4, w4, #16
+
+ st1 {v8.8h, v9.8h}, [x0], #32
+
+ b.le 0f // flags from the subs above
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1 // src += stride
+ add x0, x0, x1 // dst += stride
+
+ ret
+endfunc
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+//
+// For one row, computes the sum of every 3 horizontally adjacent pixels
+// (16-bit results stored to sum) and the sum of their squares (32-bit
+// results stored to sumsq). The row is extended by 2 pixels on the left
+// and w is increased by 2; 8 results are produced per loop iteration.
+// Registers as used here: x0 = sumsq, x1 = sum, x2 = left, x3 = src,
+// w4 = w, w5 = edges. x13 is used as scratch.
+function sgr_box3_row_h_16bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #4 // start 2 pixels to the left of src
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v2.d}[1], [x2] // 4 left pixels into the upper half of v2
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ b 2f
+
+1:
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 2x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+
+2:
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ // x3 points at v0/v1.h[16] and w4 already includes the +2, so this
+ // reads v0/v1.h[w-1], the last valid pixel.
+ sub w13, w4, #(2 + 16 - 2 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ // Fill v30 with the right padding pixel
+ dup v30.8h, v30.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0/v1. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0/v1.h[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w4, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+
+4: // Loop horizontally
+ // v0, v26 and v27 hold the pixels at offsets 0, 1 and 2.
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+
+ subs w4, w4, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+
+ b.le 9f // flags from the subs above
+ tst w5, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ ld1 {v1.8h}, [x3], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+// Horizontal 5-pixel box sums for one row of 16 bpc pixels: stores sums to sum and sums of squares to sumsq.
+// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box5_row_h_16bpc_neon, export=1
+ add w4, w4, #2 // w += 2
+
+ tst w5, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x2, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x3, x3, #6 // The 3 pixels (6 bytes) before src are read directly from memory
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v2.d}[1], [x2], #8 // 4 left pixels into the upper half of v2
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10 // v0 = last 3 left pixels, then the first 5 src pixels
+ b 2f
+
+1:
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+
+2:
+ tst w5, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w4, #(2 + 16 - 3 + 1) // Pixel index of the padding pixel relative to x3
+ ldr h30, [x3, w13, sxtw #1] // Index is scaled by 2 bytes per pixel
+ // Fill v30 with the right padding pixel
+ dup v30.8h, v30.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer. NOTE(review): the -1 is in bytes while pixels are 2 bytes here; confirm whether -2 was meant.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w4, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b // Take the padding pixel wherever the mask bits are set
+ bit v1.16b, v30.16b, v29.16b
+
+4: // Loop horizontally
+ ext v26.16b, v0.16b, v1.16b, #2 // Pixels at x+1
+ ext v27.16b, v0.16b, v1.16b, #4 // Pixels at x+2
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h // Squares, lower 4 outputs
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v6.8h, v6.8h, v27.8h // Sum of the first 3 pixels
+ umull2 v23.4s, v0.8h, v0.8h // Squares, upper 4 outputs
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+
+ ext v26.16b, v0.16b, v1.16b, #6 // Pixels at x+3
+ ext v27.16b, v0.16b, v1.16b, #8 // Pixels at x+4
+
+ add v6.8h, v6.8h, v26.8h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v6.8h, v6.8h, v27.8h // Sum of all 5 pixels
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+
+ subs w4, w4, #8 // 8 outputs per iteration
+
+ st1 {v6.8h}, [x1], #16 // sum
+ st1 {v22.4s,v23.4s}, [x0], #32 // sumsq
+
+ b.le 9f
+ tst w5, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ ld1 {v1.8h}, [x3], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+// Horizontal 3-pixel and 5-pixel box sums for one row of 16 bpc pixels, computed in a single pass.
+// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
+// int32_t *sumsq5, int16_t *sum5,
+// const pixel (*left)[4],
+// const pixel *src, const int w,
+// const enum LrEdgeFlags edges);
+function sgr_box35_row_h_16bpc_neon, export=1
+ add w6, w6, #2 // w += 2
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ cbnz x4, 0f
+
+ // LR_HAVE_LEFT && left == NULL
+ sub x5, x5, #6 // The 3 pixels (6 bytes) before src are read directly from memory
+ ld1 {v0.8h, v1.8h}, [x5], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v0.8h, v1.8h}, [x5], #32
+ ld1 {v2.d}[1], [x4], #8 // 4 left pixels into the upper half of v2
+ // Move x5 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x5, x5, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10 // v0 = last 3 left pixels, then the first 5 src pixels
+ b 2f
+
+1:
+ ld1 {v0.8h, v1.8h}, [x5], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ // Move x5 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x5, x5, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+
+2:
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w6, #(2 + 16 - 3 + 1) // Pixel index of the padding pixel relative to x5
+ ldr h30, [x5, w13, sxtw #1] // Index is scaled by 2 bytes per pixel
+ // Fill v30 with the right padding pixel
+ dup v30.8h, v30.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w6, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer. NOTE(review): the -1 is in bytes while pixels are 2 bytes here; confirm whether -2 was meant.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w6, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b // Take the padding pixel wherever the mask bits are set
+ bit v1.16b, v30.16b, v29.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v1.16b, #2 // Pixels at x+1
+ ext v17.16b, v0.16b, v1.16b, #4 // Pixels at x+2
+ ext v19.16b, v0.16b, v1.16b, #8 // Pixels at x+4
+ ext v18.16b, v0.16b, v1.16b, #6 // Pixels at x+3
+
+ add v20.8h, v16.8h, v17.8h
+ add v21.8h, v0.8h, v19.8h // Outer two pixels of the 5-pixel window
+ add v20.8h, v20.8h, v18.8h // 3-pixel sum (x+1 .. x+3)
+
+ umull v22.4s, v16.4h, v16.4h
+ umlal v22.4s, v17.4h, v17.4h
+ umlal v22.4s, v18.4h, v18.4h
+
+ umull2 v23.4s, v16.8h, v16.8h
+ umlal2 v23.4s, v17.8h, v17.8h
+ umlal2 v23.4s, v18.8h, v18.8h
+
+ add v21.8h, v21.8h, v20.8h // 5-pixel sum (x .. x+4)
+ st1 {v20.8h}, [x1], #16 // sum3
+ st1 {v22.4s,v23.4s}, [x0], #32 // sumsq3
+
+ umlal v22.4s, v0.4h, v0.4h // Extend the 3-pixel squares with the two outer pixels
+ umlal v22.4s, v19.4h, v19.4h
+
+ umlal2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v19.8h, v19.8h
+
+ subs w6, w6, #8 // 8 outputs per iteration
+
+ st1 {v21.8h}, [x3], #16 // sum5
+ st1 {v22.4s,v23.4s}, [x2], #32 // sumsq5
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ ld1 {v1.8h}, [x5], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ ret
+endfunc
+
+sgr_funcs 16
diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S
new file mode 100644
index 0000000000..745f6c20f4
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_common.S
@@ -0,0 +1,272 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+// Sums 3 rows of horizontal box sums and turns them into the AA/BB coefficient rows, 8 columns per iteration.
+// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
+// int32_t *AA, int16_t *BB,
+// const int w, const int s,
+// const int bitdepth_max);
+function sgr_box3_vert_neon, export=1
+ stp d8, d9, [sp, #-0x30]! // Save d8-d13; v8-v13 are used below
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+
+ add w4, w4, #2 // w += 2
+ clz w9, w6 // bitdepth_max
+ dup v28.4s, w5 // strength
+
+ ldp x5, x6, [x0] // sumsq[0], sumsq[1]
+ ldr x0, [x0, #16] // sumsq[2]
+ ldp x7, x8, [x1] // sum[0], sum[1]
+ ldr x1, [x1, #16] // sum[2]
+
+ movi v31.4s, #9 // n
+
+ sub w9, w9, #24 // -bitdepth_min_8
+ movrel x12, X(sgr_x_by_x)
+ mov w13, #455 // one_by_x
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12] // First 48 table bytes, used by the tbl lookup
+ dup v6.8h, w9 // -bitdepth_min_8
+ movi v19.16b, #5
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
+ movi v29.8h, #1, lsl #8 // 256
+ dup v30.4s, w13 // one_by_x
+
+ sub v16.16b, v16.16b, v19.16b // Bias the table by -5; 5 is added back after the lookup
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+
+ ld1 {v8.4s, v9.4s}, [x5], #32
+ ld1 {v10.4s, v11.4s}, [x6], #32
+ ld1 {v12.8h}, [x7], #16
+ ld1 {v13.8h}, [x8], #16
+ ld1 {v0.4s, v1.4s}, [x0], #32
+ ld1 {v2.8h}, [x1], #16
+1:
+
+ add v8.4s, v8.4s, v10.4s // sumsq[0] + sumsq[1]
+ add v9.4s, v9.4s, v11.4s
+
+ add v12.8h, v12.8h, v13.8h // sum[0] + sum[1]
+
+ subs w4, w4, #8 // 8 columns per iteration
+ add v0.4s, v0.4s, v8.4s // a = total of the 3 sumsq rows
+ add v1.4s, v1.4s, v9.4s
+ add v2.8h, v2.8h, v12.8h // b = total of the 3 sum rows
+
+ srshl v0.4s, v0.4s, v7.4s // Rounding right shift by 2*bitdepth_min_8
+ srshl v1.4s, v1.4s, v7.4s
+ srshl v4.8h, v2.8h, v6.8h // Rounding right shift by bitdepth_min_8
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v4.4h, v4.4h // b * b
+ umull2 v4.4s, v4.8h, v4.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ ld1 {v8.4s, v9.4s}, [x5], #32 // Preload sumsq[0] for the next iteration
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ ld1 {v10.4s, v11.4s}, [x6], #32 // Preload sumsq[1] for the next iteration
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ ld1 {v12.8h}, [x7], #16 // Preload sum[0] for the next iteration
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v4.8b
+ add v5.8b, v5.8b, v19.8b // Add back the 5 subtracted from the table
+ add v25.8b, v25.8b, v27.8b
+ add v5.8b, v1.8b, v5.8b
+ ld1 {v13.8h}, [x8], #16 // Preload sum[1] for the next iteration
+ add v5.8b, v5.8b, v25.8b // Lookup result plus the threshold corrections
+ ld1 {v0.4s, v1.4s}, [x0], #32 // Preload sumsq[2] for the next iteration
+ uxtl v5.8h, v5.8b // x
+
+ umull v3.4s, v5.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v5.8h, v29.8h, v5.8h // 256 - x
+ ld1 {v2.8h}, [x1], #16 // Preload sum[2] for the next iteration
+
+ st1 {v3.4s, v4.4s}, [x2], #32 // Store to AA
+ st1 {v5.8h}, [x3], #16 // Store 256 - x to BB
+ b.gt 1b
+
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x30
+ ret
+endfunc
+// Sums 5 rows of horizontal box sums and turns them into the AA/BB coefficient rows, 8 columns per iteration.
+// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
+// int32_t *AA, int16_t *BB,
+// const int w, const int s,
+// const int bitdepth_max);
+function sgr_box5_vert_neon, export=1
+ stp d8, d9, [sp, #-0x40]! // Save d8-d15; v8-v15 are used below
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ add w4, w4, #2 // w += 2
+ clz w15, w6 // bitdepth_max
+ dup v28.4s, w5 // strength
+
+ ldp x5, x6, [x0] // sumsq[0], sumsq[1]
+ ldp x7, x8, [x0, #16] // sumsq[2], sumsq[3]
+ ldr x0, [x0, #32] // sumsq[4]
+ ldp x9, x10, [x1] // sum[0], sum[1]
+ ldp x11, x12, [x1, #16] // sum[2], sum[3]
+ ldr x1, [x1, #32] // sum[4]
+
+ movi v31.4s, #25 // n
+
+ sub w15, w15, #24 // -bitdepth_min_8
+ movrel x13, X(sgr_x_by_x)
+ mov w14, #164 // one_by_x
+ ld1 {v16.16b, v17.16b, v18.16b}, [x13] // First 48 table bytes, used by the tbl lookup
+ dup v6.8h, w15 // -bitdepth_min_8
+ movi v19.16b, #5
+ movi v24.8b, #254 // idx of last 1
+ saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
+ movi v29.8h, #1, lsl #8 // 256
+ dup v30.4s, w14 // one_by_x
+
+ sub v16.16b, v16.16b, v19.16b // Bias the table by -5; 5 is added back after the lookup
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+
+ ld1 {v8.4s, v9.4s}, [x5], #32
+ ld1 {v10.4s, v11.4s}, [x6], #32
+ ld1 {v12.4s, v13.4s}, [x7], #32
+ ld1 {v14.4s, v15.4s}, [x8], #32
+ ld1 {v20.8h}, [x9], #16
+ ld1 {v21.8h}, [x10], #16
+ ld1 {v22.8h}, [x11], #16
+ ld1 {v23.8h}, [x12], #16
+ ld1 {v0.4s, v1.4s}, [x0], #32
+ ld1 {v2.8h}, [x1], #16
+
+1:
+ add v8.4s, v8.4s, v10.4s // sumsq[0] + sumsq[1]
+ add v9.4s, v9.4s, v11.4s
+ add v12.4s, v12.4s, v14.4s // sumsq[2] + sumsq[3]
+ add v13.4s, v13.4s, v15.4s
+
+ add v20.8h, v20.8h, v21.8h // sum[0] + sum[1]
+ add v22.8h, v22.8h, v23.8h // sum[2] + sum[3]
+
+ add v0.4s, v0.4s, v8.4s
+ add v1.4s, v1.4s, v9.4s
+ add v2.8h, v2.8h, v20.8h
+
+ add v0.4s, v0.4s, v12.4s // a = total of the 5 sumsq rows
+ add v1.4s, v1.4s, v13.4s
+ add v2.8h, v2.8h, v22.8h // b = total of the 5 sum rows
+
+ subs w4, w4, #8 // 8 columns per iteration
+
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ // v20-v23 held sum rows above, so these thresholds are set again on every iteration.
+ srshl v0.4s, v0.4s, v7.4s // Rounding right shift by 2*bitdepth_min_8
+ srshl v1.4s, v1.4s, v7.4s
+ srshl v4.8h, v2.8h, v6.8h // Rounding right shift by bitdepth_min_8
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v4.4h, v4.4h // b * b
+ umull2 v4.4s, v4.8h, v4.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ ld1 {v8.4s, v9.4s}, [x5], #32 // Preload sumsq[0] for the next iteration
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ ld1 {v10.4s, v11.4s}, [x6], #32 // Preload sumsq[1] for the next iteration
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ ld1 {v12.4s, v13.4s}, [x7], #32 // Preload sumsq[2] for the next iteration
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ ld1 {v14.4s, v15.4s}, [x8], #32 // Preload sumsq[3] for the next iteration
+ add v25.8b, v25.8b, v26.8b
+ cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v4.8b
+ ld1 {v20.8h}, [x9], #16 // Preload sum[0] for the next iteration
+ add v5.8b, v5.8b, v19.8b // Add back the 5 subtracted from the table
+ add v25.8b, v25.8b, v27.8b
+ ld1 {v21.8h}, [x10], #16 // Preload sum[1] for the next iteration
+ add v5.8b, v1.8b, v5.8b
+ ld1 {v22.8h}, [x11], #16 // Preload sum[2] for the next iteration
+ add v5.8b, v5.8b, v25.8b // Lookup result plus the threshold corrections
+ ld1 {v23.8h}, [x12], #16 // Preload sum[3] for the next iteration
+ uxtl v5.8h, v5.8b // x
+
+ ld1 {v0.4s, v1.4s}, [x0], #32 // Preload sumsq[4] for the next iteration
+ umull v3.4s, v5.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v5.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v5.8h, v29.8h, v5.8h // 256 - x
+ ld1 {v2.8h}, [x1], #16 // Preload sum[4] for the next iteration
+
+ st1 {v3.4s, v4.4s}, [x2], #32 // Store to AA
+ st1 {v5.8h}, [x3], #16 // Store 256 - x to BB
+ b.gt 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
new file mode 100644
index 0000000000..1373f9ace3
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
@@ -0,0 +1,751 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
+// const pixel *src,
+// const ptrdiff_t src_stride,
+// const int32_t **a,
+// const int16_t **b,
+// const int w, const int h);
+function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ // Filters two rows: 3x3 weighted sums (x4 for mid taps, x3 for corners), then (b + a * src) >> 9 with rounding.
+ ldp x7, x8, [x3] // 32-bit rows 0 and 1
+ ldp x9, x3, [x3, #16] // 32-bit rows 2 and 3
+ ldp x10, x11, [x4] // 16-bit rows 0 and 1
+ ldp x12, x4, [x4, #16] // 16-bit rows 2 and 3
+
+ mov x13, #FILTER_OUT_STRIDE
+ cmp w6, #1
+ add x2, x1, x2 // src + stride
+ csel x2, x1, x2, le // if (h <= 1) x2 = x1
+ add x13, x0, x13, lsl #1 // Second output row: tmp + FILTER_OUT_STRIDE elements
+ // In the comments below, a is the weighted sum of the 16-bit rows and b that of the 32-bit rows.
+ movi v30.8h, #3
+ movi v31.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x10], #32
+ ld1 {v2.8h, v3.8h}, [x11], #32
+ ld1 {v4.8h, v5.8h}, [x12], #32
+ ld1 {v6.8h, v7.8h}, [x4], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
+ ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48
+
+2:
+ ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
+ ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
+ ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
+ ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
+ ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
+ ext v13.16b, v4.16b, v5.16b, #4 // [2][2]
+
+ add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
+ add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]
+
+ add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
+ add v14.8h, v14.8h, v12.8h // () + [1][2]
+ add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]
+
+ ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
+ ext v11.16b, v6.16b, v7.16b, #4 // [3][2]
+
+ add v14.8h, v14.8h, v15.8h // mid
+ add v15.8h, v28.8h, v29.8h // corners
+
+ add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
+ add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]
+
+ add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
+ add v28.8h, v28.8h, v13.8h // () + [2][2]
+ add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]
+
+ add v0.8h, v28.8h, v29.8h // mid
+ add v2.8h, v2.8h, v4.8h // corners
+
+ shl v4.8h, v14.8h, #2 // mid * 4 (first row)
+ mla v4.8h, v15.8h, v30.8h // * 3 -> a
+
+ shl v0.8h, v0.8h, #2 // mid * 4 (second row)
+ mla v0.8h, v2.8h, v30.8h // * 3 -> a
+
+ ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
+ ext v9.16b, v17.16b, v18.16b, #4
+ ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
+ ext v11.16b, v17.16b, v18.16b, #8
+ ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
+ ext v13.16b, v20.16b, v21.16b, #4
+ add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
+ add v9.4s, v9.4s, v20.4s
+ add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
+ add v17.4s, v17.4s, v11.4s
+ ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
+ ext v15.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // () + [2][0]
+ add v17.4s, v17.4s, v23.4s
+ add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
+ add v29.4s, v13.4s, v15.4s
+ ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
+ ext v11.16b, v23.16b, v24.16b, #4
+ add v8.4s, v8.4s, v28.4s // mid (incomplete)
+ add v9.4s, v9.4s, v29.4s
+
+ add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
+ add v20.4s, v20.4s, v15.4s
+ add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
+ add v15.4s, v23.4s, v13.4s
+
+ ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
+ ext v13.16b, v23.16b, v24.16b, #8
+ ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
+ ext v29.16b, v26.16b, v27.16b, #4
+ add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
+ add v9.4s, v9.4s, v11.4s
+ add v14.4s, v14.4s, v10.4s // () + [2][1]
+ add v15.4s, v15.4s, v11.4s
+ ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
+ ext v11.16b, v26.16b, v27.16b, #8
+ add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
+ add v17.4s, v17.4s, v13.4s
+
+ add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
+ add v13.4s, v13.4s, v29.4s
+ add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
+ add v26.4s, v26.4s, v11.4s
+
+ add v14.4s, v14.4s, v12.4s // mid
+ add v15.4s, v15.4s, v13.4s
+ add v19.4s, v19.4s, v25.4s // corner
+ add v20.4s, v20.4s, v26.4s
+
+.if \bpc == 8
+ ld1 {v25.8b}, [x1], #8 // src
+ ld1 {v26.8b}, [x2], #8
+.else
+ ld1 {v25.8h}, [x1], #16 // src
+ ld1 {v26.8h}, [x2], #16
+.endif
+
+ shl v8.4s, v8.4s, #2 // mid * 4 (first row)
+ shl v9.4s, v9.4s, #2
+ mla v8.4s, v16.4s, v31.4s // * 3 -> b
+ mla v9.4s, v17.4s, v31.4s
+
+.if \bpc == 8
+ uxtl v25.8h, v25.8b // src
+ uxtl v26.8h, v26.8b
+.endif
+
+ shl v14.4s, v14.4s, #2 // mid * 4 (second row)
+ shl v15.4s, v15.4s, #2
+ mla v14.4s, v19.4s, v31.4s // * 3 -> b
+ mla v15.4s, v20.4s, v31.4s
+
+ umlal v8.4s, v4.4h, v25.4h // b + a * src
+ umlal2 v9.4s, v4.8h, v25.8h
+ umlal v14.4s, v0.4h, v26.4h // b + a * src
+ umlal2 v15.4s, v0.8h, v26.8h
+ mov v0.16b, v1.16b // Slide the 16-bit rows along for the next 8 columns
+ rshrn v8.4h, v8.4s, #9
+ rshrn2 v8.8h, v9.4s, #9
+ mov v2.16b, v3.16b
+ rshrn v14.4h, v14.4s, #9
+ rshrn2 v14.8h, v15.4s, #9
+ subs w5, w5, #8 // 8 columns per iteration
+ mov v4.16b, v5.16b
+ st1 {v8.8h}, [x0], #16 // First output row
+ mov v6.16b, v7.16b
+ st1 {v14.8h}, [x13], #16 // Second output row
+
+ b.le 3f
+ mov v16.16b, v18.16b // Slide the 32-bit rows along for the next 8 columns
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ mov v25.16b, v27.16b
+ ld1 {v1.8h}, [x10], #16
+ ld1 {v3.8h}, [x11], #16
+ ld1 {v5.8h}, [x12], #16
+ ld1 {v7.8h}, [x4], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x8], #32
+ ld1 {v23.4s, v24.4s}, [x9], #32
+ ld1 {v26.4s, v27.4s}, [x3], #32
+ b 2b
+
+3:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+// One row of the 3x3 filter (x4 mid taps, x3 corners) followed by the w1-weighted blend, done in place on dst.
+// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
+// const int32_t **a, const int16_t **b,
+// const int w, const int w1,
+// const int bitdepth_max);
+function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
+ ldp x7, x8, [x1] // 32-bit rows 0 and 1
+ ldr x1, [x1, #16] // 32-bit row 2
+ ldp x9, x10, [x2] // 16-bit rows 0 and 1
+ ldr x2, [x2, #16] // 16-bit row 2
+
+ dup v31.8h, w4 // w1
+ dup v30.8h, w5 // bitdepth_max, only used by the 16 bpc clamp below
+
+ movi v6.8h, #3
+ movi v7.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x9], #32
+ ld1 {v2.8h, v3.8h}, [x10], #32
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48
+
+2:
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h
+ add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h // Sum of the 4 corners
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2 // Mid taps * 4
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v4.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // * 3 -> a
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v4.4s, v4.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v4.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v4.16b, v23.16b, v24.16b, #8
+.if \bpc == 8
+ ld1 {v19.8b}, [x0] // src
+.else
+ ld1 {v19.8h}, [x0] // src
+.endif
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v4.4s
+ shl v25.4s, v25.4s, #2 // Mid taps * 4
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b
+ mla v26.4s, v17.4s, v7.4s
+.if \bpc == 8
+ uxtl v19.8h, v19.8b // src
+.endif
+ mov v0.16b, v1.16b
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9 // t1 = (b + a * src) >> 9, rounded
+ rshrn2 v25.8h, v26.4s, #9
+
+ subs w3, w3, #8 // 8 pixels per iteration
+
+ // weighted1
+ shl v19.8h, v19.8h, #4 // u
+ mov v4.16b, v5.16b
+
+ sub v25.8h, v25.8h, v19.8h // t1 - u
+ ld1 {v1.8h}, [x9], #16
+ ushll v26.4s, v19.4h, #7 // u << 7
+ ushll2 v27.4s, v19.8h, #7 // u << 7
+ ld1 {v3.8h}, [x10], #16
+ smlal v26.4s, v25.4h, v31.4h // v
+ smlal2 v27.4s, v25.8h, v31.8h // v
+ ld1 {v5.8h}, [x2], #16
+.if \bpc == 8
+ rshrn v26.4h, v26.4s, #11
+ rshrn2 v26.8h, v27.4s, #11
+ mov v16.16b, v18.16b
+ sqxtun v26.8b, v26.8h // Saturate to unsigned 8 bit
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ st1 {v26.8b}, [x0], #8
+.else
+ sqrshrun v26.4h, v26.4s, #11
+ sqrshrun2 v26.8h, v27.4s, #11
+ mov v16.16b, v18.16b
+ umin v26.8h, v26.8h, v30.8h // Clamp to bitdepth_max
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ st1 {v26.8h}, [x0], #16
+.endif
+
+ b.le 3f
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x8], #32
+ ld1 {v23.4s, v24.4s}, [x1], #32
+ b 2b
+
+3:
+ ret
+endfunc
+// Filters two rows from two coefficient rows using weights 5 and 6; first row is shifted by 9, second by 8.
+// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
+// const pixel *src,
+// const ptrdiff_t stride,
+// const int32_t **a,
+// const int16_t **b,
+// const int w, const int h);
+function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ ldp x3, x7, [x3] // 32-bit rows 0 and 1
+ ldp x4, x8, [x4] // 16-bit rows 0 and 1
+ mov x10, #FILTER_OUT_STRIDE
+ cmp w6, #1
+ add x2, x1, x2 // src + stride
+ csel x2, x1, x2, le // if (h <= 1) x2 = x1
+ add x10, x0, x10, lsl #1 // Second output row: tmp + FILTER_OUT_STRIDE elements
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h // All 4 diagonal taps
+
+ mul v8.8h, v25.8h, v4.8h // * 5
+ mla v8.8h, v23.8h, v6.8h // * 6
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+.if \bpc == 8
+ ld1 {v31.8b}, [x1], #8
+ ld1 {v30.8b}, [x2], #8
+.else
+ ld1 {v31.8h}, [x1], #16
+ ld1 {v30.8h}, [x2], #16
+.endif
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s // All 4 diagonal taps
+ add v17.4s, v17.4s, v20.4s
+
+ mul v9.4s, v19.4s, v5.4s // * 5
+ mla v9.4s, v24.4s, v7.4s // * 6
+ mul v10.4s, v20.4s, v5.4s // * 5
+ mla v10.4s, v25.4s, v7.4s // * 6
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+ uxtl v30.8h, v30.8b
+.endif
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ umlal v9.4s, v8.4h, v30.4h // b + a * src
+ umlal2 v10.4s, v8.8h, v30.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9 // First row: rounding shift by 9
+ rshrn2 v16.8h, v17.4s, #9
+ rshrn v9.4h, v9.4s, #8 // Second row: rounding shift by 8
+ rshrn2 v9.8h, v10.4s, #8
+ subs w5, w5, #8 // 8 columns per iteration
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16 // First output row
+ st1 {v9.8h}, [x10], #16 // Second output row
+
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+9:
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret
+endfunc
+// Two rows of the 5/6-weighted filter followed by the w1-weighted blend, done in place on dst and dst + stride.
+// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const int32_t **a,
+// const int16_t **b,
+// const int w, const int h,
+// const int w1,
+// const int bitdepth_max);
+function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
+ stp d8, d9, [sp, #-0x30]! // Save d8-d10, d14 and d15; v8-v10, v14 and v15 are used below
+ str d10, [sp, #0x10]
+ stp d14, d15, [sp, #0x20]
+
+ dup v14.8h, w6 // w1
+ dup v15.8h, w7 // bitdepth_max, only used by the 16 bpc clamp below
+
+ ldp x2, x7, [x2] // 32-bit rows 0 and 1
+ ldp x3, x8, [x3] // 16-bit rows 0 and 1
+ cmp w5, #1
+ add x1, x0, x1 // src + stride
+ // if (h <= 1), set the pointer to the second row to any dummy buffer
+ // we can clobber (x2 in this case)
+ csel x1, x2, x1, le
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h // All 4 diagonal taps
+
+ mul v8.8h, v25.8h, v4.8h // * 5
+ mla v8.8h, v23.8h, v6.8h // * 6
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+.if \bpc == 8
+ ld1 {v31.8b}, [x0]
+ ld1 {v30.8b}, [x1]
+.else
+ ld1 {v31.8h}, [x0]
+ ld1 {v30.8h}, [x1]
+.endif
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s // All 4 diagonal taps
+ add v17.4s, v17.4s, v20.4s
+
+ mul v9.4s, v19.4s, v5.4s // * 5
+ mla v9.4s, v24.4s, v7.4s // * 6
+ mul v10.4s, v20.4s, v5.4s // * 5
+ mla v10.4s, v25.4s, v7.4s // * 6
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+ uxtl v30.8h, v30.8b
+.endif
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ umlal v9.4s, v8.4h, v30.4h // b + a * src
+ umlal2 v10.4s, v8.8h, v30.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9 // First row: rounding shift by 9
+ rshrn2 v16.8h, v17.4s, #9
+ rshrn v9.4h, v9.4s, #8 // Second row: rounding shift by 8
+ rshrn2 v9.8h, v10.4s, #8
+
+ subs w4, w4, #8 // 8 pixels per iteration
+
+ // weighted1
+ shl v31.8h, v31.8h, #4 // u
+ shl v30.8h, v30.8h, #4
+ mov v2.16b, v3.16b
+
+ sub v16.8h, v16.8h, v31.8h // t1 - u
+ sub v9.8h, v9.8h, v30.8h
+ ld1 {v1.8h}, [x3], #16
+ ushll v22.4s, v31.4h, #7 // u << 7
+ ushll2 v23.4s, v31.8h, #7
+ ushll v24.4s, v30.4h, #7
+ ushll2 v25.4s, v30.8h, #7
+ ld1 {v3.8h}, [x8], #16
+ smlal v22.4s, v16.4h, v14.4h // v
+ smlal2 v23.4s, v16.8h, v14.8h
+ mov v16.16b, v18.16b
+ smlal v24.4s, v9.4h, v14.4h
+ smlal2 v25.4s, v9.8h, v14.8h
+ mov v19.16b, v21.16b
+.if \bpc == 8
+ rshrn v22.4h, v22.4s, #11
+ rshrn2 v22.8h, v23.4s, #11
+ rshrn v23.4h, v24.4s, #11
+ rshrn2 v23.8h, v25.4s, #11
+ sqxtun v22.8b, v22.8h // Saturate to unsigned 8 bit
+ sqxtun v23.8b, v23.8h
+ st1 {v22.8b}, [x0], #8
+ st1 {v23.8b}, [x1], #8
+.else
+ sqrshrun v22.4h, v22.4s, #11
+ sqrshrun2 v22.8h, v23.4s, #11
+ sqrshrun v23.4h, v24.4s, #11
+ sqrshrun2 v23.8h, v25.4s, #11
+ umin v22.8h, v22.8h, v15.8h // Clamp to bitdepth_max
+ umin v23.8h, v23.8h, v15.8h
+ st1 {v22.8h}, [x0], #16
+ st1 {v23.8h}, [x1], #16
+.endif
+
+ b.le 3f
+ ld1 {v17.4s, v18.4s}, [x2], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3:
+ ldp d14, d15, [sp, #0x20]
+ ldr d10, [sp, #0x10]
+ ldp d8, d9, [sp], 0x30
+ ret
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
+//
+// Per pixel: u = src << 4,
+// v = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u),
+// and the stored pixel is v shifted right by 11 with rounding and narrowed
+// with unsigned saturation (16 bpc additionally clamps to bitdepth_max).
+// Loop 1 handles two rows per pass, loop 2 handles one remaining row; both
+// step 8 pixels per iteration, so the width is effectively rounded up to a
+// multiple of 8.
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+.if \bpc == 8
+ ldr x8, [sp] // wt
+.else
+ ldp x8, x9, [sp] // wt, bitdepth_max
+.endif
+ cmp w7, #2 // flags are consumed by the b.lt just before loop 1
+ add x10, x0, x1 // dst, second row
+ add x11, x2, x3 // src, second row
+ add x12, x4, #2*FILTER_OUT_STRIDE // t1, second row
+ add x13, x5, #2*FILTER_OUT_STRIDE // t2, second row
+ ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+.if \bpc == 16
+ dup v29.8h, w9 // bitdepth_max in every lane
+.endif
+ mov x8, #4*FILTER_OUT_STRIDE // bytes per two rows of t1/t2
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+.if \bpc == 8
+ sub x1, x1, x9 // 2*stride minus the bytes already stepped over
+ sub x3, x3, x9
+.else
+ sub x1, x1, x9, lsl #1
+ sub x3, x3, x9, lsl #1
+.endif
+ sub x8, x8, x9, lsl #1
+ mov w9, w6 // keep w to reload the column counter per row
+ b.lt 2f // h < 2: only the single-row loop
+1:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v16.8b}, [x11], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+ ld1 {v16.8h}, [x11], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.8h}, [x12], #16
+ ld1 {v2.8h}, [x5], #16
+ ld1 {v18.8h}, [x13], #16
+ subs w6, w6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+ shl v16.8h, v16.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ sqrshrun v19.4h, v19.4s, #11
+ sqrshrun2 v19.8h, v20.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ umin v19.8h, v19.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+ st1 {v19.8h}, [x10], #16
+.endif
+ b.gt 1b // more columns in this row pair
+
+ subs w7, w7, #2
+ cmp w7, #1
+ b.lt 0f // no rows left
+ mov w6, w9
+ add x0, x0, x1
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f // exactly one row left (flags still from the cmp above)
+ b 1b
+
+2:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v2.8h}, [x5], #16
+ subs w6, w6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h
+ st1 {v3.8b}, [x0], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+.endif
+ b.gt 2b // stay in the one-row loop; loop 1 would also touch the row below
+0:
+ ret
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
new file mode 100644
index 0000000000..9f7b4e7a89
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -0,0 +1,3310 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// avg: load 16 int16 values from each of [x2] and [x3] (both pointers are
+// post-incremented by 32 bytes), add them lane by lane and narrow the sums
+// with a saturating rounding shift right by 5 into the 16 unsigned bytes
+// of \dst. \t0-\t3 are scratch registers.
+.macro avg dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ add \t0\().8h, \t0\().8h, \t2\().8h
+ add \t1\().8h, \t1\().8h, \t3\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #5
+ sqrshrun2 \dst\().16b, \t1\().8h, #5
+.endm
+
+// w_avg: with a = 16 int16 values from [x2] and b = 16 int16 values from [x3]
+// (both post-incremented by 32 bytes), compute b + sqdmulh(b - a, v30) and
+// narrow with a saturating rounding shift right by 4 into the 16 unsigned
+// bytes of \dst. v30 must be set up by the user of the macro (bidir_fn loads
+// it with (-w6) << 11). \t0-\t3 are scratch registers.
+.macro w_avg dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sub \t0\().8h, \t2\().8h, \t0\().8h // b - a
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v30.8h // ((b - a) * v30 * 2) >> 16
+ sqdmulh \t1\().8h, \t1\().8h, v30.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
+// mask: like w_avg, but with a per-pixel factor. 16 mask bytes are loaded
+// from [x6] (post-incremented by 16), multiplied bytewise by v31 and widened
+// with a left shift by 8 into v28/v29, which then take the place of the
+// constant factor in the sqdmulh. v31 must be set up by the user of the
+// macro (bidir_fn loads it with #256-2, i.e. -2 as a byte).
+// Clobbers v28, v29 and v30 in addition to \t0-\t3.
+.macro mask dst, t0, t1, t2, t3
+ ld1 {v30.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ mul v30.16b, v30.16b, v31.16b // mask * v31 (low 8 bits)
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ shll v28.8h, v30.8b, #8 // widen to 16 bit, << 8
+ shll2 v29.8h, v30.16b, #8
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v28.8h
+ sqdmulh \t1\().8h, \t1\().8h, v29.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
+// bidir_fn: emits the exported function \type\()_8bpc_neon, where \type names
+// one of the macros avg, w_avg or mask; each expansion of that macro yields
+// 16 output bytes.
+// Register use visible in this body: x0 = output pointer, x1 = output stride,
+// w4 = value used to pick the code path (presumably the width), w5 = row
+// counter, w6 = weight (w_avg only); x2, x3 and x6 are read by the \type
+// macros themselves.
+// Dispatch: index = clz(w4) - 24 into a table of 16-bit offsets that are
+// subtracted from the table address, so w4 == 128 selects the first entry
+// (1280) and w4 == 4 the last (40).
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ clz w4, w4
+.ifc \type, w_avg
+ dup v30.8h, w6
+ neg v30.8h, v30.8h
+ shl v30.8h, v30.8h, #11 // v30 = (-w6) << 11, the factor used by w_avg
+.endif
+.ifc \type, mask
+ movi v31.16b, #256-2
+.endif
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24
+ ldrh w4, [x7, x4, lsl #1]
+ \type v4, v0, v1, v2, v3
+ sub x7, x7, w4, uxtw
+ br x7
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1 // second output row
+ lsl x1, x1, #1 // x0 and x7 each advance two rows per store
+4:
+ cmp w5, #4 // w5 is only compared here, never decremented
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x7], x1
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x7], x1
+ b.eq 0f // 4 rows: done
+ \type v5, v0, v1, v2, v3
+ cmp w5, #8
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x7], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x7], x1
+ b.eq 0f // 8 rows: done
+ \type v4, v0, v1, v2, v3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x7], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x7], x1
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x7], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x7], x1
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+8:
+ st1 {v4.d}[0], [x0], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ subs w5, w5, #4 // four 8-byte rows per iteration
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 8b
+16:
+ AARCH64_VALID_JUMP_TARGET
+ \type v5, v0, v1, v2, v3
+ st1 {v4.16b}, [x0], x1
+ \type v6, v0, v1, v2, v3
+ st1 {v5.16b}, [x0], x1
+ \type v7, v0, v1, v2, v3
+ st1 {v6.16b}, [x0], x1
+ subs w5, w5, #4 // four 16-byte rows per iteration
+ st1 {v7.16b}, [x0], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 16b
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+32:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b}, [x0], x1
+ \type v7, v0, v1, v2, v3
+ subs w5, w5, #2 // two 32-byte rows per iteration
+ st1 {v6.16b,v7.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 32b
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+64:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
+ subs w5, w5, #2 // two 64-byte rows per iteration
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 64b
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64 // second 64-byte half of the same row; x1 is not doubled here
+128:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
+ subs w5, w5, #1 // one 128-byte row per iteration
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl):
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 320b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+// Instantiate avg_8bpc_neon, w_avg_8bpc_neon and mask_8bpc_neon.
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
+// w_mask_fn: emits the exported function w_mask_\type\()_8bpc_neon for
+// \type = 444, 422 or 420.
+// Register use visible in this body: x0 = output pixels, x1 = output stride,
+// x2/x3 = two int16 inputs (tmp1/tmp2), w4 = value used for dispatch and as
+// the row width in the wide path, w5 = row counter, x6 = mask output,
+// w7 = value subtracted when forming the 422/420 mask constant
+// (presumably the sign argument - confirm against the C prototype).
+// Per pixel: d = tmp2 - tmp1, n = sat_u16(6903 - |d|) >> 8,
+// pixel = sqrshrun(tmp1 + sqdmulh(n << 9, d), 4).
+// Mask written to x6:
+// 444: 64 - n for every pixel;
+// 422: (129 - w7 - (sum of two horizontally adjacent n)) >> 1;
+// 420: (256 - w7 - (sum of a 2x2 block of n) + 2) >> 2.
+// Dispatch: index = clz(w4) - 24 into a table of 16-bit offsets subtracted
+// from the table address (w4 == 128 -> first entry, w4 == 4 -> last).
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+ clz w8, w4
+ adr x9, L(w_mask_\type\()_tbl)
+ sub w8, w8, #24
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ mov w10, #6903
+ dup v0.8h, w10
+.if \type == 444
+ movi v1.16b, #64
+.elseif \type == 422
+ dup v2.8b, w7
+ movi v3.8b, #129
+ sub v3.8b, v3.8b, v2.8b // 129 - w7
+.elseif \type == 420
+ dup v2.8h, w7
+ movi v3.8h, #1, lsl #8
+ sub v3.8h, v3.8h, v2.8h // 256 - w7
+.endif
+ add x12, x0, x1 // second output row
+ lsl x1, x1, #1
+ br x9
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+ subs w5, w5, #4
+ sub v16.8h, v6.8h, v4.8h // d = tmp2 - tmp1
+ sub v17.8h, v7.8h, v5.8h
+ sabd v18.8h, v4.8h, v6.8h // |d|
+ sabd v19.8h, v5.8h, v7.8h
+ uqsub v18.8h, v0.8h, v18.8h // 6903 - |d|, saturating at 0
+ uqsub v19.8h, v0.8h, v19.8h
+ ushr v18.8h, v18.8h, #8 // n
+ ushr v19.8h, v19.8h, #8
+ shl v20.8h, v18.8h, #9
+ shl v21.8h, v19.8h, #9
+ sqdmulh v20.8h, v20.8h, v16.8h
+ sqdmulh v21.8h, v21.8h, v17.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v5.8h
+ sqrshrun v22.8b, v20.8h, #4
+ sqrshrun v23.8b, v21.8h, #4
+.if \type == 444
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
+ sub v18.16b, v1.16b, v18.16b // 64 - n
+ st1 {v18.16b}, [x6], #16
+.elseif \type == 422
+ addp v18.8h, v18.8h, v19.8h // horizontal pair sums
+ xtn v18.8b, v18.8h
+ uhsub v18.8b, v3.8b, v18.8b // (129 - w7 - sum) >> 1
+ st1 {v18.8b}, [x6], #8
+.elseif \type == 420
+ trn1 v24.2d, v18.2d, v19.2d // rows 0 and 2
+ trn2 v25.2d, v18.2d, v19.2d // rows 1 and 3
+ add v24.8h, v24.8h, v25.8h // vertical pair sums
+ addp v18.8h, v24.8h, v24.8h // horizontal pair sums
+ sub v18.4h, v3.4h, v18.4h
+ rshrn v18.8b, v18.8h, #2
+ st1 {v18.s}[0], [x6], #4
+.endif
+ st1 {v22.s}[0], [x0], x1
+ st1 {v22.s}[1], [x12], x1
+ st1 {v23.s}[0], [x0], x1
+ st1 {v23.s}[1], [x12], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (two rows of 8)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (two rows of 8)
+ subs w5, w5, #2
+ sub v16.8h, v6.8h, v4.8h
+ sub v17.8h, v7.8h, v5.8h
+ sabd v18.8h, v4.8h, v6.8h
+ sabd v19.8h, v5.8h, v7.8h
+ uqsub v18.8h, v0.8h, v18.8h
+ uqsub v19.8h, v0.8h, v19.8h
+ ushr v18.8h, v18.8h, #8
+ ushr v19.8h, v19.8h, #8
+ shl v20.8h, v18.8h, #9
+ shl v21.8h, v19.8h, #9
+ sqdmulh v20.8h, v20.8h, v16.8h
+ sqdmulh v21.8h, v21.8h, v17.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v5.8h
+ sqrshrun v22.8b, v20.8h, #4
+ sqrshrun v23.8b, v21.8h, #4
+.if \type == 444
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
+ sub v18.16b, v1.16b, v18.16b
+ st1 {v18.16b}, [x6], #16
+.elseif \type == 422
+ addp v18.8h, v18.8h, v19.8h
+ xtn v18.8b, v18.8h
+ uhsub v18.8b, v3.8b, v18.8b
+ st1 {v18.8b}, [x6], #8
+.elseif \type == 420
+ add v18.8h, v18.8h, v19.8h // vertical pair sums
+ addp v18.8h, v18.8h, v18.8h // horizontal pair sums
+ sub v18.4h, v3.4h, v18.4h
+ rshrn v18.8b, v18.8h, #2
+ st1 {v18.s}[0], [x6], #4
+.endif
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x12], x1
+ b.gt 8b
+ ret
+1280:
+640:
+320:
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov w11, w4
+ sub x1, x1, w4, uxtw // 2*stride minus the bytes stored per row
+.if \type == 444
+ add x10, x6, w4, uxtw // mask, second row
+.elseif \type == 422
+ add x10, x6, x11, lsr #1 // mask, second row (half width)
+.endif
+ add x9, x3, w4, uxtw #1 // tmp2, second row
+ add x7, x2, w4, uxtw #1 // tmp1, second row
+161:
+ mov w8, w4 // column counter for this row pair
+16:
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v6.8h, v7.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x7], #32
+ ld1 {v18.8h, v19.8h}, [x9], #32
+ subs w8, w8, #16
+ sub v6.8h, v6.8h, v4.8h
+ sub v7.8h, v7.8h, v5.8h
+ sub v18.8h, v18.8h, v16.8h
+ sub v19.8h, v19.8h, v17.8h
+ abs v20.8h, v6.8h
+ abs v21.8h, v7.8h
+ abs v22.8h, v18.8h
+ abs v23.8h, v19.8h
+ uqsub v20.8h, v0.8h, v20.8h
+ uqsub v21.8h, v0.8h, v21.8h
+ uqsub v22.8h, v0.8h, v22.8h
+ uqsub v23.8h, v0.8h, v23.8h
+ ushr v20.8h, v20.8h, #8
+ ushr v21.8h, v21.8h, #8
+ ushr v22.8h, v22.8h, #8
+ ushr v23.8h, v23.8h, #8
+ shl v24.8h, v20.8h, #9
+ shl v25.8h, v21.8h, #9
+ shl v26.8h, v22.8h, #9
+ shl v27.8h, v23.8h, #9
+ sqdmulh v24.8h, v24.8h, v6.8h
+ sqdmulh v25.8h, v25.8h, v7.8h
+ sqdmulh v26.8h, v26.8h, v18.8h
+ sqdmulh v27.8h, v27.8h, v19.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v16.8h
+ add v27.8h, v27.8h, v17.8h
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun v25.8b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun v27.8b, v27.8h, #4
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2
+ uzp1 v21.16b, v22.16b, v23.16b // Ditto
+ sub v20.16b, v1.16b, v20.16b
+ sub v21.16b, v1.16b, v21.16b
+ st1 {v20.16b}, [x6], #16
+ st1 {v21.16b}, [x10], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h
+ addp v21.8h, v22.8h, v23.8h
+ xtn v20.8b, v20.8h
+ xtn v21.8b, v21.8h
+ uhsub v20.8b, v3.8b, v20.8b
+ uhsub v21.8b, v3.8b, v21.8b
+ st1 {v20.8b}, [x6], #8
+ st1 {v21.8b}, [x10], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v22.8h // vertical pair sums
+ add v21.8h, v21.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h // horizontal pair sums
+ sub v20.8h, v3.8h, v20.8h
+ rshrn v20.8b, v20.8h, #2
+ st1 {v20.8b}, [x6], #8
+.endif
+ st1 {v24.8b, v25.8b}, [x0], #16
+ st1 {v26.8b, v27.8b}, [x12], #16
+ b.gt 16b
+ subs w5, w5, #2
+ add x2, x2, w4, uxtw #1 // skip the row the other pointer just handled
+ add x3, x3, w4, uxtw #1
+ add x7, x7, w4, uxtw #1
+ add x9, x9, w4, uxtw #1
+.if \type == 444
+ add x6, x6, w4, uxtw
+ add x10, x10, w4, uxtw
+.elseif \type == 422
+ add x6, x6, x11, lsr #1
+ add x10, x10, x11, lsr #1
+.endif
+ add x0, x0, x1
+ add x12, x12, x1
+ b.gt 161b // flags still from the subs w5 above
+ ret
+L(w_mask_\type\()_tbl):
+ .hword L(w_mask_\type\()_tbl) - 1280b
+ .hword L(w_mask_\type\()_tbl) - 640b
+ .hword L(w_mask_\type\()_tbl) - 320b
+ .hword L(w_mask_\type\()_tbl) - 160b
+ .hword L(w_mask_\type\()_tbl) - 8b
+ .hword L(w_mask_\type\()_tbl) - 4b
+endfunc
+.endm
+
+// Instantiate w_mask_444_8bpc_neon, w_mask_422_8bpc_neon and
+// w_mask_420_8bpc_neon.
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
+// blend_8bpc_neon: in-place blend of the bytes at x0 with the bytes at x2
+// using per-pixel weights m read from x5:
+// dst = (tmp * m + dst * (64 - m) + 32) >> 6.
+// Register use visible in this body: x0 = dst, x1 = dst stride, x2 = second
+// source (read contiguously), w3 = value used for dispatch (presumably the
+// width), w4 = row counter, x5 = mask (read contiguously).
+// Dispatch: index = clz(w3) - 26, so w3 == 32 selects the first table entry
+// and w3 == 4 the last. Every path handles two rows per iteration.
+function blend_8bpc_neon, export=1
+ adr x6, L(blend_tbl)
+ clz w3, w3
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ movi v4.16b, #64
+ add x8, x0, x1 // second dst row
+ lsl x1, x1, #1
+ br x6
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8b}, [x5], #8 // m, two rows of 4
+ ld1 {v1.d}[0], [x2], #8 // tmp, two rows of 4
+ ld1 {v0.s}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v0.s}[1], [x8]
+ sub v3.8b, v4.8b, v2.8b // 64 - m
+ umull v5.8h, v1.8b, v2.8b
+ umlal v5.8h, v0.8b, v3.8b
+ rshrn v6.8b, v5.8h, #6
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b}, [x5], #16 // m, two rows of 8
+ ld1 {v1.16b}, [x2], #16 // tmp, two rows of 8
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ sub v3.16b, v4.16b, v2.16b // 64 - m
+ subs w4, w4, #2
+ umull v5.8h, v1.8b, v2.8b
+ umlal v5.8h, v0.8b, v3.8b
+ umull2 v6.8h, v1.16b, v2.16b
+ umlal2 v6.8h, v0.16b, v3.16b
+ rshrn v7.8b, v5.8h, #6
+ rshrn2 v7.16b, v6.8h, #6
+ st1 {v7.d}[0], [x0], x1
+ st1 {v7.d}[1], [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x5], #32 // m, two rows of 16
+ ld1 {v5.16b, v6.16b}, [x2], #32 // tmp, two rows of 16
+ ld1 {v0.16b}, [x0]
+ subs w4, w4, #2
+ sub v7.16b, v4.16b, v1.16b // 64 - m, first row
+ sub v20.16b, v4.16b, v2.16b // 64 - m, second row
+ ld1 {v3.16b}, [x8]
+ umull v16.8h, v5.8b, v1.8b
+ umlal v16.8h, v0.8b, v7.8b
+ umull2 v17.8h, v5.16b, v1.16b
+ umlal2 v17.8h, v0.16b, v7.16b
+ umull v21.8h, v6.8b, v2.8b
+ umlal v21.8h, v3.8b, v20.8b
+ umull2 v22.8h, v6.16b, v2.16b
+ umlal2 v22.8h, v3.16b, v20.16b
+ rshrn v18.8b, v16.8h, #6
+ rshrn2 v18.16b, v17.8h, #6
+ rshrn v19.8b, v21.8h, #6
+ rshrn2 v19.16b, v22.8h, #6
+ st1 {v18.16b}, [x0], x1
+ st1 {v19.16b}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 // m, two rows of 32
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 // tmp, two rows of 32
+ ld1 {v20.16b, v21.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v22.16b, v23.16b}, [x8]
+ sub v5.16b, v4.16b, v0.16b
+ sub v6.16b, v4.16b, v1.16b
+ sub v30.16b, v4.16b, v2.16b
+ sub v31.16b, v4.16b, v3.16b
+ umull v24.8h, v16.8b, v0.8b
+ umlal v24.8h, v20.8b, v5.8b
+ umull2 v26.8h, v16.16b, v0.16b
+ umlal2 v26.8h, v20.16b, v5.16b
+ umull v28.8h, v17.8b, v1.8b
+ umlal v28.8h, v21.8b, v6.8b
+ umull2 v7.8h, v17.16b, v1.16b
+ umlal2 v7.8h, v21.16b, v6.16b
+ umull v27.8h, v18.8b, v2.8b
+ umlal v27.8h, v22.8b, v30.8b
+ umull2 v1.8h, v18.16b, v2.16b // v1 (m) is no longer needed, reused as accumulator
+ umlal2 v1.8h, v22.16b, v30.16b
+ umull v29.8h, v19.8b, v3.8b
+ umlal v29.8h, v23.8b, v31.8b
+ umull2 v21.8h, v19.16b, v3.16b // v21 (dst) is no longer needed, reused as accumulator
+ umlal2 v21.8h, v23.16b, v31.16b
+ rshrn v24.8b, v24.8h, #6
+ rshrn2 v24.16b, v26.8h, #6
+ rshrn v25.8b, v28.8h, #6
+ rshrn2 v25.16b, v7.8h, #6
+ rshrn v27.8b, v27.8h, #6
+ rshrn2 v27.16b, v1.8h, #6
+ rshrn v28.8b, v29.8h, #6
+ rshrn2 v28.16b, v21.8h, #6
+ st1 {v24.16b, v25.16b}, [x0], x1
+ st1 {v27.16b, v28.16b}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_tbl):
+ .hword L(blend_tbl) - 32b
+ .hword L(blend_tbl) - 16b
+ .hword L(blend_tbl) - 8b
+ .hword L(blend_tbl) - 4b
+endfunc
+
+// blend_h_8bpc_neon: in-place blend of the bytes at x0 with the bytes at x2,
+// using one weight m per row: dst = (tmp * m + dst * (64 - m) + 32) >> 6.
+// The weights are read from obmc_masks starting at byte offset w4.
+// Register use visible in this body: x0 = dst, x1 = dst stride, x2 = second
+// source (read contiguously), w3 = value used for dispatch and as the row
+// width in the wide path, w4 = row count on entry.
+// Only w4 - (w4 >> 2) rows are processed. Dispatch: index = clz(w3) - 24
+// (w3 == 128 -> first table entry, w3 == 2 -> last). Every path handles two
+// rows per iteration.
+function blend_h_8bpc_neon, export=1
+ adr x6, L(blend_h_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w4, uxtw // &obmc_masks[w4]
+ sub w4, w4, w4, lsr #2 // rows to process
+ clz w7, w3
+ movi v4.16b, #64
+ add x8, x0, x1 // second dst row
+ lsl x1, x1, #1
+ sub w7, w7, #24
+ ldrh w7, [x6, x7, lsl #1]
+ sub x6, x6, w7, uxtw
+ br x6
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.h}[0], [x5], #2 // m for two rows
+ ld1 {v1.s}[0], [x2], #4 // tmp, two rows of 2
+ subs w4, w4, #2
+ ld1 {v2.h}[0], [x0]
+ zip1 v0.8b, v0.8b, v0.8b // duplicate each row's m for its 2 pixels
+ sub v3.8b, v4.8b, v0.8b // 64 - m
+ ld1 {v2.h}[1], [x8]
+ umull v5.8h, v1.8b, v0.8b
+ umlal v5.8h, v2.8b, v3.8b
+ rshrn v5.8b, v5.8h, #6
+ st1 {v5.h}[0], [x0], x1
+ st1 {v5.h}[1], [x8], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.8b, v1.8b}, [x5], #2 // v0 = m of first row, v1 = m of second row
+ ld1 {v2.8b}, [x2], #8
+ subs w4, w4, #2
+ ext v0.8b, v0.8b, v1.8b, #4 // 4 bytes of each row's m
+ ld1 {v3.s}[0], [x0]
+ sub v5.8b, v4.8b, v0.8b
+ ld1 {v3.s}[1], [x8]
+ umull v6.8h, v2.8b, v0.8b
+ umlal v6.8h, v3.8b, v5.8b
+ rshrn v6.8b, v6.8h, #6
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.d}[0], [x0]
+ ext v0.16b, v0.16b, v1.16b, #8 // 8 bytes of each row's m
+ sub v5.16b, v4.16b, v0.16b
+ ld1 {v3.d}[1], [x8]
+ subs w4, w4, #2
+ umull v6.8h, v0.8b, v2.8b
+ umlal v6.8h, v3.8b, v5.8b
+ umull2 v7.8h, v0.16b, v2.16b
+ umlal2 v7.8h, v3.16b, v5.16b
+ rshrn v16.8b, v6.8h, #6
+ rshrn2 v16.16b, v7.8h, #6
+ st1 {v16.d}[0], [x0], x1
+ st1 {v16.d}[1], [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ ld1 {v2.16b, v3.16b}, [x2], #32
+ ld1 {v5.16b}, [x0]
+ sub v7.16b, v4.16b, v0.16b
+ sub v16.16b, v4.16b, v1.16b
+ ld1 {v6.16b}, [x8]
+ subs w4, w4, #2
+ umull v17.8h, v0.8b, v2.8b
+ umlal v17.8h, v5.8b, v7.8b
+ umull2 v18.8h, v0.16b, v2.16b
+ umlal2 v18.8h, v5.16b, v7.16b
+ umull v19.8h, v1.8b, v3.8b
+ umlal v19.8h, v6.8b, v16.8b
+ umull2 v20.8h, v1.16b, v3.16b
+ umlal2 v20.8h, v6.16b, v16.16b
+ rshrn v21.8b, v17.8h, #6
+ rshrn2 v21.16b, v18.8h, #6
+ rshrn v22.8b, v19.8h, #6
+ rshrn2 v22.16b, v20.8h, #6
+ st1 {v21.16b}, [x0], x1
+ st1 {v22.16b}, [x8], x1
+ b.gt 16b
+ ret
+1280:
+640:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ sub x1, x1, w3, uxtw // 2*stride minus the bytes stored per row
+ add x7, x2, w3, uxtw // tmp, second row
+321:
+ ld2r {v0.16b, v1.16b}, [x5], #2 // m for the two rows of this pass
+ mov w6, w3 // column counter
+ sub v20.16b, v4.16b, v0.16b
+ sub v21.16b, v4.16b, v1.16b
+32:
+ ld1 {v16.16b, v17.16b}, [x2], #32
+ ld1 {v2.16b, v3.16b}, [x0]
+ subs w6, w6, #32
+ umull v23.8h, v0.8b, v16.8b
+ umlal v23.8h, v2.8b, v20.8b
+ ld1 {v18.16b, v19.16b}, [x7], #32
+ umull2 v27.8h, v0.16b, v16.16b
+ umlal2 v27.8h, v2.16b, v20.16b
+ ld1 {v6.16b, v7.16b}, [x8]
+ umull v24.8h, v0.8b, v17.8b
+ umlal v24.8h, v3.8b, v20.8b
+ umull2 v28.8h, v0.16b, v17.16b
+ umlal2 v28.8h, v3.16b, v20.16b
+ umull v25.8h, v1.8b, v18.8b
+ umlal v25.8h, v6.8b, v21.8b
+ umull2 v5.8h, v1.16b, v18.16b
+ umlal2 v5.8h, v6.16b, v21.16b
+ rshrn v29.8b, v23.8h, #6
+ rshrn2 v29.16b, v27.8h, #6
+ umull v26.8h, v1.8b, v19.8b
+ umlal v26.8h, v7.8b, v21.8b
+ umull2 v31.8h, v1.16b, v19.16b
+ umlal2 v31.8h, v7.16b, v21.16b
+ rshrn v30.8b, v24.8h, #6
+ rshrn2 v30.16b, v28.8h, #6
+ rshrn v23.8b, v25.8h, #6
+ rshrn2 v23.16b, v5.8h, #6
+ rshrn v24.8b, v26.8h, #6
+ st1 {v29.16b, v30.16b}, [x0], #32
+ rshrn2 v24.16b, v31.8h, #6
+ st1 {v23.16b, v24.16b}, [x8], #32
+ b.gt 32b
+ subs w4, w4, #2
+ add x0, x0, x1
+ add x8, x8, x1
+ add x2, x2, w3, uxtw // skip the row that x7 just handled
+ add x7, x7, w3, uxtw
+ b.gt 321b // flags still from the subs w4 above
+ ret
+L(blend_h_tbl):
+ .hword L(blend_h_tbl) - 1280b
+ .hword L(blend_h_tbl) - 640b
+ .hword L(blend_h_tbl) - 320b
+ .hword L(blend_h_tbl) - 16b
+ .hword L(blend_h_tbl) - 8b
+ .hword L(blend_h_tbl) - 4b
+ .hword L(blend_h_tbl) - 2b
+endfunc
+
+// blend_v_8bpc_neon: in-place blend of the bytes at x0 with the bytes at x2,
+// using one weight m per column: dst = (tmp * m + dst * (64 - m) + 32) >> 6.
+// The weights are read from obmc_masks starting at byte offset w3 and are
+// loaded once, before the row loop.
+// Register use visible in this body: x0 = dst, x1 = dst stride, x2 = second
+// source (read contiguously, full row width), w3 = value used for dispatch
+// (presumably the width), w4 = row counter.
+// Only part of each row is written back: 1 of 2, 3 of 4, 6 of 8, 12 of 16
+// and 24 of 32 bytes. Dispatch: index = clz(w3) - 26 (w3 == 32 -> first
+// table entry, w3 == 2 -> last). Every path handles two rows per iteration.
+function blend_v_8bpc_neon, export=1
+ adr x6, L(blend_v_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w3, uxtw // &obmc_masks[w3]
+ clz w3, w3
+ movi v4.16b, #64
+ add x8, x0, x1 // second dst row
+ lsl x1, x1, #1
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ br x6
+20:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.8b}, [x5] // m of the first column
+ sub v1.8b, v4.8b, v0.8b // 64 - m
+2:
+ ld1 {v2.h}[0], [x2], #2 // tmp row 0; only byte 0 ends up being used
+ ld1 {v3.b}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v2.b}[1], [x2] // first byte of tmp row 1 replaces byte 1
+ ld1 {v3.b}[1], [x8]
+ umull v5.8h, v2.8b, v0.8b
+ umlal v5.8h, v3.8b, v1.8b
+ rshrn v5.8b, v5.8h, #6
+ add x2, x2, #2
+ st1 {v5.b}[0], [x0], x1
+ st1 {v5.b}[1], [x8], x1
+ b.gt 2b
+ ret
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x5] // 4 column weights, repeated for both rows
+ sub x1, x1, #2 // compensate for the 2-byte post-increment store below
+ sub v1.8b, v4.8b, v0.8b
+4:
+ ld1 {v2.8b}, [x2], #8
+ ld1 {v3.s}[0], [x0]
+ ld1 {v3.s}[1], [x8]
+ subs w4, w4, #2
+ umull v5.8h, v2.8b, v0.8b
+ umlal v5.8h, v3.8b, v1.8b
+ rshrn v5.8b, v5.8h, #6
+ st1 {v5.h}[0], [x0], #2 // bytes 0-1 of row 0
+ st1 {v5.h}[2], [x8], #2 // bytes 0-1 of row 1
+ st1 {v5.b}[2], [x0], x1 // byte 2 of row 0
+ st1 {v5.b}[6], [x8], x1 // byte 2 of row 1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2d}, [x5] // 8 column weights, repeated for both rows
+ sub x1, x1, #4 // compensate for the 4-byte post-increment store below
+ sub v1.16b, v4.16b, v0.16b
+8:
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.d}[0], [x0]
+ ld1 {v3.d}[1], [x8]
+ subs w4, w4, #2
+ umull v5.8h, v0.8b, v2.8b
+ umlal v5.8h, v3.8b, v1.8b
+ umull2 v6.8h, v0.16b, v2.16b
+ umlal2 v6.8h, v3.16b, v1.16b
+ rshrn v7.8b, v5.8h, #6
+ rshrn2 v7.16b, v6.8h, #6
+ st1 {v7.s}[0], [x0], #4 // bytes 0-3 of row 0
+ st1 {v7.s}[2], [x8], #4 // bytes 0-3 of row 1
+ st1 {v7.h}[2], [x0], x1 // bytes 4-5 of row 0
+ st1 {v7.h}[6], [x8], x1 // bytes 4-5 of row 1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x5] // 16 column weights
+ sub x1, x1, #8 // compensate for the 8-byte post-increment store below
+ sub v2.16b, v4.16b, v0.16b
+16:
+ ld1 {v5.16b, v6.16b}, [x2], #32
+ ld1 {v7.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v16.16b}, [x8]
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v7.8b, v2.8b
+ umull2 v18.8h, v5.16b, v0.16b
+ umlal2 v18.8h, v7.16b, v2.16b
+ umull v20.8h, v6.8b, v0.8b
+ umlal v20.8h, v16.8b, v2.8b
+ umull2 v21.8h, v6.16b, v0.16b
+ umlal2 v21.8h, v16.16b, v2.16b
+ rshrn v19.8b, v17.8h, #6
+ rshrn2 v19.16b, v18.8h, #6
+ rshrn v22.8b, v20.8h, #6
+ rshrn2 v22.16b, v21.8h, #6
+ st1 {v19.8b}, [x0], #8 // bytes 0-7 of row 0
+ st1 {v22.8b}, [x8], #8 // bytes 0-7 of row 1
+ st1 {v19.s}[2], [x0], x1 // bytes 8-11 of row 0
+ st1 {v22.s}[2], [x8], x1 // bytes 8-11 of row 1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x5] // 32 column weights
+ sub x1, x1, #16 // compensate for the 16-byte post-increment store below
+ sub v2.16b, v4.16b, v0.16b
+ sub v3.8b, v4.8b, v1.8b // only columns 16-23 are needed from the upper half
+32:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+ ld1 {v5.16b, v6.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v20.16b, v21.16b}, [x8]
+ umull v22.8h, v16.8b, v0.8b
+ umlal v22.8h, v5.8b, v2.8b
+ umull2 v23.8h, v16.16b, v0.16b
+ umlal2 v23.8h, v5.16b, v2.16b
+ umull v28.8h, v17.8b, v1.8b
+ umlal v28.8h, v6.8b, v3.8b
+ umull v30.8h, v18.8b, v0.8b
+ umlal v30.8h, v20.8b, v2.8b
+ umull2 v31.8h, v18.16b, v0.16b
+ umlal2 v31.8h, v20.16b, v2.16b
+ umull v25.8h, v19.8b, v1.8b
+ umlal v25.8h, v21.8b, v3.8b
+ rshrn v24.8b, v22.8h, #6
+ rshrn2 v24.16b, v23.8h, #6
+ rshrn v28.8b, v28.8h, #6
+ rshrn v30.8b, v30.8h, #6
+ rshrn2 v30.16b, v31.8h, #6
+ rshrn v27.8b, v25.8h, #6
+ st1 {v24.16b}, [x0], #16 // bytes 0-15 of row 0
+ st1 {v30.16b}, [x8], #16 // bytes 0-15 of row 1
+ st1 {v28.8b}, [x0], x1 // bytes 16-23 of row 0
+ st1 {v27.8b}, [x8], x1 // bytes 16-23 of row 1
+ b.gt 32b
+ ret
+L(blend_v_tbl):
+ .hword L(blend_v_tbl) - 320b
+ .hword L(blend_v_tbl) - 160b
+ .hword L(blend_v_tbl) - 80b
+ .hword L(blend_v_tbl) - 40b
+ .hword L(blend_v_tbl) - 20b
+endfunc
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that x8 is set to (clz(w)-24).
+//
+// Plain copy of w5 rows from x2 (stride x3) to x0 (stride x1). x8 indexes a
+// table of 16-bit offsets subtracted from the table address, so w == 128
+// selects the first entry and w == 2 the last. The paths for 2 to 16 bytes
+// per row copy two rows per iteration, the wider ones copy one row per
+// iteration.
+function put_neon
+ adr x9, L(put_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.h}[0], [x2], x3
+ ld1 {v1.h}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.h}[0], [x0], x1
+ st1 {v1.h}[0], [x0], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, x1 // second dst row
+ lsl x1, x1, #1
+ add x9, x2, x3 // second src row
+ lsl x3, x3, #1
+16:
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x9], x3
+ subs w5, w5, #2
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2] // copied through general-purpose register pairs
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl):
+ .hword L(put_tbl) - 128b
+ .hword L(put_tbl) - 64b
+ .hword L(put_tbl) - 32b
+ .hword L(put_tbl) - 160b
+ .hword L(put_tbl) - 8b
+ .hword L(put_tbl) - 4b
+ .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
+//
+// Reads w4 rows of bytes from x1 (stride x2), widens every byte to 16 bits
+// with a left shift by 4 and writes the results contiguously to x0.
+// x8 indexes a table of 16-bit offsets subtracted from the table address,
+// so w == 128 selects the first entry and w == 4 the last. The paths for
+// 4 to 32 pixels per row handle two rows per iteration, the wider ones one
+// row per iteration.
+function prep_neon
+ adr x9, L(prep_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x1], x2
+ ld1 {v1.s}[0], [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.4h, v1.4h}, [x0], #16
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2 // second src row
+ lsl x2, x2, #1
+16:
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x9], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, w3, uxtw // x0 + w3 bytes: second 32-byte half of an output row when w3 == 32
+32:
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x7 // x7 = w*2 = bytes per output row
+ ushll2 v17.8h, v2.16b, #4
+ st1 {v6.8h, v7.8h}, [x8], x7
+ ushll v18.8h, v3.8b, #4
+ st1 {v16.8h, v17.8h}, [x0], x7
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v18.8h, v19.8h}, [x8], x7
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, #32 // x0 and x8 alternate 32-byte chunks of the output
+ mov x6, #64
+64:
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ldp q2, q3, [x1, #32]
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ add x1, x1, x2
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x6
+ ushll2 v17.8h, v2.16b, #4
+ ushll v18.8h, v3.8b, #4
+ st1 {v6.8h, v7.8h}, [x8], x6
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v16.8h, v17.8h}, [x0], x6
+ st1 {v18.8h, v19.8h}, [x8], x6
+ b.gt 64b
+ ret
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, #64 // x0 and x8 alternate 64-byte chunks of the output
+ mov x6, #128
+128:
+ ldp q0, q1, [x1]
+ ldp q2, q3, [x1, #32]
+ ushll v16.8h, v0.8b, #4
+ ushll2 v17.8h, v0.16b, #4
+ ushll v18.8h, v1.8b, #4
+ ushll2 v19.8h, v1.16b, #4
+ ushll v20.8h, v2.8b, #4
+ ushll2 v21.8h, v2.16b, #4
+ ldp q4, q5, [x1, #64]
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
+ ushll v22.8h, v3.8b, #4
+ ushll2 v23.8h, v3.16b, #4
+ ushll v24.8h, v4.8b, #4
+ ushll2 v25.8h, v4.16b, #4
+ ushll v26.8h, v5.8b, #4
+ ushll2 v27.8h, v5.16b, #4
+ ldp q6, q7, [x1, #96]
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
+ ushll v28.8h, v6.8b, #4
+ ushll2 v29.8h, v6.16b, #4
+ ushll v30.8h, v7.8b, #4
+ ushll2 v31.8h, v7.16b, #4
+ subs w4, w4, #1
+ add x1, x1, x2
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
+ b.gt 128b
+ ret
+
+L(prep_tbl):
+ .hword L(prep_tbl) - 1280b
+ .hword L(prep_tbl) - 640b
+ .hword L(prep_tbl) - 320b
+ .hword L(prep_tbl) - 160b
+ .hword L(prep_tbl) - 8b
+ .hword L(prep_tbl) - 4b
+endfunc
+
+
+// load_slice: load element 0 (of element size \wd, e.g. .h or .s) of up to
+// seven registers, alternating between the pointers \s0 and \s1, each
+// post-incremented by \strd after every load. \d0 and \d1 are mandatory,
+// \d2/\d3 are loaded as a pair whenever \d2 is given, \d4-\d6 are each
+// optional.
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}[0], [\s0], \strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+// load_reg: same argument layout as load_slice, but loads whole vectors of
+// arrangement \wd (e.g. .8b or .16b) instead of a single element.
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}, [\s0], \strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+// Fixed-size wrappers: load_h and load_s load one 16-bit / 32-bit element
+// per register via load_slice, load_8b and load_16b load whole 8-byte /
+// 16-byte vectors via load_reg.
+.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+// interleave_1: trn1 each register with its successor, so that element 1 of
+// \rN receives element 0 of \rN+1 (the even-numbered elements of both inputs
+// are interleaved). \r0-\r2 are mandatory; \r3 and \r4 are used together
+// when \r3 is given. \wd is the vector arrangement; interleave_1_h and
+// interleave_1_s fix it to .4h and .2s.
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+ trn1 \r0\wd, \r0\wd, \r1\wd
+ trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_h r0, r1, r2, r3, r4
+ interleave_1 .4h, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+// interleave_2: like interleave_1, but each register is combined with the
+// register two positions further on (\r0 with \r2, \r1 with \r3, ...).
+// All six registers are mandatory. interleave_2_s fixes \wd to .2s.
+.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
+ trn1 \r0\wd, \r0\wd, \r2\wd
+ trn1 \r1\wd, \r1\wd, \r3\wd
+ trn1 \r2\wd, \r2\wd, \r4\wd
+ trn1 \r3\wd, \r3\wd, \r5\wd
+.endm
+.macro interleave_2_s r0, r1, r2, r3, r4, r5
+ interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5
+.endm
+// uxtl_b: zero-extend the low 8 bytes of each given register to 8 halfwords,
+// in place. \r0 and \r1 are mandatory, \r2/\r3 are handled as a pair
+// whenever \r2 is given, \r4-\r6 are each optional.
+.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
+ uxtl \r0\().8h, \r0\().8b
+ uxtl \r1\().8h, \r1\().8b
+.ifnb \r2
+ uxtl \r2\().8h, \r2\().8b
+ uxtl \r3\().8h, \r3\().8b
+.endif
+.ifnb \r4
+ uxtl \r4\().8h, \r4\().8b
+.endif
+.ifnb \r5
+ uxtl \r5\().8h, \r5\().8b
+.endif
+.ifnb \r6
+ uxtl \r6\().8h, \r6\().8b
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3, wd // 4-tap sum with arrangement wd: d = s0*c0 + s1*c1 + s2*c2 + s3*c3
+ mul \d\wd, \s0\wd, v0.h[0] // the coefficients c0..c3 are read from v0.h[0..3]
+ mla \d\wd, \s1\wd, v0.h[1]
+ mla \d\wd, \s2\wd, v0.h[2]
+ mla \d\wd, \s3\wd, v0.h[3]
+.endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, thus keeping mul/mla tightly
+// chained like this.
+.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 // 8-tap sum on 4 halfword lanes: d0 = sum of sN * v0.h[N] for N = 0..7
+ mul \d0\().4h, \s0\().4h, v0.h[0]
+ mla \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+ mla \d0\().4h, \s7\().4h, v0.h[7]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 // 8-tap sum on 8 halfword lanes: d0 = sum of sN * v0.h[N] for N = 0..7
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 // two 8-tap sums whose input windows are one register apart
+ mul \d0\().8h, \s0\().8h, v0.h[0] // d0 uses the window s0..s7
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s1\().8h, v0.h[0] // d1 uses the window s1..s8
+ mla \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+ mla \d1\().8h, \s8\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 // two 8-tap sums whose input windows are two registers apart
+ mul \d0\().8h, \s0\().8h, v0.h[0] // d0 uses the window s0..s7
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s2\().8h, v0.h[0] // d1 uses the window s2..s9
+ mla \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+ mla \d1\().8h, \s9\().8h, v0.h[7]
+.endm
+.macro sqrshrun_b shift, r0, r1, r2, r3 // in place: rounding right shift of signed halfwords, saturated to unsigned bytes (.8h -> .8b)
+ sqrshrun \r0\().8b, \r0\().8h, #\shift
+.ifnb \r1
+ sqrshrun \r1\().8b, \r1\().8h, #\shift // r1 is optional on its own
+.endif
+.ifnb \r2
+ sqrshrun \r2\().8b, \r2\().8h, #\shift // r2 and r3 are handled as a pair whenever r2 is given
+ sqrshrun \r3\().8b, \r3\().8h, #\shift
+.endif
+.endm
+.macro srshr_h shift, r0, r1, r2, r3 // in place: signed rounding right shift of 8 halfwords, no narrowing
+ srshr \r0\().8h, \r0\().8h, #\shift
+.ifnb \r1
+ srshr \r1\().8h, \r1\().8h, #\shift // r1 is optional on its own
+.endif
+.ifnb \r2
+ srshr \r2\().8h, \r2\().8h, #\shift // r2 and r3 are handled as a pair whenever r2 is given
+ srshr \r3\().8h, \r3\().8h, #\shift
+.endif
+.endm
+.macro st_h strd, reg, lanes // store halfword lanes of reg alternately through x0 and x8 (hard-coded; presumably dst and its second-row copy - confirm at the call sites)
+ st1 {\reg\().h}[0], [x0], \strd // each store post-increments its pointer by strd
+ st1 {\reg\().h}[1], [x8], \strd
+.if \lanes > 2
+ st1 {\reg\().h}[2], [x0], \strd // lanes 2 and 3 are stored only when more than 2 lanes are requested
+ st1 {\reg\().h}[3], [x8], \strd
+.endif
+.endm
+.macro st_s strd, r0, r1 // store the two word lanes of r0 (and of r1 if given) alternately through x0 and x8
+ st1 {\r0\().s}[0], [x0], \strd // each store post-increments its pointer by strd
+ st1 {\r0\().s}[1], [x8], \strd
+.ifnb \r1
+ st1 {\r1\().s}[0], [x0], \strd
+ st1 {\r1\().s}[1], [x8], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1 // store the two doubleword lanes of r0 (and of r1 if given) alternately through x0 and x8
+ st1 {\r0\().d}[0], [x0], \strd // each store post-increments its pointer by strd
+ st1 {\r0\().d}[1], [x8], \strd
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x8], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1 // final shift plus store for the 4-wide vertical paths; r1 is optional
+.ifc \type, put
+ sqrshrun_b 6, \r0, \r1 // put: round, shift right by 6 and saturate to bytes
+ st_s \strd, \r0, \r1 // one word lane (4 bytes) per row
+.else
+ srshr_h 2, \r0, \r1 // otherwise: round, shift right by 2 and keep halfwords
+ st_d \strd, \r0, \r1 // one doubleword lane (4 halfwords) per row
+.endif
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 // store 2, 4 or 8 whole registers (arrangement wd) alternately through x0 and x8
+ st1 {\r0\wd}, [x0], \strd // each store post-increments its pointer by strd
+ st1 {\r1\wd}, [x8], \strd
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd // r2 and r3 are stored as a pair whenever r2 is given
+ st1 {\r3\wd}, [x8], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd // r4..r7 are stored as a group of four whenever r4 is given
+ st1 {\r5\wd}, [x8], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x8], \strd
+.endif
+.endm
+.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 // st_reg with 8-byte registers (.8b)
+ st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 // st_reg with 16-byte registers (.16b)
+ st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3 // final shift plus store for the 8-wide vertical paths; r2 and r3 are optional
+.ifc \type, put
+ sqrshrun_b 6, \r0, \r1, \r2, \r3 // put: round, shift right by 6 and saturate to bytes
+ st_8b \strd, \r0, \r1, \r2, \r3 // 8 bytes per row
+.else
+ srshr_h 2, \r0, \r1, \r2, \r3 // otherwise: round, shift right by 2 and keep halfwords
+ st_16b \strd, \r0, \r1, \r2, \r3 // 8 halfwords (16 bytes) per row
+.endif
+.endm
+.macro shift_store_16 type, strd, r0, r1, r2, r3 // final shift plus store for two 16-wide rows: r0/r1 are the halves of the x0 row, r2/r3 those of the x8 row
+.ifc \type, put
+ sqrshrun \r0\().8b, \r0\().8h, #6 // low 8 pixels of the first row narrowed into the low half of r0
+ sqrshrun2 \r0\().16b, \r1\().8h, #6 // high 8 pixels of the first row narrowed into the high half of r0
+ sqrshrun \r2\().8b, \r2\().8h, #6
+ sqrshrun2 \r2\().16b, \r3\().8h, #6
+ st_16b \strd, \r0, \r2 // r0 goes through x0, r2 through x8
+.else
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st1 {\r0\().8h, \r1\().8h}, [x0], \strd // 16 halfwords per row
+ st1 {\r2\().8h, \r3\().8h}, [x8], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v // emit the exported entry point op_8tap_type_8bpc_neon
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ mov x8, \type_h // x8 = constant passed as type_h (horizontal filter set)
+ mov x9, \type_v // x9 = constant passed as type_v (vertical filter set)
+ b \op\()_8tap_neon // tail-branch into the shared op_8tap_neon body
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor. Each value packs two 7-bit fields: the 8tap code extracts bits 7..13 when w (or h) is above 4 and bits 0..6 otherwise.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR // arguments: name suffix, constant for x8 (type_h), constant for x9 (type_v)
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w10 // copy mx into three 7-bit fields (bits 0, 7 and 14)
+ mul \my, \my, w10
+ add \mx, \mx, w8 // mx, 8tap_h, 4tap_h
+ add \my, \my, w9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1 // prep: d_strd = 2 * w
+.endif
+
+ clz w8, \w
+ tst \mx, #(0x7f << 14) // top field: the plain mx per the layout above; nonzero selects horizontal filtering
+ sub w8, w8, #24 // w8 = clz(w) - 24, the index used with the .hword jump tables (0 for w = 128)
+ movrel x10, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon // neither field set: tail-branch to the function without the 8tap suffix (defined outside this chunk)
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w9, \mx, #7, #7 // w9 = bits 7..13 (8tap_h field)
+ and \mx, \mx, #0x7f // mx = bits 0..6 (4tap_h field)
+ b.le 4f // w <= 4: keep the 4tap_h field
+ mov \mx, w9 // w > 4: use the 8tap_h field
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x10, \mx, uxtw #3 // xmx = x10 + 8 * mx: address of the selected 8-byte filter row
+ b.ne L(\type\()_8tap_hv) // vertical fraction also set: filter in both directions
+
+ adr x9, L(\type\()_8tap_h_tbl)
+ ldrh w8, [x9, x8, lsl #1] // table entries are (table address - target address) as halfwords
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx] // load only bytes 2..5 of the filter row: a 4-tap filter
+ sub \src, \src, #1 // start one pixel to the left
+ add \ds2, \dst, \d_strd // ds2/sr2 address the second row of each row pair
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // strides doubled: two rows per iteration
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b // sign-extend the coefficients to halfwords
+2:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ uxtl v4.8h, v4.8b
+ uxtl v6.8h, v6.8b
+ ext v5.16b, v4.16b, v4.16b, #2 // rows shifted by one pixel
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s // v3/v4/v6/v7 = pixels at offsets 0/1/2/3, two outputs of row 0 then two of row 1
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ mul v3.4h, v3.4h, v0.h[0]
+ mla v3.4h, v4.4h, v0.h[1]
+ mla v3.4h, v6.4h, v0.h[2]
+ mla v3.4h, v7.4h, v0.h[3]
+ srshr v3.4h, v3.4h, #2 // rounding shift by 2, then by a further 4 while narrowing
+ sqrshrun v3.8b, v3.8h, #4
+ st1 {v3.h}[0], [\dst], \d_strd // two pixels per row
+ st1 {v3.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx] // load only bytes 2..5 of the filter row: a 4-tap filter
+ sub \src, \src, #1 // start one pixel to the left
+ add \ds2, \dst, \d_strd // ds2/sr2 address the second row of each row pair
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // strides doubled: two rows per iteration
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8b}, [\src], \s_strd
+ ld1 {v20.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v20.8h, v20.8b
+ ext v17.16b, v16.16b, v16.16b, #2 // first row shifted by 1, 2 and 3 pixels
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2 // second row shifted by 1, 2 and 3 pixels
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ mul v16.4h, v16.4h, v0.h[0] // first row: 4-tap sum
+ mla v16.4h, v17.4h, v0.h[1]
+ mla v16.4h, v18.4h, v0.h[2]
+ mla v16.4h, v19.4h, v0.h[3]
+ mul v20.4h, v20.4h, v0.h[0] // second row: 4-tap sum
+ mla v20.4h, v21.4h, v0.h[1]
+ mla v20.4h, v22.4h, v0.h[2]
+ mla v20.4h, v23.4h, v0.h[3]
+ srshr v16.4h, v16.4h, #2
+ srshr v20.4h, v20.4h, #2
+.ifc \type, put
+ sqrshrun v16.8b, v16.8h, #4 // put: a further rounding shift by 4, saturated to bytes
+ sqrshrun v20.8b, v20.8h, #4
+ st1 {v16.s}[0], [\dst], \d_strd
+ st1 {v20.s}[0], [\ds2], \d_strd
+.else
+ st1 {v16.4h}, [\dst], \d_strd // otherwise: store the halfword intermediates
+ st1 {v20.4h}, [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx] // all 8 filter coefficients
+ sub \src, \src, #3 // start three pixels to the left
+ add \ds2, \dst, \d_strd // ds2/sr2 address the second row of each row pair
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // strides doubled: two rows per iteration
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+8:
+ ld1 {v16.8b, v17.8b}, [\src], \s_strd // 16 input pixels per row
+ ld1 {v20.8b, v21.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
+ mul v18.8h, v16.8h, v0.h[0] // tap 0 for both rows
+ mul v22.8h, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v19.16b, v16.16b, v17.16b, #(2*\i) // row shifted by i pixels
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+ subs \h, \h, #2
+ srshr v18.8h, v18.8h, #2
+ srshr v22.8h, v22.8h, #2
+.ifc \type, put
+ sqrshrun v18.8b, v18.8h, #4 // put: a further rounding shift by 4, saturated to bytes
+ sqrshrun v22.8b, v22.8h, #4
+ st1 {v18.8b}, [\dst], \d_strd
+ st1 {v22.8b}, [\ds2], \d_strd
+.else
+ st1 {v18.8h}, [\dst], \d_strd // otherwise: store the halfword intermediates
+ st1 {v22.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx] // all 8 filter coefficients
+ sub \src, \src, #3 // start three pixels to the left
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw // the loads below advance src by w + 8 bytes per row (24, then 16 per further block)
+ sub \s_strd, \s_strd, #8 // so s_strd becomes the remaining distance to the row two lines down
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw // put stores w bytes per row; compensate in the same way
+.endif
+161:
+ ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24
+ ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24
+ mov \mx, \w // mx is reused as the column counter of the current row pair
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+
+16:
+ mul v24.8h, v16.8h, v0.h[0] // tap 0 for 16 pixels of each of the two rows
+ mul v25.8h, v17.8h, v0.h[0]
+ mul v26.8h, v20.8h, v0.h[0]
+ mul v27.8h, v21.8h, v0.h[0]
+.irpc i, 1234567
+ ext v28.16b, v16.16b, v17.16b, #(2*\i) // inputs shifted by i pixels
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ srshr v26.8h, v26.8h, #2
+ srshr v27.8h, v27.8h, #2
+ subs \mx, \mx, #16 // 16 columns done
+.ifc \type, put
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun2 v24.16b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun2 v26.16b, v27.8h, #4
+ st1 {v24.16b}, [\dst], #16
+ st1 {v26.16b}, [\ds2], #16
+.else
+ st1 {v24.8h, v25.8h}, [\dst], #32
+ st1 {v26.8h, v27.8h}, [\ds2], #32
+.endif
+ b.le 9f // row pair finished
+
+ mov v16.16b, v18.16b // the last 8 loaded pixels become the first 8 of the next block
+ mov v20.16b, v22.16b
+ ld1 {v17.8b, v18.8b}, [\src], #16
+ ld1 {v21.8b, v22.8b}, [\sr2], #16
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd // advance all four pointers to the next row pair
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b // index 0 = clz(w) - 24, i.e. w = 128
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4 // the flags of this compare are still live at the size-specific labels below
+ ubfx w9, \my, #7, #7 // w9 = bits 7..13 (8tap_v field)
+ and \my, \my, #0x7f // my = bits 0..6 (4tap_v field)
+ b.le 4f // h <= 4: keep the 4tap_v field
+ mov \my, w9 // h > 4: use the 8tap_v field
+4:
+ add \xmy, x10, \my, uxtw #3 // xmy = x10 + 8 * my: address of the selected 8-byte filter row
+
+ adr x9, L(\type\()_8tap_v_tbl)
+ ldrh w8, [x9, x8, lsl #1] // table entries are (table address - target address) as halfwords
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ b.gt 28f // h > 4 (flags from the cmp at the top of the v path)
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy] // load only bytes 2..5 of the filter row: a 4-tap filter
+ sub \src, \src, \s_strd // start one row above
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // strides doubled: src/sr2 and dst/ds2 each walk every other row
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_h v1, v2, v3, v4, v5 // each register now holds two consecutive rows
+ b.gt 24f // h > 2 (flags from the cmp above)
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .4h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_h \sr2, \src, \s_strd, v6, v7
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6 // each register now holds four consecutive rows
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 4
+ ret
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ ld1 {v0.8b}, [\xmy] // all 8 filter coefficients
+ sub \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above, src = three rows above
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 // the first 7 rows
+ interleave_1_h v1, v2, v3, v4, v5
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+216:
+ subs \h, \h, #4 // four output rows per iteration
+ load_h \sr2, \src, \s_strd, v16, v17, v18, v19
+ interleave_1_h v7, v16, v17, v18, v19
+ interleave_2_s v5, v6, v7, v16, v17, v18
+ uxtl_b v5, v6, v7, v16
+ mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_b 6, v30
+ st_h \d_strd, v30, 4
+ b.le 0f
+ cmp \h, #2
+ mov v1.16b, v5.16b // slide the row window down by four rows
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+ mov v7.16b, v19.16b
+ b.eq 26f // exactly two rows left: finish with the two-row tail
+ b 216b
+26:
+ load_h \sr2, \src, \s_strd, v16, v17
+ interleave_1_h v7, v16, v17
+ uxtl_b v5, v6, v7, v16
+ mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_b 6, v30
+ st_h \d_strd, v30, 2
+0:
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 480f // h > 4 (flags from the cmp at the top of the v path)
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy] // load only bytes 2..5 of the filter row: a 4-tap filter
+ sub \src, \src, \s_strd // start one row above
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // strides doubled: src/sr2 and dst/ds2 each walk every other row
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5 // each register now holds two consecutive rows
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ shift_store_4 \type, \d_strd, v6
+ b.le 0f // h <= 2 (flags from the cmp above): done
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ uxtl_b v5, v6
+ mul_mla_4 v7, v3, v4, v5, v6, .8h
+ shift_store_4 \type, \d_strd, v7
+0:
+ ret
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ ld1 {v0.8b}, [\xmy] // all 8 filter coefficients
+ sub \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above, src = three rows above
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 // the first 7 rows
+ interleave_1_s v16, v17, v18
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v16, v17
+ uxtl_b v18, v19, v20, v21
+
+48:
+ subs \h, \h, #4 // four output rows
+ load_s \sr2, \src, \s_strd, v23, v24, v25, v26
+ interleave_1_s v22, v23, v24, v25, v26
+ uxtl_b v22, v23, v24, v25
+ mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ shift_store_4 \type, \d_strd, v1, v2
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v27, v16
+ subs \h, \h, #2 // two output rows
+ interleave_1_s v26, v27, v16
+ uxtl_b v26, v27
+ mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ shift_store_4 \type, \d_strd, v1
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v17, v18
+ subs \h, \h, #2 // two output rows
+ interleave_1_s v16, v17, v18
+ uxtl_b v16, v17
+ mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ shift_store_4 \type, \d_strd, v2
+ b.le 0f
+ subs \h, \h, #4 // four output rows; the register window has wrapped back to v16..v22
+ load_s \sr2, \src, \s_strd, v19, v20, v21, v22
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v18, v19, v20, v21
+ mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ shift_store_4 \type, \d_strd, v1, v2
+ b.gt 48b
+0:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f // h > 4 (flags from the cmp at the top of the v path)
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy] // load only bytes 2..5 of the filter row: a 4-tap filter
+ sub \src, \src, \s_strd // start one row above
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // strides doubled: src/sr2 and dst/ds2 each walk every other row
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4, v5
+ mul_mla_4 v6, v1, v2, v3, v4, .8h // output row 0
+ mul_mla_4 v7, v2, v3, v4, v5, .8h // output row 1
+ shift_store_8 \type, \d_strd, v6, v7
+ b.le 0f // h <= 2 (flags from the cmp above): done
+ load_8b \sr2, \src, \s_strd, v6, v7
+ uxtl_b v6, v7
+ mul_mla_4 v1, v3, v4, v5, v6, .8h // output row 2
+ mul_mla_4 v2, v4, v5, v6, v7, .8h // output row 3
+ shift_store_8 \type, \d_strd, v1, v2
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmy] // all 8 filter coefficients
+ sub \src, \src, \s_strd // start three rows above
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h // keep h for the later 8-pixel-wide columns
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // strides doubled for this column; halved again at 9: below
+ lsl \d_strd, \d_strd, #1
+
+ load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 // the first 7 rows of this column
+ uxtl_b v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v23, v24
+ uxtl_b v23, v24
+ mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v25, v26
+ uxtl_b v25, v26
+ mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v27, v16
+ uxtl_b v27, v16
+ mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v17, v18
+ uxtl_b v17, v18
+ mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #4 // four output rows; the register window has wrapped back to v16..v22
+ load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
+ uxtl_b v19, v20, v21, v22
+ mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.gt 88b
+9:
+ subs \w, \w, #8 // one 8-pixel-wide column finished
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind by xmy rows; presumably xmy is the 64-bit view of my (which holds h) - confirm at the filter_fn invocation
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3 // src is 8 rows further on than dst because of the 7 rows loaded up front
+ mov \h, \my
+ add \src, \src, #8 // step to the next column
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 1680b // h > 4: use the generic 8-pixel-column loop
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy] // load only bytes 2..5 of the filter row: a 4-tap filter
+ sub \src, \src, \s_strd // start one row above
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // strides doubled: src/sr2 and dst/ds2 each walk every other row
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ cmp \h, #2
+ load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl v16.8h, v1.8b // low 8 pixels of rows 0..4 in v16..v20
+ uxtl v17.8h, v2.8b
+ uxtl v18.8h, v3.8b
+ uxtl v19.8h, v4.8b
+ uxtl v20.8h, v5.8b
+ uxtl2 v23.8h, v1.16b // high 8 pixels of rows 0..4 in v23..v27
+ uxtl2 v24.8h, v2.16b
+ uxtl2 v25.8h, v3.16b
+ uxtl2 v26.8h, v4.16b
+ uxtl2 v27.8h, v5.16b
+ mul_mla_4 v1, v16, v17, v18, v19, .8h // output row 0, low half
+ mul_mla_4 v16, v17, v18, v19, v20, .8h // output row 1, low half (overwrites v16, no longer needed)
+ mul_mla_4 v2, v23, v24, v25, v26, .8h // output row 0, high half
+ mul_mla_4 v17, v24, v25, v26, v27, .8h // output row 1, high half (overwrites v17, no longer needed)
+ shift_store_16 \type, \d_strd, v1, v2, v16, v17
+ b.le 0f // h <= 2 (flags from the cmp above): done
+ load_16b \sr2, \src, \s_strd, v6, v7
+ uxtl v21.8h, v6.8b
+ uxtl v22.8h, v7.8b
+ uxtl2 v28.8h, v6.16b
+ uxtl2 v29.8h, v7.16b
+ mul_mla_4 v1, v18, v19, v20, v21, .8h // output row 2, low half
+ mul_mla_4 v3, v19, v20, v21, v22, .8h // output row 3, low half
+ mul_mla_4 v2, v25, v26, v27, v28, .8h // output row 2, high half
+ mul_mla_4 v4, v26, v27, v28, v29, .8h // output row 3, high half
+ shift_store_16 \type, \d_strd, v1, v2, v3, v4
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b // index 0 = clz(w) - 24, i.e. w = 128
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv):
+ cmp \h, #4 // the flags of this compare are still live at the size-specific labels below
+ ubfx w9, \my, #7, #7 // w9 = bits 7..13 (8tap_v field)
+ and \my, \my, #0x7f // my = bits 0..6 (4tap_v field)
+ b.le 4f // h <= 4: keep the 4tap_v field
+ mov \my, w9 // h > 4: use the 8tap_v field
+4:
+ add \xmy, x10, \my, uxtw #3 // xmy = x10 + 8 * my: address of the selected 8-byte filter row
+
+ adr x9, L(\type\()_8tap_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1] // table entries are (table address - target address) as halfwords
+ sub x9, x9, w8, uxtw
+ br x9
+
+20:
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx] // horizontal: only bytes 2..5 of the filter row (4 taps)
+ b.gt 280f // h > 4: 8-tap vertical variant
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy] // vertical: only bytes 2..5 of the filter row (4 taps)
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1 // one pixel to the left
+ sub \src, \sr2, \s_strd // and one row above
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30 // save the return address: the bl calls below overwrite x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h // first row: element-wise products for output pixels 0 and 1
+ mul v29.4h, v29.4h, v0.4h
+ addp v28.4h, v28.4h, v29.4h // two pairwise adds reduce the four products of each pixel
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
+ bl L(\type\()_8tap_filter_2)
+
+ trn1 v16.2s, v16.2s, v28.2s // v16 = rows 0 and 1
+ mov v17.8b, v28.8b // v17 = rows 1 and 2
+
+2:
+ bl L(\type\()_8tap_filter_2) // v28 = the next two filtered rows
+
+ ext v18.8b, v17.8b, v28.8b, #4 // the row pair in between
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b // slide the row window down by two rows
+ mov v17.8b, v28.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy] // all 8 vertical coefficients
+ sub \src, \src, #1 // one pixel to the left
+ sub \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above, src = three rows above
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30 // save the return address: the bl calls below overwrite x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h // first row, same scheme as in the 2x2/2x4 case
+ mul v29.4h, v29.4h, v0.4h
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
+
+ bl L(\type\()_8tap_filter_2)
+ trn1 v16.2s, v16.2s, v28.2s // v16 = rows 0 and 1
+ mov v17.8b, v28.8b // v17 = rows 1 and 2
+ bl L(\type\()_8tap_filter_2)
+ ext v18.8b, v17.8b, v28.8b, #4 // v18 = rows 2 and 3
+ mov v19.8b, v28.8b // v19 = rows 3 and 4
+ bl L(\type\()_8tap_filter_2)
+ ext v20.8b, v19.8b, v28.8b, #4 // v20 = rows 4 and 5
+ mov v21.8b, v28.8b // v21 = rows 5 and 6
+
+28:
+ bl L(\type\()_8tap_filter_2) // v28 = rows 7 and 8
+ ext v22.8b, v21.8b, v28.8b, #4 // v22 = rows 6 and 7
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b // slide the row window down by two rows
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ b 28b
+
+0:
+ ret x15 // return through the address saved before the first bl
+
+L(\type\()_8tap_filter_2): // helper: 4-tap horizontal filter of two rows, two pixels each; result in v28.4h
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v30.8h, v30.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ ext v31.16b, v30.16b, v30.16b, #2
+ trn1 v27.2s, v28.2s, v30.2s // v27/v28/v30/v31 = pixels at offsets 0/1/2/3, sr2 row first, then src row
+ trn2 v30.2s, v28.2s, v30.2s
+ trn1 v28.2s, v29.2s, v31.2s
+ trn2 v31.2s, v29.2s, v31.2s
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v30.4h, v0.h[2]
+ mla v27.4h, v31.4h, v0.h[3]
+ srshr v28.4h, v27.4h, #2
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx] // horizontal: only bytes 2..5 of the filter row (4 taps)
+ b.gt 480f // h > 4: 8-tap vertical variant
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy] // vertical: only bytes 2..5 of the filter row (4 taps)
+ sub \sr2, \src, #1 // one pixel to the left
+ sub \src, \sr2, \s_strd // and one row above
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30 // save the return address: the bl calls below overwrite x30
+
+ // 4x2, 4x4 hv
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0] // first row filtered horizontally into v16
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b // rows 1 and 2
+ mov v18.8b, v29.8b
+
+4:
+ bl L(\type\()_8tap_filter_4) // v28, v29 = the next two filtered rows
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v28.4h, v1.h[2]
+ smlal v3.4s, v29.4h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b // slide the row window down by two rows
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy] // all 8 vertical coefficients
+ sub \src, \src, #1 // one pixel to the left
+ sub \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above, src = three rows above
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30 // save the return address: the bl calls below overwrite x30
+
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0] // first row filtered horizontally into v16
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b // rows 1 and 2
+ mov v18.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v28.8b // rows 3 and 4
+ mov v20.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v28.8b // rows 5 and 6
+ mov v22.8b, v29.8b
+
+48:
+ bl L(\type\()_8tap_filter_4) // v28, v29 = rows 7 and 8
+ smull v2.4s, v16.4h, v1.h[0] // first output row: rows 0..7
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+ smull v3.4s, v17.4h, v1.h[0] // second output row: rows 1..8
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v28.4h, v1.h[6]
+ smlal v3.4s, v29.4h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b // slide the row window down by two rows
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+ b 48b
+0:
+ ret x15 // return through the address saved before the first bl
+
+L(\type\()_8tap_filter_4): // helper: 4-tap horizontal filter of two rows, four pixels each; results in v28.4h (sr2 row) and v29.4h (src row)
+ ld1 {v26.8b}, [\sr2], \s_strd
+ ld1 {v27.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ uxtl v27.8h, v27.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0] // sr2 row accumulates in v31
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ ext v28.16b, v27.16b, v27.16b, #2
+ ext v29.16b, v27.16b, v27.16b, #4
+ ext v30.16b, v27.16b, v27.16b, #6
+ mul v27.4h, v27.4h, v0.h[0] // src row accumulates in v27
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v29.4h, v0.h[2]
+ mla v27.4h, v30.4h, v0.h[3]
+ srshr v28.4h, v31.4h, #2
+ srshr v29.4h, v27.4h, #2
+ ret
+
+80:
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f // h > 4: 8-tap vertical variant
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx] // horizontal: all 8 coefficients
+ ld1 {v1.s}[0], [\xmy] // vertical: only bytes 2..5 of the filter row (4 taps)
+ sub \src, \src, #3 // three pixels to the left
+ sub \src, \src, \s_strd // and one row above
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30 // save the return address: the bl calls below overwrite x30
+ mov \my, \h // keep h for the later 8-pixel-wide columns
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // strides doubled for this column; halved again at 9: below
+ lsl \s_strd, \s_strd, #1
+
+ bl L(\type\()_8tap_filter_8_first) // v16 = row 0
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b // rows 1 and 2
+ mov v18.16b, v25.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8) // writes only v24-v31, so the v2/v3 products started above survive the call
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v24.4h, v1.h[2]
+ smlal2 v5.4s, v24.8h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv // v2 = first output row, v4 = second output row
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b // slide the row window down by two rows
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ b 8b
+9:
+ subs \w, \w, #8 // one 8-pixel-wide column finished
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind by xmy rows; presumably xmy is the 64-bit view of my (which holds h) - confirm at the filter_fn invocation
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2 // src is 4 rows further on than dst because of the 3 rows filtered up front
+ mov \h, \my
+ add \src, \src, #8 // step to the next column
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx] // horizontal: all 8 coefficients
+ ld1 {v1.8b}, [\xmy] // vertical: all 8 coefficients
+ sub \src, \src, #3 // three pixels to the left
+ sub \src, \src, \s_strd // and three rows above
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30 // save the return address: the bl calls below overwrite x30
+ mov \my, \h // keep h for the later 8-pixel-wide columns
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // strides doubled for this column; halved again at 9: below
+ lsl \s_strd, \s_strd, #1
+
+ bl L(\type\()_8tap_filter_8_first) // v16 = row 0
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b // rows 1 and 2
+ mov v18.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v24.16b // rows 3 and 4
+ mov v20.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v24.16b // rows 5 and 6
+ mov v22.16b, v25.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8) // v24, v25 = rows 7 and 8; writes only v24-v31, so v2/v3 survive the call
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+ smlal v2.4s, v24.4h, v1.h[7]
+ smlal2 v3.4s, v24.8h, v1.h[7]
+ smlal v4.4s, v25.4h, v1.h[7]
+ smlal2 v5.4s, v25.8h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv // v2 = first output row, v4 = second output row
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b // slide the row window down by two rows
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+ b 88b
+9:
+ subs \w, \w, #8 // one 8-pixel-wide column finished
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind by xmy rows; presumably xmy is the 64-bit view of my (which holds h) - confirm at the filter_fn invocation
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3 // src is 8 rows further on than dst because of the 7 rows filtered up front
+ mov \h, \my
+ add \src, \src, #8 // step to the next column
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret x15 // return through the address saved before the first bl
+
+L(\type\()_8tap_filter_8_first): // helper: 8-tap horizontal filter of one row (read through src), 8 pixels; result in v16.8h
+ ld1 {v28.8b, v29.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ mul v16.8h, v28.8h, v0.h[0]
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mla v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+ mla v16.8h, v26.8h, v0.h[7]
+ srshr v16.8h, v16.8h, #2
+ ret
+
+L(\type\()_8tap_filter_8): // helper: 8-tap horizontal filter of two rows, 8 pixels each; results in v24.8h (sr2 row) and v25.8h (src row)
+ ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
+ ld1 {v30.8b, v31.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ uxtl v30.8h, v30.8b
+ uxtl v31.8h, v31.8b
+ mul v24.8h, v28.8h, v0.h[0]
+ mul v25.8h, v30.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v28.16b, v29.16b, #(2*\i) // rows shifted by i pixels
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b // index 0 = clz(w) - 24, i.e. w = 128
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1 // Bilinear filter entry point: builds the weight vectors and dispatches on which of mx/my are non-zero
+ dup v1.16b, \mx // v1 = mx in every byte
+ dup v3.16b, \my // v3 = my in every byte
+ mov w9, #16
+ sub w8, w9, \mx // 16 - mx
+ sub w9, w9, \my // 16 - my
+ dup v0.16b, w8 // v0 = 16 - mx in every byte
+ dup v2.16b, w9 // v2 = 16 - my in every byte
+.ifc \type, prep
+ uxtw \d_strd, \w // prep: output stride = w * 2 bytes (16-bit intermediates)
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz w8, \w
+ sub w8, w8, #24 // jump table index = clz(w) - 24 (0 for w=128 ... 6 for w=2)
+ cbnz \mx, L(\type\()_bilin_h) // mx != 0: horizontal filtering (the h path checks my for hv)
+ cbnz \my, L(\type\()_bilin_v) // mx == 0 and my != 0: vertical filtering only
+ b \type\()_neon // mx == my == 0: tail call the unfiltered routine (defined outside this view)
+
+L(\type\()_bilin_h): // Horizontal-only bilinear filter: out = px[x]*(16-mx) + px[x+1]*mx, weights in v0/v1
+ cbnz \my, L(\type\()_bilin_hv) // both mx and my non-zero: use the 2D path instead
+
+ adr x9, L(\type\()_bilin_h_tbl)
+ ldrh w8, [x9, x8, lsl #1] // table entry = distance from the table back to the handler
+ sub x9, x9, w8, uxtw // handler address = table address - entry
+ br x9
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows, dst/src the even rows
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // strides doubled: two rows per iteration
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.s}[0], [\src], \s_strd // 4 bytes of each row (2 output pixels need 3 input pixels)
+ ld1 {v6.s}[0], [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1 // same row shifted by one pixel
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.4h, v4.4h, v6.4h // pack the 2-pixel pairs of both rows into one vector
+ trn1 v5.4h, v5.4h, v7.4h
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b // px[x] * (16 - mx)
+ umlal v4.8h, v5.8b, v1.8b // + px[x+1] * mx
+ uqrshrn v4.8b, v4.8h, #4 // rounding shift right by 4, saturating narrow to 8 bit
+ st1 {v4.h}[0], [\dst], \d_strd // 2 pixels of the even row
+ st1 {v4.h}[1], [\ds2], \d_strd // 2 pixels of the odd row
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // two rows per iteration
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8b}, [\src], \s_strd // 8 bytes of each row
+ ld1 {v6.8b}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1 // rows shifted by one pixel
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.2s, v4.2s, v6.2s // low 4 pixels of both rows in one vector
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b // px[x] * (16 - mx)
+ umlal v4.8h, v5.8b, v1.8b // + px[x+1] * mx
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4 // put: round, shift by 4 and narrow to 8-bit pixels
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd // prep: store the unshifted 16-bit sums
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // two rows per iteration
+ lsl \s_strd, \s_strd, #1
+8:
+ ld1 {v4.16b}, [\src], \s_strd // 16 bytes of each row (9 are needed)
+ ld1 {v6.16b}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #1 // rows shifted by one pixel
+ ext v7.16b, v6.16b, v6.16b, #1
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b // px[x] * (16 - mx), both rows
+ umull v6.8h, v6.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b // + px[x+1] * mx
+ umlal v6.8h, v7.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4 // put: round, shift by 4 and narrow to 8-bit pixels
+ uqrshrn v6.8b, v6.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v6.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd // prep: store the unshifted 16-bit sums
+ st1 {v6.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw // the column loop advances the pointers itself, so the
+ sub \s_strd, \s_strd, #8 // row step becomes 2*stride - w - 8 (8 = initial preload)
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw // put: row step = 2*stride - w bytes already written
+.endif
+161:
+ ld1 {v16.d}[1], [\src], #8 // preload the first 8 pixels into the upper half
+ ld1 {v20.d}[1], [\sr2], #8
+ mov \mx, \w // mx is reused as the column counter (weights are already in v0/v1)
+
+16:
+ ld1 {v18.16b}, [\src], #16 // next 16 bytes of each row
+ ld1 {v22.16b}, [\sr2], #16
+ ext v17.16b, v16.16b, v18.16b, #8 // 16 consecutive pixels starting at the preloaded ones
+ ext v19.16b, v16.16b, v18.16b, #9 // the same window shifted by one pixel
+ ext v21.16b, v20.16b, v22.16b, #8
+ ext v23.16b, v20.16b, v22.16b, #9
+ umull v16.8h, v17.8b, v0.8b // px[x] * (16 - mx), low and high halves of both rows
+ umull2 v17.8h, v17.16b, v0.16b
+ umull v20.8h, v21.8b, v0.8b
+ umull2 v21.8h, v21.16b, v0.16b
+ umlal v16.8h, v19.8b, v1.8b // + px[x+1] * mx
+ umlal2 v17.8h, v19.16b, v1.16b
+ umlal v20.8h, v23.8b, v1.8b
+ umlal2 v21.8h, v23.16b, v1.16b
+ subs \mx, \mx, #16 // 16 columns done
+.ifc \type, put
+ uqrshrn v16.8b, v16.8h, #4 // put: round, shift by 4 and narrow to 8-bit pixels
+ uqrshrn2 v16.16b, v17.8h, #4
+ uqrshrn v20.8b, v20.8h, #4
+ uqrshrn2 v20.16b, v21.8h, #4
+ st1 {v16.16b}, [\dst], #16
+ st1 {v20.16b}, [\ds2], #16
+.else
+ st1 {v16.8h, v17.8h}, [\dst], #32 // prep: store the unshifted 16-bit sums
+ st1 {v20.8h, v21.8h}, [\ds2], #32
+.endif
+ b.le 9f // row pair finished
+
+ mov v16.16b, v18.16b // latest load becomes the "previous" data for the next 16 columns
+ mov v20.16b, v22.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd // step all four pointers to the next pair of rows
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl): // jump table indexed by clz(w) - 24; entries are offsets back from the table
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v): // Vertical-only bilinear filter: out = row[y]*(16-my) + row[y+1]*my, weights in v2/v3
+ cmp \h, #4 // NOTE(review): every handler below sets the flags again before branching on them, so this compare looks unused - confirm
+ adr x9, L(\type\()_bilin_v_tbl)
+ ldrh w8, [x9, x8, lsl #1] // table entry = distance from the table back to the handler
+ sub x9, x9, w8, uxtw // handler address = table address - entry
+ br x9
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ cmp \h, #2 // flags consumed by the b.gt below (the loads and adds do not touch them)
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // two rows per pointer step
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.h}[0], [\src], \s_strd // first input row (2 pixels)
+ b.gt 24f // h > 2: start with the 4-rows-at-a-time loop
+22:
+ ld1 {v17.h}[0], [\sr2], \s_strd // two more input rows
+ ld1 {v18.h}[0], [\src], \s_strd
+ trn1 v16.4h, v16.4h, v17.4h // v16 = rows y, y+1
+ trn1 v17.4h, v17.4h, v18.4h // v17 = rows y+1, y+2
+ umull v4.8h, v16.8b, v2.8b // upper row * (16 - my)
+ umlal v4.8h, v17.8b, v3.8b // + lower row * my
+ uqrshrn v4.8b, v4.8h, #4 // round, shift by 4, narrow to 8-bit pixels
+ st1 {v4.h}[0], [\dst]
+ st1 {v4.h}[1], [\ds2]
+ ret
+24: // 2x4, 2x6, 2x8, ... v
+ ld1 {v17.h}[0], [\sr2], \s_strd // four more input rows
+ ld1 {v18.h}[0], [\src], \s_strd
+ ld1 {v19.h}[0], [\sr2], \s_strd
+ ld1 {v20.h}[0], [\src], \s_strd
+ sub \h, \h, #4
+ trn1 v16.4h, v16.4h, v17.4h // pair up neighbouring rows
+ trn1 v17.4h, v17.4h, v18.4h
+ trn1 v18.4h, v18.4h, v19.4h
+ trn1 v19.4h, v19.4h, v20.4h
+ trn1 v16.2s, v16.2s, v18.2s // v16 = rows y..y+3
+ trn1 v17.2s, v17.2s, v19.2s // v17 = rows y+1..y+4
+ umull v4.8h, v16.8b, v2.8b // upper rows * (16 - my)
+ umlal v4.8h, v17.8b, v3.8b // + lower rows * my
+ cmp \h, #2 // how many rows remain: <2 done, ==2 tail, >2 loop
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst], \d_strd // four output rows, alternating between dst and ds2
+ st1 {v4.h}[1], [\ds2], \d_strd
+ st1 {v4.h}[2], [\dst], \d_strd
+ st1 {v4.h}[3], [\ds2], \d_strd
+ b.lt 0f
+ mov v16.8b, v20.8b // last loaded row becomes the first row of the next group
+ b.eq 22b // exactly two rows left: finish in the 2-row tail
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.s}[0], [\src], \s_strd // first input row (4 pixels)
+4:
+ ld1 {v17.s}[0], [\sr2], \s_strd // two more input rows
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s // v16 = rows y, y+1
+ trn1 v17.2s, v17.2s, v18.2s // v17 = rows y+1, y+2
+ umull v4.8h, v16.8b, v2.8b // upper row * (16 - my)
+ umlal v4.8h, v17.8b, v3.8b // + lower row * my
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4 // put: round, shift by 4 and narrow to 8-bit pixels
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd // prep: store the unshifted 16-bit sums
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b // last loaded row becomes the first row of the next pair
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd // ds2/sr2 address the odd rows
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8b}, [\src], \s_strd // first input row (8 pixels)
+8:
+ ld1 {v17.8b}, [\sr2], \s_strd // two more input rows
+ ld1 {v18.8b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b // output row 0: row y * (16 - my)
+ umull v5.8h, v17.8b, v2.8b // output row 1: row y+1 * (16 - my)
+ umlal v4.8h, v17.8b, v3.8b // + row y+1 * my
+ umlal v5.8h, v18.8b, v3.8b // + row y+2 * my
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4 // put: round, shift by 4 and narrow to 8-bit pixels
+ uqrshrn v5.8b, v5.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd // prep: store the unshifted 16-bit sums
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b // last loaded row becomes the first row of the next pair
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h // my is reused as a backup of h (weights are already in v2/v3)
+1:
+ add \ds2, \dst, \d_strd // set up one 16-pixel wide column strip
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.16b}, [\src], \s_strd // first input row of the strip
+2:
+ ld1 {v17.16b}, [\sr2], \s_strd // two more input rows
+ ld1 {v18.16b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b // output row 0: row y * (16 - my)
+ umull2 v5.8h, v16.16b, v2.16b
+ umull v6.8h, v17.8b, v2.8b // output row 1: row y+1 * (16 - my)
+ umull2 v7.8h, v17.16b, v2.16b
+ umlal v4.8h, v17.8b, v3.8b // + row y+1 * my
+ umlal2 v5.8h, v17.16b, v3.16b
+ umlal v6.8h, v18.8b, v3.8b // + row y+2 * my
+ umlal2 v7.8h, v18.16b, v3.16b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4 // put: round, shift by 4 and narrow to 8-bit pixels
+ uqrshrn2 v4.16b, v5.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ uqrshrn2 v6.16b, v7.8h, #4
+ st1 {v4.16b}, [\dst], \d_strd
+ st1 {v6.16b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd // prep: store the unshifted 16-bit sums
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f // strip finished
+ mov v16.16b, v18.16b // last loaded row becomes the first row of the next pair
+ b 2b
+9:
+ subs \w, \w, #16 // one 16-pixel strip done
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind src and dst by h rows (xmy: presumably the 64-bit view of my, which holds the saved h)
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1 // src advanced two rows more than dst (h + 2 rows were read)
+ mov \h, \my // restore the row counter
+ add \src, \src, #16 // move 16 pixels to the right
+.ifc \type, put
+ add \dst, \dst, #16 // put: 16 output bytes per strip row
+.else
+ add \dst, \dst, #32 // prep: 16 outputs of 2 bytes each
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl): // jump table indexed by clz(w) - 24; entries are offsets back from the table
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv): // 2D bilinear filter: each row is filtered horizontally (8-bit weights v0/v1), then neighbouring rows are blended with 16-bit weights v2/v3
+ uxtl v2.8h, v2.8b // widen 16 - my to 16 bit for the 16-bit vertical multiplies
+ uxtl v3.8h, v3.8b // widen my to 16 bit
+ adr x9, L(\type\()_bilin_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1] // table entry = distance from the table back to the handler
+ sub x9, x9, w8, uxtw // handler address = table address - entry
+ br x9
+
+20: // 2xN hv
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \sr2, \src, \s_strd // sr2/ds2 address the odd rows
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.s}[0], [\src], \s_strd // first input row
+ ext v29.8b, v28.8b, v28.8b, #1 // shifted by one pixel
+ umull v16.8h, v28.8b, v0.8b // v16 = horizontally filtered first row
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.s}[0], [\sr2], \s_strd // two more input rows
+ ld1 {v30.s}[0], [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1 // shifted by one pixel
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.4h, v28.4h, v30.4h // pack the 2-pixel pairs of both rows into one vector
+ trn1 v29.4h, v29.4h, v31.4h
+ umull v17.8h, v28.8b, v0.8b // v17 = horizontally filtered rows y+1, y+2
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2s, v16.2s, v17.2s // v16 = filtered rows y, y+1
+
+ mul v4.4h, v16.4h, v2.4h // upper rows * (16 - my)
+ mla v4.4h, v17.4h, v3.4h // + lower rows * my
+ uqrshrn v4.8b, v4.8h, #8 // round, shift by 8 (4 bits per pass), narrow to 8-bit pixels
+ subs \h, \h, #2
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s // carry filtered row y+2 over as the first row of the next pair
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ AARCH64_VALID_JUMP_TARGET
+ add \sr2, \src, \s_strd // sr2/ds2 address the odd rows
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.8b}, [\src], \s_strd // first input row
+ ext v29.8b, v28.8b, v28.8b, #1 // shifted by one pixel
+ umull v16.8h, v28.8b, v0.8b // v16 = horizontally filtered first row
+ umlal v16.8h, v29.8b, v1.8b
+
+4:
+ ld1 {v28.8b}, [\sr2], \s_strd // two more input rows
+ ld1 {v30.8b}, [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1 // shifted by one pixel
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.2s, v28.2s, v30.2s // low 4 pixels of both rows in one vector
+ trn1 v29.2s, v29.2s, v31.2s
+ umull v17.8h, v28.8b, v0.8b // v17 = horizontally filtered rows y+1, y+2
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2d, v16.2d, v17.2d // v16 = filtered rows y, y+1
+
+ mul v4.8h, v16.8h, v2.8h // upper rows * (16 - my)
+ mla v4.8h, v17.8h, v3.8h // + lower rows * my
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8 // put: round, shift by 8 and narrow to 8-bit pixels
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4 // prep: rounding shift by 4, keep 16-bit values
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d // carry filtered row y+2 over as the first row of the next pair
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h // my is reused as a backup of h (weights are already in v2/v3)
+
+1:
+ add \sr2, \src, \s_strd // set up one 8-pixel wide column strip
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1 // two rows per iteration
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.16b}, [\src], \s_strd // first input row of the strip
+ ext v29.16b, v28.16b, v28.16b, #1 // shifted by one pixel
+ umull v16.8h, v28.8b, v0.8b // v16 = horizontally filtered first row
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.16b}, [\sr2], \s_strd // two more input rows
+ ld1 {v30.16b}, [\src], \s_strd
+ ext v29.16b, v28.16b, v28.16b, #1 // shifted by one pixel
+ ext v31.16b, v30.16b, v30.16b, #1
+ umull v17.8h, v28.8b, v0.8b // v17 = horizontally filtered row y+1
+ umlal v17.8h, v29.8b, v1.8b
+ umull v18.8h, v30.8b, v0.8b // v18 = horizontally filtered row y+2
+ umlal v18.8h, v31.8b, v1.8b
+
+ mul v4.8h, v16.8h, v2.8h // output row 0 = row y * (16 - my) + row y+1 * my
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h // output row 1 = row y+1 * (16 - my) + row y+2 * my
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8 // put: round, shift by 8 and narrow to 8-bit pixels
+ uqrshrn v5.8b, v5.8h, #8
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4 // prep: rounding shift by 4, keep 16-bit values
+ urshr v5.8h, v5.8h, #4
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f // strip finished
+ mov v16.16b, v18.16b // filtered row y+2 becomes the first row of the next pair
+ b 2b
+9:
+ subs \w, \w, #8 // one 8-pixel strip done
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind src and dst by h rows (xmy: presumably the 64-bit view of my, which holds the saved h)
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1 // src advanced two rows more than dst (h + 2 rows were read)
+ mov \h, \my // restore the row counter
+ add \src, \src, #8 // move 8 pixels to the right
+.ifc \type, put
+ add \dst, \dst, #8 // put: 8 output bytes per strip row
+.else
+ add \dst, \dst, #16 // prep: 8 outputs of 2 bytes each
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl): // jump table indexed by clz(w) - 24; entries are offsets back from the table
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 // instantiate the filter functions for type=put; the macro header is outside this view, the last argument is presumably shift_hv - confirm
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 // instantiate them for type=prep with a different register assignment
+
+.macro load_filter_row dst, src, inc // Load the 8-byte filter row selected by position src into dst, then step src by inc; clobbers w13
+ asr w13, \src, #10 // row index = position >> 10 (arithmetic shift, may be negative)
+ add \src, \src, \inc // advance the position for the next load
+ ldr \dst, [x11, w13, sxtw #3] // dst = 8 bytes at x11 + index * 8 (x11 is the filter table base set by the caller)
+.endm
+
+function warp_filter_horz_neon // Horizontal warp pass for one row: 8 outputs, each with its own 8-tap filter row; sums returned in v0.8h. Inputs: x2 src, x3 stride, w5 position, w7/w8 steps, x11 table, v22 = 128
+ add w12, w5, #512 // filter position for the first output pixel, with a 512 rounding offset
+
+ ld1 {v16.8b, v17.8b}, [x2], x3 // 16 source pixels, then step x2 to the next row
+
+ load_filter_row d0, w12, w7 // d0-d7: one 8-tap filter row per output pixel, position stepped by w7
+ load_filter_row d1, w12, w7
+ load_filter_row d2, w12, w7
+ load_filter_row d3, w12, w7
+ load_filter_row d4, w12, w7
+ load_filter_row d5, w12, w7
+ load_filter_row d6, w12, w7
+ // subtract by 128 to allow using smull
+ eor v16.8b, v16.8b, v22.8b // flipping bit 7 maps unsigned 0..255 to signed -128..127
+ eor v17.8b, v17.8b, v22.8b
+ load_filter_row d7, w12, w7
+
+ ext v18.8b, v16.8b, v17.8b, #1 // source window for output pixel 1
+ ext v19.8b, v16.8b, v17.8b, #2 // ... pixel 2
+ smull v0.8h, v0.8b, v16.8b // per-tap products for output pixel 0
+ smull v1.8h, v1.8b, v18.8b // ... pixel 1
+ ext v18.8b, v16.8b, v17.8b, #3
+ ext v20.8b, v16.8b, v17.8b, #4
+ smull v2.8h, v2.8b, v19.8b
+ smull v3.8h, v3.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #5
+ ext v19.8b, v16.8b, v17.8b, #6
+ smull v4.8h, v4.8b, v20.8b
+ smull v5.8h, v5.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #7
+ smull v6.8h, v6.8b, v19.8b
+ smull v7.8h, v7.8b, v18.8b
+
+ addp v0.8h, v0.8h, v1.8h // three rounds of pairwise adds reduce the 8 products
+ addp v2.8h, v2.8h, v3.8h // of every output pixel to a single sum
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+
+ addp v0.8h, v0.8h, v4.8h // v0.h[i] = filtered value of output pixel i
+
+ add w5, w5, w8 // step the start position for the next row
+
+ ret
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my)
+.macro warp t, shift // Emits warp_affine_8x8<t>_8bpc_neon; t empty = 8-bit pixel output, t non-empty = 16-bit output; shift = final vertical shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+ ldr x4, [x4] // load all four int16_t abcd values as one 64-bit word
+ sbfx x7, x4, #0, #16 // x7 = abcd[0], sign extended (horizontal step per pixel)
+ sbfx x8, x4, #16, #16 // x8 = abcd[1] (horizontal position step per row)
+ sbfx x9, x4, #32, #16 // x9 = abcd[2] (vertical step per pixel)
+ sbfx x4, x4, #48, #16 // x4 = abcd[3] (vertical position step per row)
+ mov w10, #8 // 8 output rows
+ sub x2, x2, x3, lsl #1 // src -= 3 * src_stride + 3: start of the 8-tap window
+ sub x2, x2, x3
+ sub x2, x2, #3
+ movrel x11, X(mc_warp_filter), 64*8 // filter table base, offset by 64 rows of 8 bytes
+ mov x15, x30 // save the return address, bl below overwrites x30
+.ifnb \t
+ lsl x1, x1, #1 // 16-bit variant: double the dst stride
+.endif
+
+ movi v22.8b, #128 // XOR constant used by warp_filter_horz_neon
+.ifb \t
+ movi v23.8h, #128 // offset added after the vertical pass (presumably undoes the -128 pixel bias - confirm)
+.else
+ movi v23.8h, #8, lsl #8 // 2048: the same offset for the 16-bit variant
+.endif
+
+ bl warp_filter_horz_neon // prime v24-v30 with the first 7 horizontally filtered rows
+ srshr v24.8h, v0.8h, #3 // each row is rounded and shifted right by 3
+ bl warp_filter_horz_neon
+ srshr v25.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v26.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v27.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v28.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v29.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v30.8h, v0.8h, #3
+
+1:
+ add w14, w6, #512 // vertical filter position for this row, with a 512 rounding offset
+ bl warp_filter_horz_neon // 8th row of the vertical window
+ srshr v31.8h, v0.8h, #3
+
+ load_filter_row d0, w14, w9 // one vertical 8-tap filter row per output column
+ load_filter_row d1, w14, w9
+ load_filter_row d2, w14, w9
+ load_filter_row d3, w14, w9
+ load_filter_row d4, w14, w9
+ load_filter_row d5, w14, w9
+ load_filter_row d6, w14, w9
+ load_filter_row d7, w14, w9
+ transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // helper defined outside this view; presumably transposes the 8x8 bytes and sign extends to 16 bit so that vN holds tap N of every column
+
+ // This ordering of smull/smlal/smull2/smlal2 is highly
+ // beneficial for Cortex A53 here.
+ smull v16.4s, v24.4h, v0.4h // columns 0-3: sum over the 8 rows of row * tap
+ smlal v16.4s, v25.4h, v1.4h
+ smlal v16.4s, v26.4h, v2.4h
+ smlal v16.4s, v27.4h, v3.4h
+ smlal v16.4s, v28.4h, v4.4h
+ smlal v16.4s, v29.4h, v5.4h
+ smlal v16.4s, v30.4h, v6.4h
+ smlal v16.4s, v31.4h, v7.4h
+ smull2 v17.4s, v24.8h, v0.8h // columns 4-7
+ smlal2 v17.4s, v25.8h, v1.8h
+ smlal2 v17.4s, v26.8h, v2.8h
+ smlal2 v17.4s, v27.8h, v3.8h
+ smlal2 v17.4s, v28.8h, v4.8h
+ smlal2 v17.4s, v29.8h, v5.8h
+ smlal2 v17.4s, v30.8h, v6.8h
+ smlal2 v17.4s, v31.8h, v7.8h
+
+ mov v24.16b, v25.16b // slide the row window up by one (interleaved with the narrowing)
+ mov v25.16b, v26.16b
+ sqrshrn v16.4h, v16.4s, #\shift // round, shift and saturate to 16 bit
+ mov v26.16b, v27.16b
+ sqrshrn2 v16.8h, v17.4s, #\shift
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+ add v16.8h, v16.8h, v23.8h // add the variant specific offset
+.ifb \t
+ sqxtun v16.8b, v16.8h // pixel variant: saturate to unsigned 8 bit
+.endif
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ subs w10, w10, #1 // one output row done
+.ifnb \t
+ st1 {v16.8h}, [x0], x1 // 8 x 16-bit outputs
+.else
+ st1 {v16.8b}, [x0], x1 // 8 pixels
+.endif
+
+ add w6, w6, w4 // step the vertical start position for the next row
+ b.gt 1b
+
+ ret x15 // return through the saved link register
+endfunc
+.endm
+
+warp , 11 // pixel variant: final shift 11
+warp t, 7 // 16-bit variant: final shift 7
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1 // Fill a bw x bh block at dst from ref, replicating edge pixels where the block lies outside the iw x ih image
+ ldp x8, x9, [sp] // x8 = ref, x9 = ref_stride (9th and 10th arguments are on the stack)
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6 // dst += top_ext * dst_stride
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.16b}, [x8] // replicate the first source pixel of the row
+ mov x12, x6 // out = dst
+ mov x3, x4 // remaining left_ext bytes
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16 // whole 16-byte stores: may run past left_ext, the center copy below starts at dst + left_ext and overwrites the excess
+ b.gt 1b
+.endif
+ mov x13, x8 // in = ref
+ add x12, x6, x4 // out = dst + left_ext
+ mov x3, x2 // remaining center_w bytes
+1:
+ ld1 {v0.16b, v1.16b}, [x13], #32 // copy the center in 32-byte chunks (may read and write past center_w)
+ subs x3, x3, #32
+ st1 {v0.16b, v1.16b}, [x12], #32
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2 // in + center_w
+ sub x3, x3, #1 // in + center_w - 1
+ add x12, x6, x4 // dst + left_ext
+ ld1r {v0.16b}, [x3] // replicate the last valid source pixel of the row
+ add x12, x12, x2 // out = dst + left_ext + center_w
+ mov x3, x11 // remaining right_ext bytes
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16 // whole 16-byte stores: may write up to 15 bytes past right_ext
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7 // next dst row
+ add x8, x8, x9 // next ref row
+ b.gt 0b
+.endm
+
+ cbz x4, 2f // left_ext == 0?
+ // need_left
+ cbz x11, 3f // right_ext == 0?
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f // right_ext == 0?
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f // bottom_ext == 0: skip
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0 // column counter = bw
+1:
+ ld1 {v0.16b, v1.16b}, [x8], #32 // 32 bytes of the last written row
+ mov x3, x10 // bottom_ext rows to fill
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7 // replicate downwards
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f // top_ext == 0: done
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.16b, v1.16b}, [x14], #32 // 32 bytes of the first center row
+ mov x3, x5 // top_ext rows to fill
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7 // replicate over the rows above
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
new file mode 100644
index 0000000000..1bfb12ebb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -0,0 +1,3611 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d1, t0, t1, t2, t3 // Average 16 intermediates from [x2] and [x3] into d0/d1 (8 x 16 bit each); t0-t3 are scratch, v28/v29 are constants set up by bidir_fn
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32 // 16 values of tmp1, pointer post-incremented
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32 // 16 values of tmp2
+ sqadd \t0\().8h, \t0\().8h, \t2\().8h // saturating tmp1 + tmp2
+ sqadd \t1\().8h, \t1\().8h, \t3\().8h
+ smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
+ sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d1, t0, t1, t2, t3 // Weighted average of 16 intermediates: tmp2 + (((tmp2 - tmp1) * v27) >> 4), then scaled and clipped; v27 = negated weight, v28-v31 constants from bidir_fn
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32 // 16 values of tmp1
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32 // 16 values of tmp2
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h // tmp2 - tmp1, widened to 32 bit
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v27.4s // * negated weight
+ mul \t0\().4s, \t0\().4s, v27.4s
+ mul \d1\().4s, \d1\().4s, v27.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #4 // >> 4
+ sshr \t0\().4s, \t0\().4s, #4
+ sshr \d1\().4s, \d1\().4s, #4
+ sshr \t1\().4s, \t1\().4s, #4
+ saddw \d0\().4s, \d0\().4s, \t2\().4h // + tmp2
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
+.macro mask d0, d1, t0, t1, t2, t3 // Per-pixel masked blend of 16 intermediates: tmp2 + (((tmp2 - tmp1) * -m) >> 6), then scaled and clipped; mask bytes come from [x6]; clobbers v24-v27
+ ld1 {v27.16b}, [x6], 16 // 16 mask bytes
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32 // 16 values of tmp1
+ neg v27.16b, v27.16b // -m
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32 // 16 values of tmp2
+ sxtl v26.8h, v27.8b // sign extend -m to 16 bit ...
+ sxtl2 v27.8h, v27.16b
+ sxtl v24.4s, v26.4h // ... and on to 32 bit: v24-v27 hold -m for pixels 0-15
+ sxtl2 v25.4s, v26.8h
+ sxtl v26.4s, v27.4h
+ sxtl2 v27.4s, v27.8h
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h // tmp2 - tmp1, widened to 32 bit
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v24.4s // * -m
+ mul \t0\().4s, \t0\().4s, v25.4s
+ mul \d1\().4s, \d1\().4s, v26.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #6 // >> 6
+ sshr \t0\().4s, \t0\().4s, #6
+ sshr \d1\().4s, \d1\().4s, #6
+ sshr \t1\().4s, \t1\().4s, #6
+ saddw \d0\().4s, \d0\().4s, \t2\().4h // + tmp2
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
+.macro bidir_fn type, bdmax // Emits <type>_16bpc_neon around the macro named by type (avg, w_avg or mask); bdmax = register holding bitdepth_max. Uses x0 dst, x1 stride, x2/x3 tmp1/tmp2, w4 width, w5 height
+function \type\()_16bpc_neon, export=1
+ clz w4, w4 // clz(w), turned into the jump table index below
+.ifnc \type, avg
+ dup v31.8h, \bdmax // bitdepth_max
+ movi v30.8h, #0
+.endif
+ clz w7, \bdmax
+ sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov w9, #1
+ mov w8, #-2*PREP_BIAS
+ lsl w9, w9, w7 // 1 << intermediate_bits
+ add w7, w7, #1
+ sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
+ neg w7, w7 // -(intermediate_bits+1)
+ dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
+ dup v29.8h, w7 // -(intermediate_bits+1)
+.else
+ mov w8, #PREP_BIAS
+ lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
+ neg w7, w7 // -intermediate_bits
+ dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
+ dup v29.8h, w7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ dup v27.4s, w6 // weight ...
+ neg v27.4s, v27.4s // ... negated, as the w_avg macro expects
+.endif
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24 // table index = clz(w) - 24 (0 for w=128 ... 5 for w=4)
+ \type v4, v5, v0, v1, v2, v3 // compute the first 16 outputs before dispatching
+ ldrh w4, [x7, x4, lsl #1] // table entry = distance from the table back to the handler
+ sub x7, x7, w4, uxtw // handler address = table address - entry
+ br x7
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1 // x7 addresses the odd rows
+ lsl x1, x1, #1 // two rows per pointer step
+4:
+ subs w5, w5, #4 // w=4: 16 outputs cover four rows
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 4b
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1 // x7 addresses the odd rows
+ lsl x1, x1, #1 // two rows per pointer step
+8:
+ st1 {v4.8h}, [x0], x1 // w=8: 16 outputs cover two rows
+ subs w5, w5, #2
+ st1 {v5.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 8b
+16:
+ AARCH64_VALID_JUMP_TARGET
+ \type v6, v7, v0, v1, v2, v3 // w=16: second row; two rows per iteration
+ st1 {v4.8h, v5.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 16b
+32:
+ AARCH64_VALID_JUMP_TARGET
+ \type v6, v7, v0, v1, v2, v3 // w=32: second half of the row
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 // one full 64-byte row
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 32b
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64 // x7 addresses the second 64 bytes of each row
+64:
+ \type v6, v7, v0, v1, v2, v3 // w=64: one 128-byte row per iteration
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 64b
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64 // x7 runs 64 bytes ahead of x0
+ mov x8, #128 // step from the first to the second half of a row
+ sub x1, x1, #128 // step from the second half to the start of the next row
+128:
+ \type v6, v7, v0, v1, v2, v3 // w=128: one 256-byte row per iteration, in four 64-byte stores
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
+ \type v4, v5, v0, v1, v2, v3
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl): // jump table indexed by clz(w) - 24: w = 128, 64, 32, 16, 8, 4
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 32b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+bidir_fn avg, w6 // avg: bitdepth_max is passed in w6
+bidir_fn w_avg, w7 // w_avg: weight in w6, bitdepth_max in w7
+bidir_fn mask, w7 // mask: mask pointer in x6, bitdepth_max in w7
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+ ldr w8, [sp]
+ clz w9, w4
+ adr x10, L(w_mask_\type\()_tbl)
+ dup v31.8h, w8 // bitdepth_max
+ sub w9, w9, #24
+ clz w8, w8 // clz(bitdepth_max)
+ ldrh w9, [x10, x9, lsl #1]
+ sub x10, x10, w9, uxtw
+ sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+ mov w9, #PREP_BIAS*64
+ neg w8, w8 // -sh
+ mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
+ dup v30.4s, w9 // PREP_BIAS*64
+ dup v29.4s, w8 // -sh
+ dup v0.8h, w11
+.if \type == 444
+ movi v1.16b, #64
+.elseif \type == 422
+ dup v2.8b, w7
+ movi v3.8b, #129
+ sub v3.8b, v3.8b, v2.8b
+.elseif \type == 420
+ dup v2.8h, w7
+ movi v3.8h, #1, lsl #8
+ sub v3.8h, v3.8h, v2.8h
+.endif
+ add x12, x0, x1
+ lsl x1, x1, #1
+ br x10
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+ subs w5, w5, #4
+ sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v7.8h
+ ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v6.8h, v4.8h
+ ssubl v18.4s, v7.4h, v5.4h
+ ssubl2 v19.4s, v7.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
+ sshll v6.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ uxtl v22.4s, v20.4h
+ uxtl2 v23.4s, v20.8h
+ uxtl v24.4s, v21.4h
+ uxtl2 v25.4s, v21.8h
+ mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
+ mla v5.4s, v17.4s, v23.4s
+ mla v6.4s, v18.4s, v24.4s
+ mla v7.4s, v19.4s, v25.4s
+ srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v6.4s
+ sqxtun2 v5.8h, v7.4s
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ sub v20.16b, v1.16b, v20.16b // m
+ st1 {v20.16b}, [x6], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ xtn v20.8b, v20.8h
+ uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+ st1 {v20.8b}, [x6], #8
+.elseif \type == 420
+ trn1 v24.2d, v20.2d, v21.2d
+ trn2 v25.2d, v20.2d, v21.2d
+ add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
+ addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.s}[0], [x6], #4
+.endif
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x12], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x12], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2
+ subs w5, w5, #2
+ sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v7.8h
+ ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v6.8h, v4.8h
+ ssubl v18.4s, v7.4h, v5.4h
+ ssubl2 v19.4s, v7.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
+ sshll v6.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ uxtl v22.4s, v20.4h
+ uxtl2 v23.4s, v20.8h
+ uxtl v24.4s, v21.4h
+ uxtl2 v25.4s, v21.8h
+ mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
+ mla v5.4s, v17.4s, v23.4s
+ mla v6.4s, v18.4s, v24.4s
+ mla v7.4s, v19.4s, v25.4s
+ srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v6.4s
+ sqxtun2 v5.8h, v7.4s
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ sub v20.16b, v1.16b, v20.16b // m
+ st1 {v20.16b}, [x6], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ xtn v20.8b, v20.8h
+ uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+ st1 {v20.8b}, [x6], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
+ addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.s}[0], [x6], #4
+.endif
+ st1 {v4.8h}, [x0], x1
+ st1 {v5.8h}, [x12], x1
+ b.gt 8b
+ ret
+1280:
+640:
+320:
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov w11, w4
+ sub x1, x1, w4, uxtw #1
+.if \type == 444
+ add x10, x6, w4, uxtw
+.elseif \type == 422
+ add x10, x6, x11, lsr #1
+.endif
+ add x9, x3, w4, uxtw #1
+ add x7, x2, w4, uxtw #1
+161:
+ mov w8, w4
+16:
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
+ ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2
+ ld1 {v6.8h, v7.8h}, [x7], #32
+ ld1 {v18.8h, v19.8h}, [x9], #32
+ subs w8, w8, #16
+ sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v17.8h
+ ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v23.4s, v16.8h, v4.8h
+ ssubl v24.4s, v17.4h, v5.4h
+ ssubl2 v25.4s, v17.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v27.4s, v5.8h, #6 // tmp1 << 6
+ sshll v26.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v26.4s, v26.4s, v30.4s
+ add v27.4s, v27.4s, v30.4s
+ uxtl v16.4s, v20.4h
+ uxtl2 v17.4s, v20.8h
+ uxtl v28.4s, v21.4h
+ mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
+ uxtl2 v16.4s, v21.8h
+ mla v5.4s, v23.4s, v17.4s
+ mla v26.4s, v24.4s, v28.4s
+ mla v27.4s, v25.4s, v16.4s
+ srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v26.4s, v26.4s, v29.4s
+ srshl v27.4s, v27.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v26.4s
+ sqxtun2 v5.8h, v27.4s
+
+ // Start of other half
+ sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
+ sabd v23.8h, v7.8h, v19.8h
+
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+
+ ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v18.8h, v6.8h
+ ssubl v18.4s, v19.4h, v7.4h
+ ssubl2 v19.4s, v19.8h, v7.8h
+ uqsub v22.8h, v0.8h, v22.8h // 27615 - abs()
+ uqsub v23.8h, v0.8h, v23.8h
+ sshll v24.4s, v6.4h, #6 // tmp1 << 6
+ sshll2 v25.4s, v6.8h, #6
+ sshll v26.4s, v7.4h, #6
+ sshll2 v27.4s, v7.8h, #6
+ ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v23.8h, v23.8h, #10
+ add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
+ add v25.4s, v25.4s, v30.4s
+ add v26.4s, v26.4s, v30.4s
+ add v27.4s, v27.4s, v30.4s
+ uxtl v6.4s, v22.4h
+ uxtl2 v7.4s, v22.8h
+ uxtl v28.4s, v23.4h
+ mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
+ uxtl2 v6.4s, v23.8h
+ mla v25.4s, v17.4s, v7.4s
+ mla v26.4s, v18.4s, v28.4s
+ mla v27.4s, v19.4s, v6.4s
+ srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v25.4s, v25.4s, v29.4s
+ srshl v26.4s, v26.4s, v29.4s
+ srshl v27.4s, v27.4s, v29.4s
+ sqxtun v6.4h, v24.4s // iclip_pixel
+ sqxtun2 v6.8h, v25.4s
+ sqxtun v7.4h, v26.4s
+ sqxtun2 v7.8h, v27.4s
+ umin v6.8h, v6.8h, v31.8h // iclip_pixel
+ umin v7.8h, v7.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ uzp1 v21.16b, v22.16b, v23.16b
+ sub v20.16b, v1.16b, v20.16b // m
+ sub v21.16b, v1.16b, v21.16b
+ st1 {v20.16b}, [x6], #16
+ st1 {v21.16b}, [x10], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ addp v21.8h, v22.8h, v23.8h
+ xtn v20.8b, v20.8h
+ xtn v21.8b, v21.8h
+ uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
+ uhsub v21.8b, v3.8b, v21.8b
+ st1 {v20.8b}, [x6], #8
+ st1 {v21.8b}, [x10], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
+ add v21.8h, v21.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.8b}, [x6], #8
+.endif
+ st1 {v4.8h, v5.8h}, [x0], #32
+ st1 {v6.8h, v7.8h}, [x12], #32
+ b.gt 16b
+ subs w5, w5, #2
+ add x2, x2, w4, uxtw #1
+ add x3, x3, w4, uxtw #1
+ add x7, x7, w4, uxtw #1
+ add x9, x9, w4, uxtw #1
+.if \type == 444
+ add x6, x6, w4, uxtw
+ add x10, x10, w4, uxtw
+.elseif \type == 422
+ add x6, x6, x11, lsr #1
+ add x10, x10, x11, lsr #1
+.endif
+ add x0, x0, x1
+ add x12, x12, x1
+ b.gt 161b
+ ret
+L(w_mask_\type\()_tbl):
+ .hword L(w_mask_\type\()_tbl) - 1280b
+ .hword L(w_mask_\type\()_tbl) - 640b
+ .hword L(w_mask_\type\()_tbl) - 320b
+ .hword L(w_mask_\type\()_tbl) - 160b
+ .hword L(w_mask_\type\()_tbl) - 8b
+ .hword L(w_mask_\type\()_tbl) - 4b
+endfunc
+.endm
+
+w_mask_fn 444 // instantiate the macro with type == 444: its .if branches store one mask byte per pixel
+w_mask_fn 422 // type == 422: mask values are summed pairwise along a row, one byte per two pixels
+w_mask_fn 420 // type == 420: mask values are summed over two rows and pairwise along the row, one byte per 2x2 pixels
+
+
+function blend_16bpc_neon, export=1 // blends b = [x2] into a = [x0] in place, weighted by the mask bytes m at [x5]
+ adr x6, L(blend_tbl) // x6 = base of the offset table at the end of this function
+ clz w3, w3
+ sub w3, w3, #26 // table index = clz(w3) - 26: 0 for w3 == 32 up to 3 for w3 == 4
+ ldrh w3, [x6, x3, lsl #1] // each entry holds (table address - label address)
+ sub x6, x6, w3, uxtw // x6 = address of the selected entry point
+ add x8, x0, x1 // x8 = x0 + x1, pointer to the second row
+ br x6
+40: // w3 == 4: two rows of 4 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1 // x0 and x8 each step two rows at a time
+4:
+ ld1 {v2.8b}, [x5], #8 // 8 mask bytes, 4 per row
+ ld1 {v1.8h}, [x2], #16 // b for both rows, read contiguously
+ ld1 {v0.d}[0], [x0] // a, first row
+ neg v2.8b, v2.8b // -m
+ subs w4, w4, #2 // w4 = rows left, flags consumed by b.gt below
+ ld1 {v0.d}[1], [x8] // a, second row
+ sxtl v2.8h, v2.8b // widen -m to 16 bits
+ shl v2.8h, v2.8h, #9 // -m << 9
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h // a + (((a-b)*-m + 32) >> 6)
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x8], x1
+ b.gt 4b
+ ret
+80: // w3 == 8: two rows of 8 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1 // x0 and x8 each step two rows at a time
+8:
+ ld1 {v4.16b}, [x5], #16 // 16 mask bytes, 8 per row
+ ld1 {v2.8h, v3.8h}, [x2], #32 // b: v2 = first row, v3 = second row
+ neg v5.16b, v4.16b // -m
+ ld1 {v0.8h}, [x0] // a, first row
+ ld1 {v1.8h}, [x8] // a, second row
+ sxtl v4.8h, v5.8b // widen -m for the first row
+ sxtl2 v5.8h, v5.16b // widen -m for the second row
+ shl v4.8h, v4.8h, #9 // -m << 9
+ shl v5.8h, v5.8h, #9
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ subs w4, w4, #2 // w4 = rows left
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v5.8h
+ add v0.8h, v0.8h, v2.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+160: // w3 == 16: two rows of 16 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1 // x0 and x8 each step two rows at a time
+16:
+ ld1 {v16.16b, v17.16b}, [x5], #32 // 32 mask bytes: v16 = first row, v17 = second row
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: v4-v5 = first row, v6-v7 = second row
+ subs w4, w4, #2 // w4 = rows left
+ neg v18.16b, v16.16b // -m
+ neg v19.16b, v17.16b
+ ld1 {v0.8h, v1.8h}, [x0] // a, first row
+ sxtl v16.8h, v18.8b // widen -m to 16 bits
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ ld1 {v2.8h, v3.8h}, [x8] // a, second row
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ shl v18.8h, v18.8h, #9
+ shl v19.8h, v19.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v17.8h
+ sqrdmulh v6.8h, v6.8h, v18.8h
+ sqrdmulh v7.8h, v7.8h, v19.8h
+ add v0.8h, v0.8h, v4.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v2.8h, v3.8h}, [x8], x1
+ b.gt 16b
+ ret
+32: // w3 == 32: one row of 32 pixels per iteration, x1 is used unmodified and x8 is unused
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b}, [x5], #32 // 32 mask bytes for this row
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b for this row
+ subs w4, w4, #1 // w4 = rows left
+ neg v18.16b, v16.16b // -m
+ neg v19.16b, v17.16b
+ sxtl v16.8h, v18.8b // widen -m to 16 bits
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // a for this row
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ shl v18.8h, v18.8h, #9
+ shl v19.8h, v19.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v17.8h
+ sqrdmulh v6.8h, v6.8h, v18.8h
+ sqrdmulh v7.8h, v7.8h, v19.8h
+ add v0.8h, v0.8h, v4.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 32b
+ ret
+L(blend_tbl): // indexed by clz(w3) - 26
+ .hword L(blend_tbl) - 32b // index 0: w3 == 32
+ .hword L(blend_tbl) - 160b // index 1: w3 == 16
+ .hword L(blend_tbl) - 80b // index 2: w3 == 8
+ .hword L(blend_tbl) - 40b // index 3: w3 == 4
+endfunc
+
+function blend_h_16bpc_neon, export=1 // blends b = [x2] into a = [x0] in place with one obmc_masks byte per row
+ adr x6, L(blend_h_tbl) // x6 = base of the offset table at the end of this function
+ movrel x5, X(obmc_masks)
+ add x5, x5, w4, uxtw // x5 = &obmc_masks[w4], using the incoming value of w4
+ sub w4, w4, w4, lsr #2 // w4 -= w4 >> 2: only that many rows are blended
+ clz w7, w3
+ add x8, x0, x1 // x8 = pointer to the second row
+ lsl x1, x1, #1 // x0 and x8 each step two rows at a time
+ sub w7, w7, #24 // table index = clz(w3) - 24: 0 for w3 == 128 up to 6 for w3 == 2
+ ldrh w7, [x6, x7, lsl #1] // each entry holds (table address - label address)
+ sub x6, x6, w7, uxtw // x6 = address of the selected entry point
+ br x6
+2: // w3 == 2: two rows of 2 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v2.8b, v3.8b}, [x5], #2 // v2 = mask byte of the first row, v3 = mask byte of the second row, each replicated
+ ld1 {v1.4h}, [x2], #8 // b for both rows, read contiguously
+ ext v2.8b, v2.8b, v3.8b, #6 // bytes 0-1 = first row mask, bytes 2-7 = second row mask
+ subs w4, w4, #2 // w4 = rows left
+ neg v2.8b, v2.8b // -m
+ ld1 {v0.s}[0], [x0] // a, first row
+ ld1 {v0.s}[1], [x8] // a, second row
+ sxtl v2.8h, v2.8b // widen -m to 16 bits
+ shl v2.4h, v2.4h, #9 // -m << 9
+ sub v1.4h, v0.4h, v1.4h // a - b
+ sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
+ add v0.4h, v0.4h, v1.4h // a + (((a-b)*-m + 32) >> 6)
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[1], [x8], x1
+ b.gt 2b
+ ret
+4: // w3 == 4: two rows of 4 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v2.8b, v3.8b}, [x5], #2 // v2 = first row mask, v3 = second row mask, each replicated
+ ld1 {v1.8h}, [x2], #16 // b for both rows, read contiguously
+ ext v2.8b, v2.8b, v3.8b, #4 // bytes 0-3 = first row mask, bytes 4-7 = second row mask
+ subs w4, w4, #2 // w4 = rows left
+ neg v2.8b, v2.8b // -m
+ ld1 {v0.d}[0], [x0] // a, first row
+ ld1 {v0.d}[1], [x8] // a, second row
+ sxtl v2.8h, v2.8b // widen -m to 16 bits
+ shl v2.8h, v2.8h, #9 // -m << 9
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h // a + (((a-b)*-m + 32) >> 6)
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x8], x1
+ b.gt 4b
+ ret
+8: // w3 == 8: two rows of 8 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v4.8b, v5.8b}, [x5], #2 // v4 = first row mask, v5 = second row mask, each replicated
+ ld1 {v2.8h, v3.8h}, [x2], #32 // b: v2 = first row, v3 = second row
+ neg v4.8b, v4.8b // -m
+ neg v5.8b, v5.8b
+ ld1 {v0.8h}, [x0] // a, first row
+ subs w4, w4, #2 // w4 = rows left
+ sxtl v4.8h, v4.8b // widen -m to 16 bits
+ sxtl v5.8h, v5.8b
+ ld1 {v1.8h}, [x8] // a, second row
+ shl v4.8h, v4.8h, #9 // -m << 9
+ shl v5.8h, v5.8h, #9
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v5.8h
+ add v0.8h, v0.8h, v2.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+16: // w3 == 16: two rows of 16 pixels per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v16.8b, v17.8b}, [x5], #2 // v16 = first row mask, v17 = second row mask, each replicated
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: v4-v5 = first row, v6-v7 = second row
+ neg v16.8b, v16.8b // -m
+ neg v17.8b, v17.8b
+ ld1 {v0.8h, v1.8h}, [x0] // a, first row
+ ld1 {v2.8h, v3.8h}, [x8] // a, second row
+ subs w4, w4, #2 // w4 = rows left
+ sxtl v16.8h, v16.8b // widen -m to 16 bits
+ sxtl v17.8h, v17.8b
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v16.8h
+ sqrdmulh v6.8h, v6.8h, v17.8h
+ sqrdmulh v7.8h, v7.8h, v17.8h
+ add v0.8h, v0.8h, v4.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v2.8h, v3.8h}, [x8], x1
+ b.gt 16b
+ ret
+1280: // w3 == 128, 64 and 32 share one entry point
+640:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ sub x1, x1, w3, uxtw #1 // x0 and x8 are post-incremented across the row, so remove 2*w3 bytes from the row step
+ add x7, x2, w3, uxtw #1 // x7 = b of the second row, 2*w3 bytes after x2
+321: // outer loop: one pair of rows per iteration
+ ld2r {v24.8b, v25.8b}, [x5], #2 // v24 = first row mask, v25 = second row mask, each replicated
+ mov w6, w3 // w6 = pixels left in this pair of rows
+ neg v24.8b, v24.8b // -m
+ neg v25.8b, v25.8b
+ sxtl v24.8h, v24.8b // widen -m to 16 bits
+ sxtl v25.8h, v25.8b
+ shl v24.8h, v24.8h, #9 // -m << 9
+ shl v25.8h, v25.8h, #9
+32: // inner loop: 32 pixels of each of the two rows per iteration
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 // b, first row
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // a, first row
+ subs w6, w6, #32
+ sub v16.8h, v0.8h, v16.8h // a - b
+ sub v17.8h, v1.8h, v17.8h
+ sub v18.8h, v2.8h, v18.8h
+ sub v19.8h, v3.8h, v19.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 // b, second row
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8] // a, second row
+ sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v17.8h, v17.8h, v24.8h
+ sqrdmulh v18.8h, v18.8h, v24.8h
+ sqrdmulh v19.8h, v19.8h, v24.8h
+ sub v20.8h, v4.8h, v20.8h // a - b
+ sub v21.8h, v5.8h, v21.8h
+ sub v22.8h, v6.8h, v22.8h
+ sub v23.8h, v7.8h, v23.8h
+ add v0.8h, v0.8h, v16.8h // a + (((a-b)*-m + 32) >> 6), first row
+ add v1.8h, v1.8h, v17.8h
+ add v2.8h, v2.8h, v18.8h
+ add v3.8h, v3.8h, v19.8h
+ sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v21.8h, v21.8h, v25.8h
+ sqrdmulh v22.8h, v22.8h, v25.8h
+ sqrdmulh v23.8h, v23.8h, v25.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v4.8h, v4.8h, v20.8h // a + (((a-b)*-m + 32) >> 6), second row
+ add v5.8h, v5.8h, v21.8h
+ add v6.8h, v6.8h, v22.8h
+ add v7.8h, v7.8h, v23.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
+ b.gt 32b
+ subs w4, w4, #2 // w4 = rows left
+ add x0, x0, x1 // move both output pointers to the next pair of rows
+ add x8, x8, x1
+ add x2, x2, w3, uxtw #1 // skip the row of b that the other pointer just consumed
+ add x7, x7, w3, uxtw #1
+ b.gt 321b
+ ret
+L(blend_h_tbl): // indexed by clz(w3) - 24
+ .hword L(blend_h_tbl) - 1280b // index 0: w3 == 128
+ .hword L(blend_h_tbl) - 640b // index 1: w3 == 64
+ .hword L(blend_h_tbl) - 320b // index 2: w3 == 32
+ .hword L(blend_h_tbl) - 16b // index 3: w3 == 16
+ .hword L(blend_h_tbl) - 8b // index 4: w3 == 8
+ .hword L(blend_h_tbl) - 4b // index 5: w3 == 4
+ .hword L(blend_h_tbl) - 2b // index 6: w3 == 2
+endfunc
+
+function blend_v_16bpc_neon, export=1 // blends b = [x2] into a = [x0] in place with one obmc_masks byte per column
+ adr x6, L(blend_v_tbl) // x6 = base of the offset table at the end of this function
+ movrel x5, X(obmc_masks)
+ add x5, x5, w3, uxtw // x5 = &obmc_masks[w3], computed before w3 is overwritten
+ clz w3, w3
+ add x8, x0, x1 // x8 = pointer to the second row
+ lsl x1, x1, #1 // x0 and x8 each step two rows at a time
+ sub w3, w3, #26 // table index = clz(w3) - 26: 0 for w3 == 32 up to 4 for w3 == 2
+ ldrh w3, [x6, x3, lsl #1] // each entry holds (table address - label address)
+ sub x6, x6, w3, uxtw // x6 = address of the selected entry point
+ br x6
+20: // w3 == 2: only the first pixel of each row is written
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v2.8b}, [x5] // single mask byte, replicated
+ neg v2.8b, v2.8b // -m
+ sxtl v2.8h, v2.8b // widen -m to 16 bits
+ shl v2.4h, v2.4h, #9 // -m << 9
+2:
+ ld1 {v1.s}[0], [x2], #4 // b, first row (2 pixels read, lane 1 is replaced below)
+ ld1 {v0.h}[0], [x0] // a, first pixel of the first row
+ subs w4, w4, #2 // w4 = rows left
+ ld1 {v1.h}[1], [x2] // b, first pixel of the second row
+ ld1 {v0.h}[1], [x8] // a, first pixel of the second row
+ add x2, x2, #4 // step over the second row of b
+ sub v1.4h, v0.4h, v1.4h // a - b
+ sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
+ add v0.4h, v0.4h, v1.4h // a + (((a-b)*-m + 32) >> 6)
+ st1 {v0.h}[0], [x0], x1
+ st1 {v0.h}[1], [x8], x1
+ b.gt 2b
+ ret
+40: // w3 == 4: 3 pixels of each row are written
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v2.2s}, [x5] // 4 mask bytes, replicated into both halves (one copy per row)
+ sub x1, x1, #4 // compensate for the #4 post-increment of the first store of each row
+ neg v2.8b, v2.8b // -m
+ sxtl v2.8h, v2.8b // widen -m to 16 bits
+ shl v2.8h, v2.8h, #9 // -m << 9
+4:
+ ld1 {v1.8h}, [x2], #16 // b for both rows, read contiguously
+ ld1 {v0.d}[0], [x0] // a, first row
+ ld1 {v0.d}[1], [x8] // a, second row
+ subs w4, w4, #2 // w4 = rows left
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h // a + (((a-b)*-m + 32) >> 6)
+ st1 {v0.s}[0], [x0], #4 // pixels 0-1 of the first row
+ st1 {v0.s}[2], [x8], #4 // pixels 0-1 of the second row
+ st1 {v0.h}[2], [x0], x1 // pixel 2 of the first row
+ st1 {v0.h}[6], [x8], x1 // pixel 2 of the second row
+ b.gt 4b
+ ret
+80: // w3 == 8: 6 pixels of each row are written
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8b}, [x5] // 8 mask bytes, one per column
+ sub x1, x1, #8 // compensate for the #8 post-increment of the first store of each row
+ neg v4.8b, v4.8b // -m
+ sxtl v4.8h, v4.8b // widen -m to 16 bits
+ shl v4.8h, v4.8h, #9 // -m << 9
+8:
+ ld1 {v2.8h, v3.8h}, [x2], #32 // b: v2 = first row, v3 = second row
+ ld1 {v0.8h}, [x0] // a, first row
+ ld1 {v1.8h}, [x8] // a, second row
+ subs w4, w4, #2 // w4 = rows left
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v4.8h
+ add v0.8h, v0.8h, v2.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.d}[0], [x0], #8 // pixels 0-3 of the first row
+ st1 {v1.d}[0], [x8], #8 // pixels 0-3 of the second row
+ st1 {v0.s}[2], [x0], x1 // pixels 4-5 of the first row
+ st1 {v1.s}[2], [x8], x1 // pixels 4-5 of the second row
+ b.gt 8b
+ ret
+160: // w3 == 16: 12 pixels of each row are written
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b}, [x5] // 16 mask bytes, one per column
+ sub x1, x1, #16 // compensate for the #16 post-increment of the first store of each row
+ neg v17.16b, v16.16b // -m
+ sxtl v16.8h, v17.8b // widen -m for columns 0-7
+ sxtl2 v17.8h, v17.16b // widen -m for columns 8-15
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.4h, v17.4h, #9 // only columns 8-11 are used below
+16:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: v4-v5 = first row, v6-v7 = second row
+ ld1 {v0.8h, v1.8h}, [x0] // a, first row
+ subs w4, w4, #2 // w4 = rows left
+ ld1 {v2.8h, v3.8h}, [x8] // a, second row
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.4h, v1.4h, v5.4h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.4h, v3.4h, v7.4h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.4h, v5.4h, v17.4h
+ sqrdmulh v6.8h, v6.8h, v16.8h
+ sqrdmulh v7.4h, v7.4h, v17.4h
+ add v0.8h, v0.8h, v4.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.4h, v1.4h, v5.4h
+ add v2.8h, v2.8h, v6.8h
+ add v3.4h, v3.4h, v7.4h
+ st1 {v0.8h}, [x0], #16 // pixels 0-7 of the first row
+ st1 {v2.8h}, [x8], #16 // pixels 0-7 of the second row
+ st1 {v1.4h}, [x0], x1 // pixels 8-11 of the first row
+ st1 {v3.4h}, [x8], x1 // pixels 8-11 of the second row
+ b.gt 16b
+ ret
+320: // w3 == 32: 24 pixels of each row are written
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v24.16b, v25.16b}, [x5] // 32 mask bytes are read, the first 24 are used
+ neg v26.16b, v24.16b // -m
+ neg v27.8b, v25.8b
+ sxtl v24.8h, v26.8b // widen -m for columns 0-7
+ sxtl2 v25.8h, v26.16b // columns 8-15
+ sxtl v26.8h, v27.8b // columns 16-23
+ shl v24.8h, v24.8h, #9 // -m << 9
+ shl v25.8h, v25.8h, #9
+ shl v26.8h, v26.8h, #9
+32:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 // b, first row (v19 is not used)
+ ld1 {v0.8h, v1.8h, v2.8h}, [x0] // a, first row
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 // b, second row (v23 is not used)
+ ld1 {v4.8h, v5.8h, v6.8h}, [x8] // a, second row
+ subs w4, w4, #2 // w4 = rows left
+ sub v16.8h, v0.8h, v16.8h // a - b
+ sub v17.8h, v1.8h, v17.8h
+ sub v18.8h, v2.8h, v18.8h
+ sub v20.8h, v4.8h, v20.8h
+ sub v21.8h, v5.8h, v21.8h
+ sub v22.8h, v6.8h, v22.8h
+ sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v17.8h, v17.8h, v25.8h
+ sqrdmulh v18.8h, v18.8h, v26.8h
+ sqrdmulh v20.8h, v20.8h, v24.8h
+ sqrdmulh v21.8h, v21.8h, v25.8h
+ sqrdmulh v22.8h, v22.8h, v26.8h
+ add v0.8h, v0.8h, v16.8h // a + (((a-b)*-m + 32) >> 6)
+ add v1.8h, v1.8h, v17.8h
+ add v2.8h, v2.8h, v18.8h
+ add v4.8h, v4.8h, v20.8h
+ add v5.8h, v5.8h, v21.8h
+ add v6.8h, v6.8h, v22.8h
+ st1 {v0.8h, v1.8h, v2.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_v_tbl): // indexed by clz(w3) - 26
+ .hword L(blend_v_tbl) - 320b // index 0: w3 == 32
+ .hword L(blend_v_tbl) - 160b // index 1: w3 == 16
+ .hword L(blend_v_tbl) - 80b // index 2: w3 == 8
+ .hword L(blend_v_tbl) - 40b // index 3: w3 == 4
+ .hword L(blend_v_tbl) - 20b // index 4: w3 == 2
+endfunc
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that x9 is set to (clz(w)-24).
+function put_neon // plain copy of w5 rows from [x2] (row step x3) to [x0] (row step x1)
+ adr x10, L(put_tbl) // x10 = base of the offset table at the end of this function
+ ldrh w9, [x10, x9, lsl #1] // x9 is the table index supplied by the caller
+ sub x10, x10, w9, uxtw // entries hold (table address - label address)
+ br x10
+
+2: // 4 bytes (2 pixels of 16 bits) per row, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2 // w5 = rows left
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 2b
+ ret
+4: // 8 bytes (4 pixels) per row, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], x3
+ ld1 {v1.4h}, [x2], x3
+ subs w5, w5, #2 // w5 = rows left
+ st1 {v0.4h}, [x0], x1
+ st1 {v1.4h}, [x0], x1
+ b.gt 4b
+ ret
+80: // 16 bytes (8 pixels) per row, using separate pointers for odd rows
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, x1 // x8 = second destination row
+ lsl x1, x1, #1 // each destination pointer steps two rows
+ add x9, x2, x3 // x9 = second source row (the table index in x9 is no longer needed)
+ lsl x3, x3, #1 // each source pointer steps two rows
+8:
+ ld1 {v0.8h}, [x2], x3
+ ld1 {v1.8h}, [x9], x3
+ subs w5, w5, #2 // w5 = rows left
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+16: // 32 bytes (16 pixels) per row, copied through general purpose register pairs
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1 // w5 = rows left
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3 // next source row
+ add x0, x0, x1 // next destination row
+ b.gt 16b
+ ret
+32: // 64 bytes (32 pixels) per row, copied through general purpose register pairs
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1 // w5 = rows left
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3 // next source row
+ add x0, x0, x1 // next destination row
+ b.gt 32b
+ ret
+64: // 128 bytes (64 pixels) per row, copied through q register pairs
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1 // w5 = rows left
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3 // next source row
+ add x0, x0, x1 // next destination row
+ b.gt 64b
+ ret
+128: // 256 bytes (128 pixels) per row, copied through q register pairs
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1 // w5 = rows left
+ stp q4, q5, [x0, #64]
+ ldp q16, q17, [x2, #128]
+ stp q6, q7, [x0, #96]
+ ldp q18, q19, [x2, #160]
+ stp q16, q17, [x0, #128]
+ ldp q20, q21, [x2, #192]
+ stp q18, q19, [x0, #160]
+ ldp q22, q23, [x2, #224]
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x2, x2, x3 // next source row
+ add x0, x0, x1 // next destination row
+ b.gt 128b
+ ret
+
+L(put_tbl): // indexed by x9, which the caller sets to clz(w)-24
+ .hword L(put_tbl) - 128b // index 0: w == 128
+ .hword L(put_tbl) - 64b // index 1: w == 64
+ .hword L(put_tbl) - 32b // index 2: w == 32
+ .hword L(put_tbl) - 16b // index 3: w == 16
+ .hword L(put_tbl) - 80b // index 4: w == 8
+ .hword L(put_tbl) - 4b // index 5: w == 4
+ .hword L(put_tbl) - 2b // index 6: w == 2
+endfunc
+
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
+// x8 to w*2.
+function prep_neon // writes (src << intermediate_bits) - PREP_BIAS for w4 rows read from [x1] (row step x2) to [x0]
+ adr x10, L(prep_tbl) // x10 = base of the offset table at the end of this function
+ ldrh w9, [x10, x9, lsl #1] // x9 is the table index supplied by the caller
+ dup v31.8h, w7 // intermediate_bits
+ movi v30.8h, #(PREP_BIAS >> 8), lsl #8 // v30 = PREP_BIAS with its low 8 bits cleared
+ sub x10, x10, w9, uxtw // entries hold (table address - label address)
+ br x10
+
+40: // 4 pixels per row, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2 // x9 = second source row (the table index in x9 is no longer needed)
+ lsl x2, x2, #1 // each source pointer steps two rows
+4:
+ ld1 {v0.d}[0], [x1], x2
+ ld1 {v0.d}[1], [x9], x2
+ subs w4, w4, #2 // w4 = rows left
+ sshl v0.8h, v0.8h, v31.8h // src << intermediate_bits
+ sub v0.8h, v0.8h, v30.8h // - PREP_BIAS
+ st1 {v0.8h}, [x0], #16 // output rows are written back to back
+ b.gt 4b
+ ret
+80: // 8 pixels per row, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2 // x9 = second source row
+ lsl x2, x2, #1 // each source pointer steps two rows
+8:
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x9], x2
+ subs w4, w4, #2 // w4 = rows left
+ sshl v0.8h, v0.8h, v31.8h // src << intermediate_bits
+ sshl v1.8h, v1.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h // - PREP_BIAS
+ sub v1.8h, v1.8h, v30.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+16: // 16 pixels per row, two rows per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1] // first row
+ add x1, x1, x2
+ sshl v0.8h, v0.8h, v31.8h // src << intermediate_bits
+ ldp q2, q3, [x1] // second row
+ add x1, x1, x2
+ subs w4, w4, #2 // w4 = rows left
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h // - PREP_BIAS
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 16b
+ ret
+32: // 32 pixels per row, one row per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ sshl v0.8h, v0.8h, v31.8h // src << intermediate_bits
+ ldp q2, q3, [x1, #32]
+ add x1, x1, x2 // next source row
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ subs w4, w4, #1 // w4 = rows left
+ sub v0.8h, v0.8h, v30.8h // - PREP_BIAS
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 32b
+ ret
+64: // 64 pixels per row, one row per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ subs w4, w4, #1 // w4 = rows left, flags survive until b.gt since nothing below sets them
+ sshl v0.8h, v0.8h, v31.8h // src << intermediate_bits
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ add x1, x1, x2 // next source row
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h // - PREP_BIAS
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x0, x0, x8 // advance the output by x8 bytes (the caller sets x8 to w*2)
+ b.gt 64b
+ ret
+128: // 128 pixels per row, one row per iteration
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ subs w4, w4, #1 // w4 = rows left, flags survive until b.gt since nothing below sets them
+ sshl v0.8h, v0.8h, v31.8h // src << intermediate_bits
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ ldp q16, q17, [x1, #128]
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ ldp q18, q19, [x1, #160]
+ sshl v16.8h, v16.8h, v31.8h
+ sshl v17.8h, v17.8h, v31.8h
+ ldp q20, q21, [x1, #192]
+ sshl v18.8h, v18.8h, v31.8h
+ sshl v19.8h, v19.8h, v31.8h
+ ldp q22, q23, [x1, #224]
+ add x1, x1, x2 // next source row
+ sshl v20.8h, v20.8h, v31.8h
+ sshl v21.8h, v21.8h, v31.8h
+ sshl v22.8h, v22.8h, v31.8h
+ sshl v23.8h, v23.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h // - PREP_BIAS
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ sub v16.8h, v16.8h, v30.8h
+ sub v17.8h, v17.8h, v30.8h
+ stp q6, q7, [x0, #96]
+ sub v18.8h, v18.8h, v30.8h
+ sub v19.8h, v19.8h, v30.8h
+ stp q16, q17, [x0, #128]
+ sub v20.8h, v20.8h, v30.8h
+ sub v21.8h, v21.8h, v30.8h
+ stp q18, q19, [x0, #160]
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x0, x0, x8 // advance the output by x8 bytes (the caller sets x8 to w*2)
+ b.gt 128b
+ ret
+
+L(prep_tbl): // indexed by x9, which the caller sets to clz(w)-24, there is no entry for w == 2
+ .hword L(prep_tbl) - 128b // index 0: w == 128
+ .hword L(prep_tbl) - 64b // index 1: w == 64
+ .hword L(prep_tbl) - 32b // index 2: w == 32
+ .hword L(prep_tbl) - 16b // index 3: w == 16
+ .hword L(prep_tbl) - 80b // index 4: w == 8
+ .hword L(prep_tbl) - 40b // index 5: w == 4
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // loads lane 0 of up to 7 registers, alternating between pointers s0 and s1
+ ld1 {\d0\wd}[0], [\s0], \strd // each load post-increments its pointer by strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd // d2 and d3 are loaded as a pair when d2 is given
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd // d4, d5 and d6 are each optional on their own
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // loads up to 7 whole registers of arrangement wd, alternating between pointers s0 and s1
+ ld1 {\d0\wd}, [\s0], \strd // each load post-increments its pointer by strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd // d2 and d3 are loaded as a pair when d2 is given
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd // d4, d5 and d6 are each optional on their own
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5 // loads up to 3 register pairs, alternating between pointers s0 and s1
+ ld1 {\d0\wd, \d1\wd}, [\s0], \strd // each load post-increments its pointer by strd
+.ifnb \d2
+ ld1 {\d2\wd, \d3\wd}, [\s1], \strd // second pair comes from s1
+.endif
+.ifnb \d4
+ ld1 {\d4\wd, \d5\wd}, [\s0], \strd // third pair comes from s0 again
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with 32-bit (.s) lanes
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_reg with the .4h arrangement (64 bits per register)
+ load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_reg with the .8h arrangement (128 bits per register)
+ load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5 // load_regpair with .8h registers, i.e. 16 halfwords per pair
+ load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4 // merges each register with its successor, the last register given is only read
+ trn1 \r0\wd, \r0\wd, \r1\wd // trn1 interleaves the even-indexed elements: r0 = {r0[0], r1[0], ...}
+ trn1 \r1\wd, \r1\wd, \r2\wd // r1 = {r1[0], r2[0], ...}
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd // r3 and r4 are handled together when r3 is given
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4 // interleave_1 on 32-bit elements (.2s)
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3 // unsigned clamp of 1, 2 or 4 registers to the per-lane maximum held in c
+ umin \r0\wd, \r0\wd, \c\wd // in place: r0 = min(r0, c)
+.ifnb \r1
+ umin \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ umin \r2\wd, \r2\wd, \c\wd // r2 and r3 are handled together when r2 is given
+ umin \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3 // subtracts c lane by lane from 1, 2 or 4 registers
+ sub \r0\wd, \r0\wd, \c\wd // in place: r0 = r0 - c
+.ifnb \r1
+ sub \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ sub \r2\wd, \r2\wd, \c\wd // r2 and r3 are handled together when r2 is given
+ sub \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro smull_smlal_4 d, s0, s1, s2, s3 // d.4s = sum of s0-s3 (low 4 halfwords each) times coefficients v0.h[0] to v0.h[3]
+ smull \d\().4s, \s0\().4h, v0.h[0] // first product initialises the 32-bit accumulator
+ smlal \d\().4s, \s1\().4h, v0.h[1] // remaining products are accumulated
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3 // same as smull_smlal_4 but on the high 4 halfwords of each source
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 // d.4s = sum of s0-s7 (low 4 halfwords each) times coefficients v0.h[0] to v0.h[7]
+ smull \d\().4s, \s0\().4h, v0.h[0] // first product initialises the 32-bit accumulator
+ smlal \d\().4s, \s1\().4h, v0.h[1] // remaining products are accumulated
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+ smlal \d\().4s, \s7\().4h, v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 // same as smull_smlal_8 but on the high 4 halfwords of each source
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+ smlal2 \d\().4s, \s7\().8h, v0.h[7]
+.endm
+.macro sqrshrun_h shift, r0, r1, r2, r3 // rounding right shift of signed 32-bit lanes, narrowed to 16 bits with unsigned saturation
+ sqrshrun \r0\().4h, \r0\().4s, #\shift // r0 low half = narrowed r0
+.ifnb \r1
+ sqrshrun2 \r0\().8h, \r1\().4s, #\shift // r0 high half = narrowed r1
+.endif
+.ifnb \r2
+ sqrshrun \r2\().4h, \r2\().4s, #\shift // r2 and r3 are packed into r2 the same way
+ sqrshrun2 \r2\().8h, \r3\().4s, #\shift
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3 // truncating narrow: keeps the low 16 bits of every 32-bit lane of r0/r1 (and r2/r3), packed into r0 (and r2)
+ uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
+.ifnb \r2
+ uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3 // signed rounding shift of 2 or 4 registers of 32-bit lanes by the per-lane amounts in register shift
+ srshl \r0\().4s, \r0\().4s, \shift\().4s // a negative amount shifts right with rounding
+ srshl \r1\().4s, \r1\().4s, \shift\().4s // r1 is mandatory, unlike in the other helpers
+.ifnb \r2
+ srshl \r2\().4s, \r2\().4s, \shift\().4s // r2 and r3 are handled together when r2 is given
+ srshl \r3\().4s, \r3\().4s, \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes // stores 32-bit lanes of reg alternately to [x0] and [x9], post-incrementing each by strd
+ st1 {\reg\().s}[0], [x0], \strd // lane 0 goes to x0
+ st1 {\reg\().s}[1], [x9], \strd // lane 1 goes to x9
+.if \lanes > 2
+ st1 {\reg\().s}[2], [x0], \strd // lanes 2 and 3 are stored only when lanes > 2
+ st1 {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1 // stores the two 64-bit halves of r0 (and r1) alternately to [x0] and [x9], post-incrementing each by strd
+ st1 {\r0\().d}[0], [x0], \strd // low half goes to x0
+ st1 {\r0\().d}[1], [x9], \strd // high half goes to x9
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd // optional second register, same pattern
+ st1 {\r1\().d}[1], [x9], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3 // narrows four 32-bit accumulators to 16 bits and stores them as 64-bit halves via st_d
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3 // put: rounding shift right by 6 with unsigned saturation
+ umin_h v31, .8h, \r0, \r2 // clamp to v31 (the visible 8tap entry code loads bitdepth_max into v31)
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_d \strd, \r0, \r2 // after narrowing the results live in the first and third register
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 // stores 2, 4 or 8 whole registers alternately to [x0] and [x9], post-incrementing each by strd
+ st1 {\r0\wd}, [x0], \strd // even-numbered arguments go to x0
+ st1 {\r1\wd}, [x9], \strd // odd-numbered arguments go to x9
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd // r2 and r3 are stored together when r2 is given
+ st1 {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd // r4 to r7 are stored together when r4 is given
+ st1 {\r5\wd}, [x9], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7 // st_reg with the .8h arrangement
+ st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3 // narrows four 32-bit accumulators to 16 bits and stores them as two full registers via st_8h
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3 // put: rounding shift right by 6 with unsigned saturation
+ umin_h v31, .8h, \r0, \r2 // clamp to v31 (the visible 8tap entry code loads bitdepth_max into v31)
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_8h \strd, \r0, \r2 // first register goes to x0, third register goes to x9
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3 // narrows four 32-bit accumulators to 16 halfwords and stores them with one st1 to dst
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3 // put: rounding shift right by 6 with unsigned saturation, results in the first and third register
+ umin \r0\().8h, \r0\().8h, v31.8h // clamp to v31
+ umin \r1\().8h, \r2\().8h, v31.8h // the clamped third register is written to the second one so the store can use a consecutive pair
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub \r0\().8h, \r0\().8h, v29.8h // subtract v29 (annotated as PREP_BIAS in the sibling shift_store macros)
+ sub \r1\().8h, \r2\().8h, v29.8h // result moved into the second register, as in the put branch
+.endif
+ st1 {\r0\().8h, \r1\().8h}, [\dst], \strd // dst is post-incremented by strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v // emits an exported entry point named op, _8tap_, type, _16bpc_neon
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ mov w9, \type_h // w9 = packed filter constant for the horizontal direction
+ mov w10, \type_v // w10 = packed filter constant for the vertical direction
+ b \op\()_8tap_neon // tail call into the shared implementation, all other registers pass through untouched
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor. Each value packs two offsets as (a*15<<7)|(b*15): the horizontal setup code visible below uses bits 7-13 when w > 4 and the low 7 bits when w <= 4. NOTE(review): the factor 15 presumably matches the number of entries per filter set in mc_subpel_filters - confirm.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
+ mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w11
+ mul \my, \my, w11
+ add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
+ add \my, \my, w10 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ dup v31.8h, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w12, #6
+ tst \mx, #(0x7f << 14)
+ sub w9, w9, #24
+ add w13, w12, \bdmax // 6 + intermediate_bits
+ sub w12, w12, \bdmax // 6 - intermediate_bits
+ movrel x11, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w10
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x11, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x10, L(\type\()_8tap_h_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.8h, \bdmax // intermediate_bits
+.else
+ movi v28.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.8h, v29.8h // -intermediate_bits
+.endif
+ br x10
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ smull v3.4s, v3.4h, v0.h[0]
+ smlal v3.4s, v4.4h, v0.h[1]
+ smlal v3.4s, v6.4h, v0.h[2]
+ smlal v3.4s, v7.4h, v0.h[3]
+ srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ srshl v3.4h, v3.4h, v29.4h // -intermediate_bits
+ umin v3.4h, v3.4h, v31.4h
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8h}, [\src], \s_strd
+ ld1 {v20.8h}, [\sr2], \s_strd
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ smull v16.4s, v16.4h, v0.h[0]
+ smlal v16.4s, v17.4h, v0.h[1]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[3]
+ smull v20.4s, v20.4h, v0.h[0]
+ smlal v20.4s, v21.4h, v0.h[1]
+ smlal v20.4s, v22.4h, v0.h[2]
+ smlal v20.4s, v23.4h, v0.h[3]
+ srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
+ srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v16.4h, v16.4s
+ sqxtun2 v16.8h, v20.4s
+ srshl v16.8h, v16.8h, v29.8h // -intermediate_bits
+ umin v16.8h, v16.8h, v31.8h
+.else
+ uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
+ sub v16.8h, v16.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v16.d}[0], [\dst], \d_strd
+ st1 {v16.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+81:
+ ld1 {v16.8h, v17.8h}, [\src], #32
+ ld1 {v20.8h, v21.8h}, [\sr2], #32
+ mov \mx, \w
+
+8:
+ smull v18.4s, v16.4h, v0.h[0]
+ smull2 v19.4s, v16.8h, v0.h[0]
+ smull v22.4s, v20.4h, v0.h[0]
+ smull2 v23.4s, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+ subs \mx, \mx, #8
+ srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
+ srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
+ srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
+ srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v18.4h, v18.4s
+ sqxtun2 v18.8h, v19.4s
+ sqxtun v22.4h, v22.4s
+ sqxtun2 v22.8h, v23.4s
+ srshl v18.8h, v18.8h, v29.8h // -intermediate_bits
+ srshl v22.8h, v22.8h, v29.8h // -intermediate_bits
+ umin v18.8h, v18.8h, v31.8h
+ umin v22.8h, v22.8h, v31.8h
+.else
+ uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
+ uzp1 v22.8h, v22.8h, v23.8h // Ditto
+ sub v18.8h, v18.8h, v28.8h // PREP_BIAS
+ sub v22.8h, v22.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v18.8h}, [\dst], #16
+ st1 {v22.8h}, [\ds2], #16
+ b.le 9f
+
+ mov v16.16b, v17.16b
+ mov v20.16b, v21.16b
+ ld1 {v17.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 81b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v): // Vertical-only entry: select the filter row, then dispatch on width
+ cmp \h, #4 // flags stay live through br; the width-specific code starts with b.gt on them
+ ubfx w10, \my, #7, #7 // w10 = bits 7-13 of my
+ and \my, \my, #0x7f // my = bits 0-6 of my
+ b.le 4f // h <= 4: keep the index from the low 7 bits
+ mov \my, w10 // h > 4: use the index from bits 7-13
+4:
+ add \xmy, x11, \my, uxtw #3 // xmy = x11 + 8*my; x11 was set to mc_subpel_filters - 8
+
+.ifc \type, prep
+ dup v30.4s, w12 // 6 - intermediate_bits
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8 // v29 = (PREP_BIAS >> 8) << 8
+.endif
+ adr x10, L(\type\()_8tap_v_tbl) // x10 = base of the per-width jump table
+ ldrh w9, [x10, x9, lsl #1] // 16-bit entry at index x9 (index is computed before this label, not shown here)
+.ifc \type, prep
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw // entries store (table - label), so target = table - entry
+ br x10 // jump to the width-specific code; none of the instructions after cmp alter the flags
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ b.gt 24f
+ smull_smlal_4 v6, v1, v2, v3, v4
+ sqrshrun_h 6, v6
+ umin_h v31, .8h, v6
+ st_s \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull_smlal_4 v17, v3, v4, v5, v6
+ sqrshrun_h 6, v16, v17
+ umin_h v31, .8h, v16
+ st_s \d_strd, v16, 4
+ ret
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_s v1, v2, v3, v4, v5
+ interleave_1_s v5, v6, v7
+216:
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v16, v17, v18, v19
+ interleave_1_s v7, v16, v17, v18, v19
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ sqrshrun_h 6, v24, v25
+ umin_h v31, .8h, v24
+ st_s \d_strd, v24, 4
+ b.le 0f
+ cmp \h, #2
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+ mov v7.16b, v19.16b
+ b.eq 26f
+ b 216b
+26:
+ load_s \sr2, \src, \s_strd, v16, v17
+ interleave_1_s v7, v16, v17
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_h 6, v24
+ umin_h v31, .4h, v24
+ st_s \d_strd, v24, 2
+0:
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4 v7, v2, v3, v4, v5
+ shift_store_4 \type, \d_strd, v6, v7
+ b.le 0f
+ load_4h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v1, v3, v4, v5, v6
+ smull_smlal_4 v2, v4, v5, v6, v7
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+48:
+ subs \h, \h, #4
+ load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_4 \type, \d_strd, v1, v2, v3, v4
+ b.le 0f
+ cmp \h, #2
+ mov v16.8b, v20.8b
+ mov v17.8b, v21.8b
+ mov v18.8b, v22.8b
+ mov v19.8b, v23.8b
+ mov v20.8b, v24.8b
+ mov v21.8b, v25.8b
+ mov v22.8b, v26.8b
+ b.eq 46f
+ b 48b
+46:
+ load_4h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull2_smlal2_4 v17, v1, v2, v3, v4
+ smull_smlal_4 v18, v2, v3, v4, v5
+ smull2_smlal2_4 v19, v2, v3, v4, v5
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+ b.le 0f
+ load_8h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v16, v3, v4, v5, v6
+ smull2_smlal2_4 v17, v3, v4, v5, v6
+ smull_smlal_4 v18, v4, v5, v6, v7
+ smull2_smlal2_4 v19, v4, v5, v6, v7
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v25, v26
+ smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 1680b
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+
+ load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
+16:
+ load_16h \src, \src, \s_strd, v22, v23
+ subs \h, \h, #1
+ smull_smlal_4 v1, v16, v18, v20, v22
+ smull2_smlal2_4 v2, v16, v18, v20, v22
+ smull_smlal_4 v3, v17, v19, v21, v23
+ smull2_smlal2_4 v4, v17, v19, v21, v23
+ shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ b 16b
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv): // Combined horizontal+vertical entry, reached from the h entry after xmx is set
+ cmp \h, #4 // flags stay live through br; the width-specific code uses b.gt on them
+ ubfx w10, \my, #7, #7 // w10 = bits 7-13 of my
+ and \my, \my, #0x7f // my = bits 0-6 of my
+ b.le 4f // h <= 4: keep the index from the low 7 bits
+ mov \my, w10 // h > 4: use the index from bits 7-13
+4:
+ add \xmy, x11, \my, uxtw #3 // xmy = x11 + 8*my; x11 was set to mc_subpel_filters - 8
+
+ adr x10, L(\type\()_8tap_hv_tbl) // x10 = base of the per-width jump table
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1] // 16-bit entry at index x9 (index is computed before this label, not shown here)
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.4s, w13 // 6 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8 // v29 = (PREP_BIAS >> 8) << 8
+.endif
+ sub x10, x10, w9, uxtw // entries store (table - label), so target = table - entry
+.ifc \type, put
+ neg v29.4s, v29.4s // -(6+intermediate_bits)
+.endif
+ br x10 // jump to the width-specific hv code
+
+20:
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ bl L(\type\()_8tap_filter_2)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ ext v18.8b, v17.8b, v24.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ umin v2.4h, v2.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v2.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+
+ bl L(\type\()_8tap_filter_2)
+ xtn v16.4h, v16.4s
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v18.8b, v17.8b, v24.8b, #4
+ mov v19.8b, v24.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v20.8b, v19.8b, v24.8b, #4
+ mov v21.8b, v24.8b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ ext v22.8b, v21.8b, v24.8b, #4
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ umin v3.4h, v3.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ b 28b
+
+0:
+ ret x15
+
+L(\type\()_8tap_filter_2): // 4-tap horizontal filter of 2 pixels from each of two rows; result in v24.4h
+ ld1 {v25.8h}, [\sr2], \s_strd // v25 = 8 pixels of the sr2 row; sr2 advances by s_strd
+ ld1 {v27.8h}, [\src], \s_strd // v27 = 8 pixels of the src row; src advances by s_strd
+ ext v26.16b, v25.16b, v25.16b, #2 // v26 = sr2 row shifted by one pixel
+ ext v28.16b, v27.16b, v27.16b, #2 // v28 = src row shifted by one pixel
+ trn1 v24.2s, v25.2s, v27.2s // low half: pixels 0-1 of the sr2 row, then pixels 0-1 of the src row
+ trn2 v27.2s, v25.2s, v27.2s // low half: pixels 2-3 of each row
+ trn1 v25.2s, v26.2s, v28.2s // low half: pixels 1-2 of each row
+ trn2 v28.2s, v26.2s, v28.2s // low half: pixels 3-4 of each row
+ smull v24.4s, v24.4h, v0.h[0] // tap 0 times the pixels at x+0
+ smlal v24.4s, v25.4h, v0.h[1] // plus tap 1 times the pixels at x+1
+ smlal v24.4s, v27.4h, v0.h[2] // plus tap 2 times the pixels at x+2
+ smlal v24.4s, v28.4h, v0.h[3] // plus tap 3 times the pixels at x+3
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s // narrow to 16 bit: lanes 0-1 = sr2 row, lanes 2-3 = src row
+ ret // returns via x30 (callers save their own return address in x15); v25-v28 are overwritten
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v24.4h, v1.h[2]
+ smlal v3.4s, v25.4h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ umin v2.8h, v2.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ st1 {v2.d}[0], [\dst], \d_strd
+ st1 {v2.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v24.8b
+ mov v20.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+ smull v4.4s, v17.4h, v1.h[0]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[7]
+.ifc \type, put
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ sqxtun2 v3.8h, v4.4s
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v3.4h, v3.4s, #6
+ rshrn2 v3.8h, v4.4s, #6
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v3.d}[0], [\dst], \d_strd
+ st1 {v3.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+ b 48b
+0:
+ ret x15
+
+L(\type\()_8tap_filter_4): // 4-tap horizontal filter of 4 pixels from each of two rows; results in v24.4h and v25.4h
+ ld1 {v24.8h}, [\sr2], \s_strd // v24 = 8 pixels of the sr2 row; sr2 advances by s_strd
+ ld1 {v25.8h}, [\src], \s_strd // v25 = 8 pixels of the src row; src advances by s_strd
+ ext v26.16b, v24.16b, v24.16b, #2 // sr2 row shifted by 1 pixel
+ ext v27.16b, v24.16b, v24.16b, #4 // sr2 row shifted by 2 pixels
+ ext v28.16b, v24.16b, v24.16b, #6 // sr2 row shifted by 3 pixels
+ smull v24.4s, v24.4h, v0.h[0] // sr2 row: tap 0
+ smlal v24.4s, v26.4h, v0.h[1] // sr2 row: plus tap 1
+ smlal v24.4s, v27.4h, v0.h[2] // sr2 row: plus tap 2
+ smlal v24.4s, v28.4h, v0.h[3] // sr2 row: plus tap 3
+ ext v26.16b, v25.16b, v25.16b, #2 // src row shifted by 1 pixel
+ ext v27.16b, v25.16b, v25.16b, #4 // src row shifted by 2 pixels
+ ext v28.16b, v25.16b, v25.16b, #6 // src row shifted by 3 pixels
+ smull v25.4s, v25.4h, v0.h[0] // src row: tap 0
+ smlal v25.4s, v26.4h, v0.h[1] // src row: plus tap 1
+ smlal v25.4s, v27.4h, v0.h[2] // src row: plus tap 2
+ smlal v25.4s, v28.4h, v0.h[3] // src row: plus tap 3
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s // v24.4h = filtered sr2 row
+ xtn v25.4h, v25.4s // v25.4h = filtered src row
+ ret // returns via x30 (callers save their own return address in x15); v26-v28 are overwritten
+
+80:
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v23.4h, v1.h[2]
+ smlal2 v5.4s, v23.8h, v1.h[2]
+ smlal v2.4s, v23.4h, v1.h[3]
+ smlal2 v3.4s, v23.8h, v1.h[3]
+ smlal v4.4s, v24.4h, v1.h[3]
+ smlal2 v5.4s, v24.8h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v23.4h, v1.h[6]
+ smlal2 v5.4s, v23.8h, v1.h[6]
+ smlal v2.4s, v23.4h, v1.h[7]
+ smlal2 v3.4s, v23.8h, v1.h[7]
+ smlal v4.4s, v24.4h, v1.h[7]
+ smlal2 v5.4s, v24.8h, v1.h[7]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret x15
+
+L(\type\()_8tap_filter_8): // 8-tap horizontal filter of 8 pixels from each of two rows; results in v23.8h and v24.8h
+ ld1 {v4.8h, v5.8h}, [\sr2], \s_strd // v4-v5 = 16 pixels of the sr2 row; sr2 advances by s_strd
+ ld1 {v6.8h, v7.8h}, [\src], \s_strd // v6-v7 = 16 pixels of the src row; src advances by s_strd
+ smull v25.4s, v4.4h, v0.h[0] // sr2 row, outputs 0-3: tap 0
+ smull2 v26.4s, v4.8h, v0.h[0] // sr2 row, outputs 4-7: tap 0
+ smull v27.4s, v6.4h, v0.h[0] // src row, outputs 0-3: tap 0
+ smull2 v28.4s, v6.8h, v0.h[0] // src row, outputs 4-7: tap 0
+.irpc i, 1234567
+ ext v23.16b, v4.16b, v5.16b, #(2*\i) // sr2 row shifted by i pixels (v23 is scratch here)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i) // src row shifted by i pixels (v24 is scratch here)
+ smlal v25.4s, v23.4h, v0.h[\i] // sr2 row, outputs 0-3: plus tap i
+ smlal2 v26.4s, v23.8h, v0.h[\i] // sr2 row, outputs 4-7: plus tap i
+ smlal v27.4s, v24.4h, v0.h[\i] // src row, outputs 0-3: plus tap i
+ smlal2 v28.4s, v24.8h, v0.h[\i] // src row, outputs 4-7: plus tap i
+.endr
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
+ srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
+ srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
+ uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2
+ uzp1 v24.8h, v27.8h, v28.8h // Ditto
+ ret // returns via x30 (callers save their own return address in x15); v4-v7 and v25-v28 are overwritten
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_16bpc_neon, export=1 // Bilinear entry point: set up weights and shifts, then branch on mx/my
+.ifc \bdmax, w8
+ ldr w8, [sp] // in this configuration bdmax is read from the top of the stack into w8
+.endif
+ dup v1.8h, \mx // v1 = mx in every lane
+ dup v3.8h, \my // v3 = my in every lane
+ mov w10, #16
+ sub w9, w10, \mx // w9 = 16 - mx
+ sub w10, w10, \my // w10 = 16 - my
+ dup v0.8h, w9 // v0 = 16 - mx in every lane
+ dup v2.8h, w10 // v2 = 16 - my in every lane
+.ifc \type, prep
+ uxtw \d_strd, \w // prep: d_strd = w ...
+ lsl \d_strd, \d_strd, #1 // ... times 2
+.endif
+
+ clz \bdmax, \bdmax // bitdepth_max
+ clz w9, \w // w9 = clz(w)
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w11, #4
+ sub w9, w9, #24 // w9 = clz(w) - 24, the index later used with the jump tables
+ sub w11, w11, \bdmax // 4 - intermediate_bits
+ add w12, \bdmax, #4 // 4 + intermediate_bits
+ cbnz \mx, L(\type\()_bilin_h) // mx != 0: horizontal path (which itself checks my)
+ cbnz \my, L(\type\()_bilin_v) // mx == 0 and my != 0: vertical-only path
+ b \type\()_neon // mx == 0 and my == 0: branch to the routine of that name, defined elsewhere
+
+L(\type\()_bilin_h): // Horizontal entry: chain to hv when my is nonzero, otherwise dispatch on width
+ cbnz \my, L(\type\()_bilin_hv) // my != 0: continue in the hv path
+
+ adr x10, L(\type\()_bilin_h_tbl) // x10 = base of the per-width jump table
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1] // 16-bit entry at index x9 = clz(w) - 24
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.8h, \bdmax // intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8 // v29 = (PREP_BIAS >> 8) << 8
+.endif
+ sub x10, x10, w9, uxtw // entries store (table - label), so target = table - entry
+.ifc \type, put
+ neg v30.8h, v30.8h // -intermediate_bits
+.endif
+ br x10 // jump to the width-specific code (labels 20/40/80/...)
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.4h}, [\src], \s_strd
+ ld1 {v6.4h}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #2
+ ext v7.8b, v6.8b, v6.8b, #2
+ trn1 v4.2s, v4.2s, v6.2s
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ mul v4.4h, v4.4h, v0.4h
+ mla v4.4h, v5.4h, v1.4h
+ urshl v4.4h, v4.4h, v31.4h
+ urshl v4.4h, v4.4h, v30.4h
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ trn1 v4.2d, v4.2d, v6.2d
+ trn1 v5.2d, v5.2d, v7.2d
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ldr h5, [\src, #16]
+ ldr h7, [\sr2, #16]
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v5.16b, #2
+ ext v7.16b, v6.16b, v7.16b, #2
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ mul v6.8h, v6.8h, v0.8h
+ mla v6.8h, v7.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+ urshl v6.8h, v6.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+161:
+ ld1 {v16.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ mov \mx, \w
+
+16:
+ ld1 {v17.8h, v18.8h}, [\src], #32
+ ld1 {v22.8h, v23.8h}, [\sr2], #32
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v20.16b, v17.16b, v18.16b, #2
+ ext v24.16b, v21.16b, v22.16b, #2
+ ext v25.16b, v22.16b, v23.16b, #2
+ mul v16.8h, v16.8h, v0.8h
+ mla v16.8h, v19.8h, v1.8h
+ mul v17.8h, v17.8h, v0.8h
+ mla v17.8h, v20.8h, v1.8h
+ mul v21.8h, v21.8h, v0.8h
+ mla v21.8h, v24.8h, v1.8h
+ mul v22.8h, v22.8h, v0.8h
+ mla v22.8h, v25.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v21.8h, v21.8h, v31.8h
+ urshl v22.8h, v22.8h, v31.8h
+ subs \mx, \mx, #16
+.ifc \type, put
+ urshl v16.8h, v16.8h, v30.8h
+ urshl v17.8h, v17.8h, v30.8h
+ urshl v21.8h, v21.8h, v30.8h
+ urshl v22.8h, v22.8h, v30.8h
+.else
+ sub v16.8h, v16.8h, v29.8h
+ sub v17.8h, v17.8h, v29.8h
+ sub v21.8h, v21.8h, v29.8h
+ sub v22.8h, v22.8h, v29.8h
+.endif
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v21.8h, v22.8h}, [\ds2], #32
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v21.16b, v23.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v): // Vertical-only entry: dispatch on width
+ cmp \h, #4 // compare h with 4; nothing below up to br alters the flags
+ adr x10, L(\type\()_bilin_v_tbl) // x10 = base of the per-width jump table
+.ifc \type, prep
+ dup v31.8h, w11 // 4 - intermediate_bits
+.endif
+ ldrh w9, [x10, x9, lsl #1] // 16-bit entry at index x9 = clz(w) - 24
+.ifc \type, prep
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8 // v29 = (PREP_BIAS >> 8) << 8
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw // entries store (table - label), so target = table - entry
+ br x10 // jump to the width-specific code (labels 20/40/80/...)
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.s}[0], [\src], \s_strd
+ b.gt 24f
+22:
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ mul v4.4h, v16.4h, v2.4h
+ mla v4.4h, v17.4h, v3.4h
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst]
+ st1 {v4.s}[1], [\ds2]
+ ret
+24: // 2x4, 2x6, 2x8, ... v
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ ld1 {v19.s}[0], [\sr2], \s_strd
+ ld1 {v20.s}[0], [\src], \s_strd
+ sub \h, \h, #4
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ trn1 v18.2s, v18.2s, v19.2s
+ trn1 v19.2s, v19.2s, v20.2s
+ trn1 v16.2d, v16.2d, v18.2d
+ trn1 v17.2d, v17.2d, v19.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ cmp \h, #2
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ st1 {v4.s}[2], [\dst], \d_strd
+ st1 {v4.s}[3], [\ds2], \d_strd
+ b.lt 0f
+ mov v16.8b, v20.8b
+ b.eq 22b
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.4h}, [\src], \s_strd
+4:
+ ld1 {v17.4h}, [\sr2], \s_strd
+ ld1 {v18.4h}, [\src], \s_strd
+ trn1 v16.2d, v16.2d, v17.2d
+ trn1 v17.2d, v17.2d, v18.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8h}, [\src], \s_strd
+8:
+ ld1 {v17.8h}, [\sr2], \s_strd
+ ld1 {v18.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 0f
+ mov v16.16b, v18.16b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.8h, v17.8h}, [\src], \s_strd
+2:
+ ld1 {v18.8h, v19.8h}, [\sr2], \s_strd
+ ld1 {v20.8h, v21.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v18.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v19.8h, v3.8h
+ mul v6.8h, v18.8h, v2.8h
+ mla v6.8h, v20.8h, v3.8h
+ mul v7.8h, v19.8h, v2.8h
+ mla v7.8h, v21.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ urshr v6.8h, v6.8h, #4
+ urshr v7.8h, v7.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+ urshl v7.8h, v7.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+ sub v7.8h, v7.8h, v29.8h
+.endif
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b 2b
+9:
+ subs \w, \w, #16
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv):
+ adr x10, L(\type\()_bilin_hv_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.4s, w12 // 4 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.4s, v30.4s // -(4+intermediate_bits)
+.endif
+ br x10
+
+20: // 2xN hv
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.4h}, [\src], \s_strd
+ ext v21.8b, v20.8b, v20.8b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+2:
+ ld1 {v22.4h}, [\sr2], \s_strd
+ ld1 {v24.4h}, [\src], \s_strd
+ ext v23.8b, v22.8b, v22.8b, #2
+ ext v25.8b, v24.8b, v24.8b, #2
+ trn1 v22.2s, v22.2s, v24.2s
+ trn1 v23.2s, v23.2s, v25.2s
+ mul v17.4h, v22.4h, v0.4h
+ mla v17.4h, v23.4h, v1.4h
+ urshl v17.4h, v17.4h, v31.4h
+
+ trn1 v16.2s, v16.2s, v17.2s
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ urshl v4.4s, v4.4s, v30.4s
+ xtn v4.4h, v4.4s
+ subs \h, \h, #2
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ AARCH64_VALID_JUMP_TARGET
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v20.16b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+4:
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v22.16b, #2
+ ext v25.16b, v24.16b, v24.16b, #2
+ trn1 v22.2d, v22.2d, v24.2d
+ trn1 v23.2d, v23.2d, v25.2d
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+
+ trn1 v16.2d, v16.2d, v17.2d
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ldr h21, [\src, #16]
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v21.16b, #2
+ mul v16.8h, v20.8h, v0.8h
+ mla v16.8h, v21.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+
+2:
+ ldr h23, [\sr2, #16]
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ldr h25, [\src, #16]
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v23.16b, #2
+ ext v25.16b, v24.16b, v25.16b, #2
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ mul v18.8h, v24.8h, v0.8h
+ mla v18.8h, v25.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v18.8h, v18.8h, v31.8h
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+ umull v6.4s, v17.4h, v2.4h
+ umlal v6.4s, v18.4h, v3.4h
+ umull2 v7.4s, v17.8h, v2.8h
+ umlal2 v7.4s, v18.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ urshl v6.4s, v6.4s, v30.4s
+ urshl v7.4s, v7.4s, v30.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+ uzp1 v5.8h, v6.8h, v7.8h // Ditto
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ rshrn v5.4h, v6.4s, #4
+ rshrn2 v5.8h, v7.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
+.macro load_filter_row dst, src, inc // load the filter row selected by position \src into \dst, then step \src by \inc
+ asr w13, \src, #10 // row index = \src >> 10 (arithmetic shift, the index is treated as signed); clobbers w13
+ add \src, \src, \inc // advance the position for the next invocation
+ ldr \dst, [x11, w13, sxtw #3] // load from x11 + 8 * index; x11 must already hold the table base
+.endm
+
+function warp_filter_horz_neon // In: x2 = source row, x3 = row stride, w5 = filter position, w7/w8 = position steps, x11 = filter table base, v14 = shift. Out: 8 filtered sums in v16.4s/v17.4s
+ add w12, w5, #512 // working copy of the position, offset by 512
+
+ ld1 {v16.8h, v17.8h}, [x2], x3 // 16 source pixels; x2 then moves on by x3 to the next row
+
+ load_filter_row d0, w12, w7 // one 8-tap int8 filter per output pixel 0..7, position stepped by w7 each time
+ load_filter_row d1, w12, w7
+ load_filter_row d2, w12, w7
+ sxtl v0.8h, v0.8b // widen taps to int16; the widening is interleaved with the remaining loads
+ load_filter_row d3, w12, w7
+ sxtl v1.8h, v1.8b
+ load_filter_row d4, w12, w7
+ sxtl v2.8h, v2.8b
+ load_filter_row d5, w12, w7
+ sxtl v3.8h, v3.8b
+ load_filter_row d6, w12, w7
+ sxtl v4.8h, v4.8b
+ load_filter_row d7, w12, w7
+ sxtl v5.8h, v5.8b
+ ext v18.16b, v16.16b, v17.16b, #2*1 // 8-pixel source window starting at pixel 1
+ smull v8.4s, v16.4h, v0.4h // pixel 0: taps 0-3 times window 0
+ smull2 v9.4s, v16.8h, v0.8h // pixel 0: taps 4-7
+ sxtl v6.8h, v6.8b
+ ext v19.16b, v16.16b, v17.16b, #2*2 // window starting at pixel 2
+ smull v10.4s, v18.4h, v1.4h // pixel 1 products
+ smull2 v11.4s, v18.8h, v1.8h
+ sxtl v7.8h, v7.8b
+ ext v20.16b, v16.16b, v17.16b, #2*3 // window starting at pixel 3
+ smull v0.4s, v19.4h, v2.4h // pixel 2 products (v0/v1 are free again: their taps were consumed above)
+ smull2 v1.4s, v19.8h, v2.8h
+ ext v21.16b, v16.16b, v17.16b, #2*4 // window starting at pixel 4
+ addp v8.4s, v8.4s, v9.4s // first pairwise reduction: pixel 0 now has 4 partial sums
+ smull v2.4s, v20.4h, v3.4h // pixel 3 products
+ smull2 v3.4s, v20.8h, v3.8h
+ ext v22.16b, v16.16b, v17.16b, #2*5 // window starting at pixel 5
+ addp v9.4s, v10.4s, v11.4s // pixel 1 partial sums
+ smull v10.4s, v21.4h, v4.4h // pixel 4 products
+ smull2 v11.4s, v21.8h, v4.8h
+ ext v23.16b, v16.16b, v17.16b, #2*6 // window starting at pixel 6
+ addp v0.4s, v0.4s, v1.4s // pixel 2 partial sums
+ smull v18.4s, v22.4h, v5.4h // pixel 5 products
+ smull2 v19.4s, v22.8h, v5.8h
+ ext v16.16b, v16.16b, v17.16b, #2*7 // window starting at pixel 7 (overwrites the row data in v16)
+ addp v1.4s, v2.4s, v3.4s // pixel 3 partial sums
+ addp v2.4s, v10.4s, v11.4s // pixel 4 partial sums
+ smull v20.4s, v23.4h, v6.4h // pixel 6 products
+ smull2 v21.4s, v23.8h, v6.8h
+ addp v3.4s, v18.4s, v19.4s // pixel 5 partial sums
+ smull v22.4s, v16.4h, v7.4h // pixel 7 products
+ smull2 v23.4s, v16.8h, v7.8h
+ addp v4.4s, v20.4s, v21.4s // pixel 6 partial sums
+ addp v5.4s, v22.4s, v23.4s // pixel 7 partial sums
+
+ addp v8.4s, v8.4s, v9.4s // second reduction: two partial sums each for pixels 0,1
+ addp v0.4s, v0.4s, v1.4s // pixels 2,3
+ addp v2.4s, v2.4s, v3.4s // pixels 4,5
+ addp v4.4s, v4.4s, v5.4s // pixels 6,7
+
+ addp v16.4s, v8.4s, v0.4s // final reduction: full sums for pixels 0-3
+ addp v17.4s, v2.4s, v4.4s // full sums for pixels 4-7
+
+ add w5, w5, w8 // step the caller-visible position by w8 for the next row
+
+ srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
+ srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)
+
+ ret
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
+.macro warp t // blank \t: output is clamped to bitdepth_max; non-blank \t: 16-bit output with the v15 bias subtracted
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]! // save d8-d15 in a new 64-byte frame; the code below writes v8-v11 and v13-v15
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+.ifb \t
+ dup v15.8h, w7 // bitdepth_max
+.else
+ movi v15.8h, #(PREP_BIAS >> 8), lsl #8 // (PREP_BIAS >> 8) << 8 in every lane
+.endif
+ clz w7, w7 // clz(bitdepth_max)
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub w7, w7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg w8, w8 // -(7 + intermediate_bits)
+.endif
+ dup v14.4s, w7 // -(7 - intermediate_bits)
+.ifb \t
+ dup v13.4s, w8 // -(7 + intermediate_bits)
+.endif
+
+ ldr x4, [x4] // fetch four int16 values from abcd with one 64-bit load
+ sbfx x7, x4, #0, #16 // x7 = abcd[0]: horizontal position step per pixel
+ sbfx x8, x4, #16, #16 // x8 = abcd[1]: horizontal position step per row
+ sbfx x9, x4, #32, #16 // x9 = abcd[2]: vertical position step per column
+ sbfx x4, x4, #48, #16 // x4 = abcd[3]: vertical position step per row
+ mov w10, #8 // output row counter
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3 // src -= 3 * src_stride
+ sub x2, x2, #6 // src -= 3 pixels (2 bytes each)
+ movrel x11, X(mc_warp_filter), 64*8 // table base used by load_filter_row, offset by 64*8 bytes
+ mov x15, x30 // keep the return address; the bl calls below overwrite x30
+.ifnb \t
+ lsl x1, x1, #1 // double the stride for the \t variant (presumably passed in elements - confirm with the C prototype)
+.endif
+
+ bl warp_filter_horz_neon // prime the 8-row window: horizontally filtered rows 0-6 go to v24-v30
+ uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2
+ bl warp_filter_horz_neon
+ uzp1 v25.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v26.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v27.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v28.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v29.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v30.8h, v16.8h, v17.8h // Ditto
+
+1:
+ add w14, w6, #512 // vertical filter position for column 0 of this output row
+ bl warp_filter_horz_neon // newest (8th) row of the window
+ uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2
+
+ load_filter_row d0, w14, w9 // one vertical filter per output column, position stepped by abcd[2]
+ load_filter_row d1, w14, w9
+ load_filter_row d2, w14, w9
+ load_filter_row d3, w14, w9
+ load_filter_row d4, w14, w9
+ load_filter_row d5, w14, w9
+ load_filter_row d6, w14, w9
+ load_filter_row d7, w14, w9
+ transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // defined elsewhere; the use below expects vK to hold tap K for all 8 columns as int16
+
+ // This ordering of smull/smlal/smull2/smlal2 is highly
+ // beneficial for Cortex A53 here.
+ smull v16.4s, v24.4h, v0.4h // columns 0-3: sum over the 8 window rows of row * tap
+ smlal v16.4s, v25.4h, v1.4h
+ smlal v16.4s, v26.4h, v2.4h
+ smlal v16.4s, v27.4h, v3.4h
+ smlal v16.4s, v28.4h, v4.4h
+ smlal v16.4s, v29.4h, v5.4h
+ smlal v16.4s, v30.4h, v6.4h
+ smlal v16.4s, v31.4h, v7.4h
+ smull2 v17.4s, v24.8h, v0.8h // columns 4-7
+ smlal2 v17.4s, v25.8h, v1.8h
+ smlal2 v17.4s, v26.8h, v2.8h
+ smlal2 v17.4s, v27.8h, v3.8h
+ smlal2 v17.4s, v28.8h, v4.8h
+ smlal2 v17.4s, v29.8h, v5.8h
+ smlal2 v17.4s, v30.8h, v6.8h
+ smlal2 v17.4s, v31.8h, v7.8h
+
+ mov v24.16b, v25.16b // slide the row window up by one; the moves are interleaved with the output math
+ mov v25.16b, v26.16b
+.ifb \t
+ srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
+ srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
+.else
+ rshrn v16.4h, v16.4s, #7 // rounding shift right by 7 and narrow to 16 bit
+ rshrn2 v16.8h, v17.4s, #7
+.endif
+ mov v26.16b, v27.16b
+.ifb \t
+ sqxtun v16.4h, v16.4s // saturating narrow to unsigned 16 bit (negative sums become 0)
+ sqxtun2 v16.8h, v17.4s
+.else
+ sub v16.8h, v16.8h, v15.8h // PREP_BIAS
+.endif
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+.ifb \t
+ umin v16.8h, v16.8h, v15.8h // bitdepth_max
+.endif
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ subs w10, w10, #1 // sets the flags tested by b.gt below; st1 and add leave them untouched
+ st1 {v16.8h}, [x0], x1 // store one 8-wide output row and advance dst by the stride
+
+ add w6, w6, w4 // vertical position += abcd[3] for the next row
+ b.gt 1b
+
+ ldp d14, d15, [sp, #0x30] // restore d8-d15 and release the frame
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+
+ ret x15 // return through the address saved from x30 at entry
+endfunc
+.endm
+
+warp // emits warp_affine_8x8_16bpc_neon (blank \t)
+warp t // emits warp_affine_8x8t_16bpc_neon (\t = t)
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1 // builds a bw x bh block in dst from ref, replicating edge pixels where the block lies outside the iw x ih image
+ ldp x8, x9, [sp] // x8 = ref, x9 = ref_stride: the 9th and 10th arguments are read from the stack
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6 // dst += top_ext * dst_stride
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right // emits the loop over the center_h rows; the two flags select whether left/right padding code is generated
+0:
+.if \need_left
+ ld1r {v0.8h}, [x8] // replicate the first source pixel of this row
+ mov x12, x6 // out = dst
+ mov x3, x4 // remaining left_ext pixels
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16 // 16 pixels per store, so this can write past left_ext; the copy below starts at dst + left_ext
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+ mov x13, x8 // in = ref row
+ add x12, x6, x4, lsl #1 // out = dst + left_ext
+ mov x3, x2 // remaining center_w pixels
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 // copy 32 pixels per iteration; the count is rounded up to a multiple of 32
+ subs x3, x3, #32
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2, lsl #1 // in + center_w
+ sub x3, x3, #2 // in + center_w - 1
+ add x12, x6, x4, lsl #1 // dst + left_ext
+ ld1r {v0.8h}, [x3] // replicate the last valid source pixel of this row
+ add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w
+ mov x3, x11 // remaining right_ext pixels
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16 // 16 pixels per store, so the count is rounded up to a multiple of 16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7 // next dst row
+ add x8, x8, x9 // next ref row
+ b.gt 0b
+.endm
+
+ cbz x4, 2f // dispatch on left_ext / right_ext to one of four specialised row loops
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f // no bottom extension needed
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0 // column counter = bw
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 // 32 pixels of the last written row
+ mov x3, x10 // rows to fill = bottom_ext
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // replicate that strip downwards
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f // no top extension needed
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 // 32 pixels of the first center row
+ mov x3, x5 // rows to fill = top_ext
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // replicate that strip over the top rows
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
new file mode 100644
index 0000000000..3a6cf900a9
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 8
+#define DIF 16
+#define RNG 24
+#define CNT 28
+#define ALLOW_UPDATE_CDF 32
+
+const coeffs // 4 * (15 - i) for i = 0..15, then 16 zero halfwords; indexed backwards from entry 15 by n_symbols
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits // halfword i holds 1 << i: turns a per-lane compare mask into one bit per lane
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro ld1_n d0, d1, src, sz, n // load \n elements from [\src]: one register when \n <= 8, otherwise the pair \d0, \d1
+.if \n <= 8
+ ld1 {\d0\sz}, [\src]
+.else
+ ld1 {\d0\sz, \d1\sz}, [\src]
+.endif
+.endm
+
+.macro st1_n s0, s1, dst, sz, n // store \n elements to [\dst]: one register when \n <= 8, otherwise the pair \s0, \s1
+.if \n <= 8
+ st1 {\s0\sz}, [\dst]
+.else
+ st1 {\s0\sz, \s1\sz}, [\dst]
+.endif
+.endm
+
+.macro ushr_n d0, d1, s0, s1, shift, sz, n // \d0 = \s0 >> \shift (unsigned); the second register pair is processed only when \n == 16
+ ushr \d0\sz, \s0\sz, \shift
+.if \n == 16
+ ushr \d1\sz, \s1\sz, \shift
+.endif
+.endm
+
+.macro add_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = \s0 + \s2; when \n == 16 also \d1 = \s1 + \s3
+ add \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ add \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sub_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = \s0 - \s2; when \n == 16 also \d1 = \s1 - \s3
+ sub \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sub \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro and_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = \s0 & \s2; when \n == 16 also \d1 = \s1 & \s3
+ and \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ and \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = all-ones per lane where \s0 >= \s2 (unsigned); when \n == 16 also \d1 from \s1, \s3
+ cmhs \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ cmhs \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = unsigned rounding halving add of \s0 and \s2; when \n == 16 also \d1 from \s1, \s3
+ urhadd \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ urhadd \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = \s0 shifted by the signed per-lane amounts in \s2 (negative = right); when \n == 16 also \d1 from \s1, \s3
+ sshl \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sshl \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n // \d0 = high half of the saturating doubled product \s0 * \s2; when \n == 16 also \d1 from \s1, \s3
+ sqdmulh \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sqdmulh \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro str_n idx0, idx1, dstreg, dstoff, n // store \idx0 at [\dstreg, \dstoff]; when \n == 16 also store \idx1 16 bytes further on
+ str \idx0, [\dstreg, \dstoff]
+.if \n == 16
+ str \idx1, [\dstreg, \dstoff + 16]
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1 // x0 = s, x1 = cdf, x2 = n_symbols; returns the decoded symbol index in w0
+.macro decode_update sz, szb, n // front half shared by adapt4/8/16: finds the symbol (w15) and, if enabled, adapts the cdf; falls into or branches to L(renorm)
+ sub sp, sp, #48 // scratch area: u at sp+14, v[] from sp+16; released by the add sp at the end of the renorm path
+ add x8, x0, #RNG
+ ld1_n v0, v1, x1, \sz, \n // cdf
+ ld1r {v4\sz}, [x8] // rng
+ movrel x9, coeffs, 30 // address of coeffs[15]
+ movi v31\sz, #0x7f, lsl #8 // 0x7f00
+ sub x9, x9, x2, lsl #1 // &coeffs[15 - n_symbols]: the vector loaded from here starts at 4 * n_symbols
+ mvni v30\sz, #0x3f // 0xffc0
+ and v7\szb, v4\szb, v31\szb // rng & 0x7f00
+ str h4, [sp, #14] // store original u = s->rng
+ and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
+
+ ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
+ sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add x8, x0, #DIF + 6 // address of the last two bytes of the 8-byte dif field
+
+ add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
+ movrel x8, bits
+ str_n q4, q5, sp, #16, \n // store v values to allow indexed access
+
+ ld1_n v16, v17, x8, .8h, \n // per-lane bit weights from the bits table
+
+ cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v
+
+ and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask
+.if \n == 16
+ add v6.8h, v6.8h, v7.8h // merge both halves; their bit weights are disjoint
+.endif
+ addv h6, v6.8h // Aggregate mask bits
+ ldr w4, [x0, #ALLOW_UPDATE_CDF]
+ umov w3, v6.h[0]
+ rbit w3, w3 // rbit + clz = count of trailing zeros, i.e. the first lane with c >= v
+ clz w15, w3 // ret
+
+ cbz w4, L(renorm) // cdf adaptation disabled: skip straight to renormalisation
+ // update_cdf
+ ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
+ movi v5\szb, #0xff // all-ones lanes (0xffff per halfword)
+.if \n == 16
+ mov w4, #-5
+.else
+ mvn w14, w2
+ mov w4, #-4
+ cmn w14, #3 // set C if n_symbols <= 2
+.endif
+ urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr w14, w3, #4 // count >> 4
+ sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6\sz, w4 // -rate
+
+ sub w3, w3, w3, lsr #5 // count - (count == 32)
+ sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
+ sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
+ add w3, w3, #1 // count + (count < 32)
+ add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate
+ st1_n v0, v1, x1, \sz, \n // write the adapted cdf back
+ strh w3, [x1, x2, lsl #1] // write the updated count back
+.endm
+
+ decode_update .4h, .8b, 4
+
+L(renorm): // in: w15 = symbol index, v[] and u in the stack scratch area, x0 = s
+ add x8, sp, #16
+ add x8, x8, w15, uxtw #1 // &v[ret]
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+L(renorm2): // in: w4 = new rng, w5 = d, w6 = cnt, x7 = ~dif, w15 = return value; the caller has already done sub sp, sp, #48
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ mvn x7, x7 // ~dif
+ b.hs 9f // no borrow from cnt - d: no refill needed
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f // buf_pos + 8 is beyond buf_end: take the byte-wise path
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f // buf_pos reached buf_end: stop reading
+ ldrb w8, [x3], #1 // next input byte, buf_pos++
+ lsl x8, x8, x5
+ eor x7, x7, x8 // dif ^= byte << c
+ subs w5, w5, #8 // c -= 8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ str w6, [x0, #CNT]
+ str x7, [x0, #DIF]
+
+ mov w0, w15 // return value
+ add sp, sp, #48 // release the scratch area
+ ret
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1 // x0 = s, x1 = cdf, x2 = n_symbols; works on one 8-halfword vector
+ decode_update .8h, .16b, 8
+ b L(renorm) // tail-jump into the shared renormalisation code, which also releases the stack scratch and returns
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1 // x0 = s, x1 = cdf, x2 = n_symbols; works on two 8-halfword vectors
+ decode_update .8h, .16b, 16
+ b L(renorm) // tail-jump into the shared renormalisation code, which also releases the stack scratch and returns
+endfunc
+
+function msac_decode_hi_tok_neon, export=1 // x0 = s, x1 = cdf; decodes symbols in a loop (label 1) with all state kept in registers, returns the token in w0
+ ld1 {v0.4h}, [x1] // cdf
+ add x16, x0, #RNG
+ movi v31.4h, #0x7f, lsl #8 // 0x7f00
+ movrel x17, coeffs, 30-2*3 // &coeffs[15 - 3]: n_symbols is fixed at 3 here
+ mvni v30.4h, #0x3f // 0xffc0
+ ldrh w9, [x1, #6] // count = cdf[n_symbols]
+ ld1r {v3.4h}, [x16] // rng
+ movrel x16, bits
+ ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
+ add x17, x0, #DIF + 6 // address of the last two bytes of the 8-byte dif field
+ ld1 {v16.8h}, [x16] // per-lane bit weights
+ mov w13, #-24 // running accumulator; it gains 5 + (2 * ret - 5) per iteration and becomes the result
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+ ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48 // scratch area: u at sp+14, v[] from sp+16
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+1:
+ and v7.8b, v3.8b, v31.8b // rng & 0x7f00
+ sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ str h3, [sp, #14] // store original u = s->rng
+ cmhs v2.8h, v1.8h, v4.8h // c >= v
+ str q4, [sp, #16] // store v values to allow indexed access
+ and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask
+ addv h6, v6.8h // Aggregate mask bits
+ umov w3, v6.h[0]
+ add w13, w13, #5
+ rbit w3, w3 // rbit + clz = count of trailing zeros, i.e. the first lane with c >= v
+ add x8, sp, #16
+ clz w15, w3 // ret
+
+ cbz w10, 2f // cdf adaptation disabled
+ // update_cdf
+ movi v5.8b, #0xff
+ mov w4, #-5
+ urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768
+ sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
+ sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6.4h, w4 // -rate
+
+ sub w9, w9, w9, lsr #5 // count - (count == 32)
+ sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0)
+ sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
+ add w9, w9, #1 // count + (count < 32)
+ add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate
+ st1 {v0.4h}, [x1]
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ strh w9, [x1, #6]
+
+2:
+ add x8, x8, w15, uxtw #1 // &v[ret]
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ dup v3.4h, w4 // broadcast the new rng for the next loop iteration
+ mvn x7, x7 // ~dif
+ b.hs 9f // no borrow from cnt - d: no refill needed
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f // buf_pos + 8 is beyond buf_end: take the byte-wise path
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f // buf_pos reached buf_end: stop reading
+ ldrb w8, [x3], #1 // next input byte, buf_pos++
+ lsl x8, x8, x5
+ eor x7, x7, x8 // dif ^= byte << c
+ subs w5, w5, #8 // c -= 8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ lsl w15, w15, #1
+ sub w15, w15, #5 // 2 * ret - 5
+ lsr x12, x7, #48 // top 16 bits of the updated dif
+ adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
+ dup v1.8h, w12 // broadcast them as c for the next iteration
+ b.cc 1b // loop if !carry
+ add w13, w13, #30
+ str w6, [x0, #CNT] // cnt and dif are written back only once, after the loop
+ add sp, sp, #48 // release the scratch area
+ str x7, [x0, #DIF]
+ lsr w0, w13, #1 // return (accumulator + 30) >> 1
+ ret
+endfunc
+
+function msac_decode_bool_equi_neon, export=1 // x0 = s; decodes one bit with a fixed split derived from rng alone, returns it in w0 via L(renorm2)
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48 // matches the add sp, sp, #48 at the end of the shared L(renorm2) path
+ ldr x7, [x0, #DIF]
+ bic w4, w5, #0xff // r &= 0xff00
+ add w4, w4, #8 // w4 = 2 * v
+ subs x8, x7, x4, lsl #47 // dif - vw
+ lsr w4, w4, #1 // v
+ sub w5, w5, w4 // r - v
+ cset w15, lo // return value = (dif < vw); the comments below call the opposite condition (hs) ret
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2) // shared renormalise, refill, store and return (w0 = w15)
+endfunc
+
+function msac_decode_bool_neon, export=1 // x0 = s, w1 = f (probability); decodes one bit, returns it in w0 via L(renorm2)
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48 // matches the add sp, sp, #48 at the end of the shared L(renorm2) path
+ ldr x7, [x0, #DIF]
+ lsr w4, w5, #8 // r >> 8
+ bic w1, w1, #0x3f // f &= ~63
+ mul w4, w4, w1 // (r >> 8) * f
+ lsr w4, w4, #7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw
+ sub w5, w5, w4 // r - v
+ cset w15, lo // return value = (dif < vw); the comments below call the opposite condition (hs) ret
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2) // shared renormalise, refill, store and return (w0 = w15)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1 // x0 = s, x1 = cdf (cdf[0] = probability, cdf[1] = count); decodes one bit and optionally adapts the cdf
+ ldr w9, [x1] // cdf[0-1]
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48 // matches the add sp, sp, #48 at the end of the shared L(renorm2) path
+ ldr x7, [x0, #DIF]
+ lsr w4, w5, #8 // r >> 8
+ and w2, w9, #0xffc0 // f &= ~63
+ mul w4, w4, w2 // (r >> 8) * f
+ lsr w4, w4, #7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw
+ sub w5, w5, w4 // r - v
+ cset w15, lo // return value = (dif < vw); the comments below call the opposite condition (hs) ret
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+
+ cbz w10, L(renorm2) // adaptation disabled: go straight to the shared tail
+
+ lsr w2, w9, #16 // count = cdf[1]
+ and w9, w9, #0xffff // cdf[0]
+
+ sub w3, w2, w2, lsr #5 // count - (count >= 32)
+ lsr w2, w2, #4 // count >> 4
+ add w10, w3, #1 // count + (count < 32)
+ add w2, w2, #4 // rate = (count >> 4) | 4
+
+ sub w9, w9, w15 // cdf[0] -= bit
+ sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub w9, w9, w11 // cdf[0]
+
+ strh w9, [x1] // write back the adapted probability
+ strh w10, [x1, #2] // write back the updated count
+
+ b L(renorm2) // shared renormalise, refill, store and return (w0 = w15)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S
new file mode 100644
index 0000000000..e905682f47
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
+
+function splat_mv_neon, export=1 // x0 = rr, x1 = rmv, w2 = bx4, w3 = bw4, w4 = bh4; copies one 12-byte block bw4 times into each of bh4 rows
+ ld1 {v3.16b}, [x1] // load 16 bytes from rmv; only the first 12 are replicated below
+ clz w3, w3 // clz(bw4)
+ adr x5, L(splat_tbl)
+ sub w3, w3, #26 // table index: 0 for bw4 = 32 up to 5 for bw4 = 1
+ ext v2.16b, v3.16b, v3.16b, #12 // v2 = source bytes 12-15 followed by bytes 0-11
+ ldrh w3, [x5, w3, uxtw #1] // distance from the table back to the handler
+ add w2, w2, w2, lsl #1 // bx4 * 3
+ ext v0.16b, v2.16b, v3.16b, #4 // v0 = source bytes 0-11, 0-3
+ sub x3, x5, w3, uxtw // x3 = address of the handler for this width
+ ext v1.16b, v2.16b, v3.16b, #8 // v1 = source bytes 4-11, 0-7
+ lsl w2, w2, #2 // bx4 * 12 = byte offset of the first block in a row
+ ext v2.16b, v2.16b, v3.16b, #12 // v2 = source bytes 8-11, 0-11; v0-v2 now hold four copies of the 12-byte pattern
+1: // per-row loop
+ ldr x1, [x0], #8 // next row pointer from rr
+ subs w4, w4, #1 // bh4--; the flags are consumed by the b.gt after the stores
+ add x1, x1, x2 // address of the first block to write in this row
+ br x3
+
+10: // bw4 == 1: 12 bytes
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x1]
+ str s2, [x1, #8]
+ b.gt 1b
+ ret
+20: // bw4 == 2: 24 bytes
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x1]
+ str d1, [x1, #16]
+ b.gt 1b
+ ret
+320: // bw4 == 32: falls through the 160, 80 and 40 cases, 8 * 48 bytes in total
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+160: // bw4 == 16
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+80: // bw4 == 8
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+40: // bw4 == 4: 48 bytes
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1]
+ b.gt 1b
+ ret
+
+L(splat_tbl): // handler offsets, indexed by clz(bw4) - 26
+ .hword L(splat_tbl) - 320b
+ .hword L(splat_tbl) - 160b
+ .hword L(splat_tbl) - 80b
+ .hword L(splat_tbl) - 40b
+ .hword L(splat_tbl) - 20b
+ .hword L(splat_tbl) - 10b
+endfunc
+
+const mv_tbls, align=4 // four 16-byte tbl permutations, selected by the case value computed in save_tmvs_neon
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // case 0: every index out of range, so tbl yields all zeros
+ .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 // case 1: source bytes 0-3 and byte 8 as a repeating 5-byte pattern
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 // case 2: source bytes 4-7 and byte 9 as a repeating 5-byte pattern
+ .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 // case 3: same permutation as case 2
+endconst
+
+const mask_mult, align=4 // per-byte multipliers applied to ref_sign[ref] by the umull in save_tmvs_neon
+ .byte 1, 2, 1, 2, 0, 0, 0, 0 // weights {1, 2} for each pair of refs; the upper four bytes are zero
+endconst
+
+// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
+// refmvs_block **rr, const uint8_t *ref_sign,
+// int col_end8, int row_end8,
+// int col_start8, int row_start8)
+function save_tmvs_neon, export=1 // x0 = rp, x1 = stride, x2 = rr, x3 = ref_sign, w4 = col_end8, w5 = row_end8, w6 = col_start8, w7 = row_start8
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]! // x30 must be saved because the width handlers are reached with blr
+ mov x29, sp
+
+ movi v30.8b, #0
+ ld1 {v31.8b}, [x3] // load 8 bytes of ref_sign
+ adr x8, L(save_tmvs_tbl)
+ movrel x16, mask_mult
+ movrel x13, mv_tbls
+ ld1 {v29.8b}, [x16] // v29 = mask_mult
+ ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
+ mov w15, #5 // 5 = size in bytes of one output entry
+ mov w14, #12*2 // 24 = byte step for one unit of x (two 12-byte blocks)
+ sxtw x4, w4
+ sxtw x6, w6
+ mul w1, w1, w15 // stride *= 5
+ sub w5, w5, w7 // h = row_end8 - row_start8
+ lsl w7, w7, #1 // row_start8 <<= 1
+1: // per-row loop
+ mov w15, #5 // reloaded every row because x15 is overwritten inside the loop
+ and w9, w7, #30 // (y & 15) * 2
+ ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
+ add x9, x9, #12 // &b[... + 1]
+ madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
+ madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]
+
+ madd x3, x6, x15, x0 // &rp[x]
+
+2: // per-candidate loop; up to two candidates are handled per iteration
+ ldrb w11, [x9, #10] // cand_b->bs
+ ld1 {v0.16b}, [x9] // cand_b->mv
+ add x11, x8, w11, uxtw #2 // table entry for this block size (4 bytes per entry)
+ ldr h1, [x9, #8] // cand_b->ref
+ ldrh w12, [x11] // bw8
+ mov x15, x8 // default to the first table entry so the later ldrh via x15 is valid without a second candidate
+ add x9, x9, w12, uxtw #1 // cand_b += bw8*2
+ cmp x9, x10 // these flags are still live at the b.ge after blr x11
+ mov v2.8b, v0.8b // both mvs of the first candidate
+ b.ge 3f
+
+ ldrb w15, [x9, #10] // cand_b->bs
+ add x16, x9, #8
+ ld1 {v4.16b}, [x9] // cand_b->mv
+ add x15, x8, w15, uxtw #2 // table entry for the second candidate
+ ld1 {v1.h}[1], [x16] // cand_b->ref
+ ldrh w12, [x15] // bw8
+ add x9, x9, w12, uxtw #1 // cand_b += bw8*2
+ trn1 v2.2d, v0.2d, v4.2d // mvs of the first candidate in the low half, of the second in the high half
+
+3:
+ abs v2.8h, v2.8h // abs(mv[].xy)
+ tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
+ ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12
+ umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2}
+ cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) < 4096 (x and y tested together per 32-bit mv)
+ xtn v2.4h, v2.4s // abs() condition to 16 bit
+ and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1]
+ addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0]
+ umov w16, v1.h[0] // Extract case for first block
+ umov w17, v1.h[1] // Extract case for second block
+ ldrh w11, [x11, #2] // Fetch jump table entry
+ ldrh w15, [x15, #2]
+ ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case
+ ldr q5, [x13, w17, uxtw #4]
+ sub x11, x8, w11, uxtw // Find jump table target
+ sub x15, x8, w15, uxtw
+ tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
+ tbl v4.16b, {v4.16b}, v5.16b
+
+ // v1 follows on v0, with another 3 full repetitions of the pattern.
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v5.16b, v4.16b, v4.16b, #1
+ // v2 ends with 3 complete repetitions of the pattern.
+ ext v2.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v4.16b, v5.16b, #4
+
+ blr x11 // store the first candidate; the handlers leave the flags untouched
+ b.ge 4f // if (cand_b >= end)
+ mov v0.16b, v4.16b // move the second candidate into the registers the handlers read
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ cmp x9, x10
+ blr x15 // store the second candidate
+ b.lt 2b // if (cand_b < end)
+
+4:
+ subs w5, w5, #1 // h--
+ add w7, w7, #2 // y += 2
+ add x0, x0, x1 // rp += stride
+ b.gt 1b
+
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+10: // bw8 == 1: write one 5-byte entry
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #4
+ st1 {v0.s}[0], [x3]
+ st1 {v0.b}[4], [x16]
+ add x3, x3, #5
+ ret
+20: // bw8 == 2: write two entries (10 bytes)
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #8
+ st1 {v0.d}[0], [x3]
+ st1 {v0.h}[4], [x16]
+ add x3, x3, #2*5
+ ret
+40: // bw8 == 4: write four entries (20 bytes)
+ AARCH64_VALID_CALL_TARGET
+ st1 {v0.16b}, [x3]
+ str s1, [x3, #16]
+ add x3, x3, #4*5
+ ret
+80: // bw8 == 8: write eight entries (40 bytes)
+ AARCH64_VALID_CALL_TARGET
+ // This writes 6 full entries plus 2 extra bytes
+ st1 {v0.16b, v1.16b}, [x3]
+ // Write the last few, overlapping with the first write.
+ stur q2, [x3, #(8*5-16)]
+ add x3, x3, #8*5
+ ret
+160: // bw8 == 16: write sixteen entries (80 bytes)
+ AARCH64_VALID_CALL_TARGET
+ add x16, x3, #6*5
+ add x17, x3, #12*5
+ // This writes 6 full entries plus 2 extra bytes
+ st1 {v0.16b, v1.16b}, [x3]
+ // Write another 6 full entries, slightly overlapping with the first set
+ st1 {v0.16b, v1.16b}, [x16]
+ // Write 8 bytes (one full entry) after the first 12
+ st1 {v0.8b}, [x17]
+ // Write the last 3 entries
+ str q2, [x3, #(16*5-16)]
+ add x3, x3, #16*5
+ ret
+
+L(save_tmvs_tbl): // indexed by cand_b->bs; each entry is two halfwords
+ .hword 16 * 12 // first halfword: half of the cand_b byte advance (it is applied with a shift of 1)
+ .hword L(save_tmvs_tbl) - 160b // second halfword: distance from the table back to the width handler
+ .hword 16 * 12
+ .hword L(save_tmvs_tbl) - 160b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 8 * 12
+ .hword L(save_tmvs_tbl) - 80b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 4 * 12
+ .hword L(save_tmvs_tbl) - 40b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 2 * 12
+ .hword L(save_tmvs_tbl) - 20b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+ .hword 1 * 12
+ .hword L(save_tmvs_tbl) - 10b
+endfunc
diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S
new file mode 100644
index 0000000000..9013fd4b1e
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/util.S
@@ -0,0 +1,229 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_64_UTIL_S
+#define DAV1D_SRC_ARM_64_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
+.macro movrel rd, val, offset=0 // load the address of symbol val plus offset into rd, using the form suited to the target
+#if defined(__APPLE__)
+ .if \offset < 0 // negative offset: form the address of val first, then subtract
+ adrp \rd, \val@PAGE
+ add \rd, \rd, \val@PAGEOFF
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)@PAGE
+ add \rd, \rd, \val+(\offset)@PAGEOFF
+ .endif
+#elif defined(PIC) && defined(_WIN32)
+ .if \offset < 0 // negative offset: form the address of val first, then subtract
+ adrp \rd, \val
+ add \rd, \rd, :lo12:\val
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+ .endif
+#elif defined(PIC)
+ adrp \rd, \val+(\offset) // page address of the target
+ add \rd, \rd, :lo12:\val+(\offset) // plus the low 12 bits within the page
+#else
+ ldr \rd, =\val+\offset // non-PIC build: load the absolute address from a literal pool
+#endif
+.endm
+
+.macro sub_sp space // subtract the constant space from sp
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment" // NOTE(review): the message names sub_sp_align although this macro is sub_sp
+.elseif \space > 4096
+ sub x16, sp, #4096 // step down by 4096 bytes first
+ ldr xzr, [x16] // read from that address (result discarded) before moving sp past it
+ sub sp, x16, #(\space - 4096)
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096 // split into a multiple-of-4096 part and a remainder; presumably so each fits a sub immediate - TODO confirm
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl // transpose an 8x8 byte matrix (rows in the low halves of r0-r7) and widen each output row to .8h with the xtl instruction given
+ // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
+ zip1 \r0\().16b, \r0\().16b, \r1\().16b
+ // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
+ zip1 \r2\().16b, \r2\().16b, \r3\().16b
+ // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
+ zip1 \r4\().16b, \r4\().16b, \r5\().16b
+ // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
+ zip1 \r6\().16b, \r6\().16b, \r7\().16b
+
+ // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
+ trn1 \r1\().8h, \r0\().8h, \r2\().8h
+ // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
+ trn2 \r3\().8h, \r0\().8h, \r2\().8h
+ // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
+ trn1 \r5\().8h, \r4\().8h, \r6\().8h
+ // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
+ trn2 \r7\().8h, \r4\().8h, \r6\().8h
+
+ // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
+ trn1 \r0\().4s, \r1\().4s, \r5\().4s
+ // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
+ trn2 \r2\().4s, \r1\().4s, \r5\().4s
+ // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
+ trn1 \r1\().4s, \r3\().4s, \r7\().4s
+ // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
+ trn2 \r3\().4s, \r3\().4s, \r7\().4s
+
+ \xtl\()2 \r4\().8h, \r0\().16b // high half of r0 is output row 4
+ \xtl \r0\().8h, \r0\().8b // low half of r0 is output row 0
+ \xtl\()2 \r6\().8h, \r2\().16b // row 6
+ \xtl \r2\().8h, \r2\().8b // row 2
+ \xtl\()2 \r5\().8h, \r1\().16b // row 5
+ \xtl \r1\().8h, \r1\().8b // row 1
+ \xtl\()2 \r7\().8h, \r3\().16b // row 7
+ \xtl \r3\().8h, \r3\().8b // row 3
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 // transpose an 8x8 matrix of 16-bit elements held in r0-r7; t8 and t9 are clobbered as scratch
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h // stage 1: interleave 16-bit elements of adjacent row pairs
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s // stage 2: interleave 32-bit groups
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s
+
+ trn1 \r0\().2d, \r3\().2d, \r4\().2d // stage 3: combine 64-bit halves into the final rows
+ trn2 \r4\().2d, \r3\().2d, \r4\().2d
+ trn1 \r1\().2d, \r5\().2d, \r6\().2d
+ trn2 \r5\().2d, \r5\().2d, \r6\().2d
+ trn2 \r6\().2d, \t8\().2d, \r2\().2d // r6 is written before r2 because both read the old r2
+ trn1 \r2\().2d, \t8\().2d, \r2\().2d
+ trn1 \r3\().2d, \t9\().2d, \r7\().2d
+ trn2 \r7\().2d, \t9\().2d, \r7\().2d
+.endm
+
+.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 // transpose the two 8x8 byte matrices formed by the low and the high 64-bit halves of r0-r7; t8 and t9 are clobbered as scratch
+ trn1 \t8\().16b, \r0\().16b, \r1\().16b // stage 1: interleave bytes of adjacent row pairs
+ trn2 \t9\().16b, \r0\().16b, \r1\().16b
+ trn1 \r1\().16b, \r2\().16b, \r3\().16b
+ trn2 \r3\().16b, \r2\().16b, \r3\().16b
+ trn1 \r0\().16b, \r4\().16b, \r5\().16b
+ trn2 \r5\().16b, \r4\().16b, \r5\().16b
+ trn1 \r2\().16b, \r6\().16b, \r7\().16b
+ trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+ trn1 \r4\().8h, \r0\().8h, \r2\().8h // stage 2: interleave 16-bit groups
+ trn2 \r2\().8h, \r0\().8h, \r2\().8h
+ trn1 \r6\().8h, \r5\().8h, \r7\().8h
+ trn2 \r7\().8h, \r5\().8h, \r7\().8h
+ trn1 \r5\().8h, \t9\().8h, \r3\().8h
+ trn2 \t9\().8h, \t9\().8h, \r3\().8h
+ trn1 \r3\().8h, \t8\().8h, \r1\().8h
+ trn2 \t8\().8h, \t8\().8h, \r1\().8h
+
+ trn1 \r0\().4s, \r3\().4s, \r4\().4s // stage 3: interleave 32-bit groups; rN ends up as column N in the low half and column N+8 in the high half
+ trn2 \r4\().4s, \r3\().4s, \r4\().4s
+ trn1 \r1\().4s, \r5\().4s, \r6\().4s
+ trn2 \r5\().4s, \r5\().4s, \r6\().4s
+ trn2 \r6\().4s, \t8\().4s, \r2\().4s // r6 is written before r2 because both read the old r2
+ trn1 \r2\().4s, \t8\().4s, \r2\().4s
+ trn1 \r3\().4s, \t9\().4s, \r7\().4s
+ trn2 \r7\().4s, \t9\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7 // transpose each 4x4 byte block formed by one 32-bit lane of r0-r3; t4-t7 are clobbered as scratch
+ trn1 \t4\().16b, \r0\().16b, \r1\().16b // stage 1: interleave bytes of the row pairs
+ trn2 \t5\().16b, \r0\().16b, \r1\().16b
+ trn1 \t6\().16b, \r2\().16b, \r3\().16b
+ trn2 \t7\().16b, \r2\().16b, \r3\().16b
+
+ trn1 \r0\().8h, \t4\().8h, \t6\().8h // stage 2: interleave 16-bit groups; rN holds columns N, N+4, N+8 and N+12
+ trn2 \r2\().8h, \t4\().8h, \t6\().8h
+ trn1 \r1\().8h, \t5\().8h, \t7\().8h
+ trn2 \r3\().8h, \t5\().8h, \t7\().8h
+.endm
+
+.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 // transpose a 4x4 matrix of 16-bit elements held in the low 64 bits of r0-r3; t4-t7 are clobbered as scratch
+ trn1 \t4\().4h, \r0\().4h, \r1\().4h // stage 1: interleave 16-bit elements of the row pairs
+ trn2 \t5\().4h, \r0\().4h, \r1\().4h
+ trn1 \t6\().4h, \r2\().4h, \r3\().4h
+ trn2 \t7\().4h, \r2\().4h, \r3\().4h
+
+ trn1 \r0\().2s, \t4\().2s, \t6\().2s // stage 2: interleave 32-bit groups into the final rows
+ trn2 \r2\().2s, \t4\().2s, \t6\().2s
+ trn1 \r1\().2s, \t5\().2s, \t7\().2s
+ trn2 \r3\().2s, \t5\().2s, \t7\().2s
+.endm
+
+.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 // transpose a 4x4 matrix of 32-bit elements held in r0-r3; t4-t7 are clobbered as scratch
+ trn1 \t4\().4s, \r0\().4s, \r1\().4s // stage 1: interleave 32-bit elements of the row pairs
+ trn2 \t5\().4s, \r0\().4s, \r1\().4s
+ trn1 \t6\().4s, \r2\().4s, \r3\().4s
+ trn2 \t7\().4s, \r2\().4s, \r3\().4s
+
+ trn1 \r0\().2d, \t4\().2d, \t6\().2d // stage 2: combine 64-bit halves into the final rows
+ trn2 \r2\().2d, \t4\().2d, \t6\().2d
+ trn1 \r1\().2d, \t5\().2d, \t7\().2d
+ trn2 \r3\().2d, \t5\().2d, \t7\().2d
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 // transpose the two 4x4 matrices of 16-bit elements formed by the low and the high halves of r0-r3; t4-t7 are clobbered as scratch
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h // stage 1: interleave 16-bit elements of the row pairs
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s // stage 2: interleave 32-bit groups; rN holds column N in the low half and column N+4 in the high half
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s
+.endm
+
+#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/third_party/dav1d/src/arm/asm-offsets.h b/third_party/dav1d/src/arm/asm-offsets.h
new file mode 100644
index 0000000000..2f3c3caa1f
--- /dev/null
+++ b/third_party/dav1d/src/arm/asm-offsets.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ARM_ASM_OFFSETS_H
+#define ARM_ASM_OFFSETS_H
+
+#define FGD_SEED 0 // NOTE(review): the FGD_* values look like byte offsets of film grain data fields used from assembly - confirm against the C struct layout
+#define FGD_AR_COEFF_LAG 92
+#define FGD_AR_COEFFS_Y 96
+#define FGD_AR_COEFFS_UV 120
+#define FGD_AR_COEFF_SHIFT 176
+#define FGD_GRAIN_SCALE_SHIFT 184
+
+#define FGD_SCALING_SHIFT 88
+#define FGD_UV_MULT 188
+#define FGD_UV_LUMA_MULT 196
+#define FGD_UV_OFFSET 204
+#define FGD_CLIP_TO_RESTRICTED_RANGE 216
+
+#endif /* ARM_ASM_OFFSETS_H */
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
new file mode 100644
index 0000000000..dc50415f1f
--- /dev/null
+++ b/third_party/dav1d/src/arm/asm.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_ASM_S
+#define DAV1D_SRC_ARM_ASM_S
+
+#include "config.h"
+
+#if ARCH_AARCH64
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+
+/* Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ * .global f
+ * f:
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ * .globl return_zero
+ * return_zero:
+ * AARCH64_VALID_CALL_TARGET
+ * mov x0, #0
+ * ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ * .globl with_early_jump
+ * with_early_jump:
+ * AARCH64_VALID_CALL_TARGET
+ * cmp x0, #128
+ * b.lt .Lwith_early_jump_128
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * .Lwith_early_jump_128:
+ * ...
+ * ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ * https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+ */
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc'
+#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
+#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET
+#define AARCH64_VALID_CALL_TARGET
+#define AARCH64_VALID_JUMP_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
+ .pushsection .note.gnu.property, "a"
+ .balign 8
+ .long 4
+ .long 0x10
+ .long 0x5
+ .asciz "GNU"
+ .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+ .long 4
+ .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
+ .long 0
+ .popsection
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+#endif /* ARCH_AARCH64 */
+
+#if ARCH_ARM
+ .syntax unified
+#ifdef __ELF__
+ .arch armv7-a
+ .fpu neon
+ .eabi_attribute 10, 0 // suppress Tag_FP_arch
+ .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
+ .section .note.GNU-stack,"",%progbits // Mark stack as non-executable
+#endif /* __ELF__ */
+
+#ifdef _WIN32
+#define CONFIG_THUMB 1
+#else
+#define CONFIG_THUMB 0
+#endif
+
+#if CONFIG_THUMB
+ .thumb
+#define A @
+#define T
+#else
+#define A
+#define T @
+#endif /* CONFIG_THUMB */
+#endif /* ARCH_ARM */
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, align=2 // open a function called name in .text; export=1 also emits the EXTERN-prefixed global label
+ .macro endfunc // endfunc is defined per function so that it can refer to name, and is purged after use
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+#if HAVE_AS_FUNC
+ .endfunc
+#endif
+ .purgem endfunc
+ .endm
+ .text
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .type EXTERN\name, %function
+ .hidden EXTERN\name // global for linking, but hidden visibility on ELF
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+#if HAVE_AS_FUNC
+ .func EXTERN\name
+#endif
+EXTERN\name:
+ .else
+#ifdef __ELF__
+ .type \name, %function
+#endif
+#if HAVE_AS_FUNC
+ .func \name
+#endif
+ .endif
+\name:
+#if ARCH_AARCH64
+ .if \export
+ AARCH64_VALID_CALL_TARGET // exported functions start with the call-target marker (a BTI c hint when BTI is enabled)
+ .endif
+#endif
+.endm
+
+.macro const name, export=0, align=2 // open a read-only data object called name; export=1 also emits the EXTERN-prefixed global label
+ .macro endconst // endconst is defined per object so that it can refer to name, and is purged after use
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .purgem endconst
+ .endm
+#if defined(_WIN32)
+ .section .rdata
+#elif !defined(__MACH__)
+ .section .rodata
+#else
+ .const_data
+#endif
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name // global for linking, but hidden visibility on ELF
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .endif
+\name:
+.endm
+
+#ifdef __APPLE__
+#define L(x) L ## x // local label: plain L prefix on Apple targets
+#else
+#define L(x) .L ## x // local label: .L prefix elsewhere
+#endif
+
+#define X(x) CONCAT(EXTERN, x) // name x with the EXTERN prefix applied
+
+
+#endif /* DAV1D_SRC_ARM_ASM_S */
diff --git a/third_party/dav1d/src/arm/cdef.h b/third_party/dav1d/src/arm/cdef.h
new file mode 100644
index 0000000000..2e8c8ab6fb
--- /dev/null
+++ b/third_party/dav1d/src/arm/cdef.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
+
+void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+
+// Passing edges to this function, to allow it to switch to a more
+// optimized version for fully edged cases. Using size_t for edges,
+// to avoid ABI differences for passing more than one argument on the stack.
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+
+#define DEFINE_FILTER(w, h, tmp_stride) /* defines cdef_filter_<w>x<h>_neon: pad into a temporary buffer, then filter from it */ \
+static void \
+cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, const int sec_strength, \
+ const int dir, const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); /* 12 rows of tmp_stride elements plus 8 */ \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; /* tmp starts 2 rows plus 8 elements into tmp_buf */ \
+ BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \
+ left, top, bottom, h, edges); \
+ BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
+ sec_strength, dir, damping, h, edges \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
+
+static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) {
+    /* Install the NEON CDEF routines only when the CPU flags report NEON. */
+    const unsigned cpu_flags = dav1d_get_cpu_flags();
+    if (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON) {
+        c->dir = BF(dav1d_cdef_find_dir, neon);
+        c->fb[0] = cdef_filter_8x8_neon;
+        c->fb[1] = cdef_filter_4x8_neon;
+        c->fb[2] = cdef_filter_4x4_neon;
+    }
+}
diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c
new file mode 100644
index 0000000000..b7a0d3adbc
--- /dev/null
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/arm/cpu.h"
+
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+// NEON is always available; runtime tests are not needed.
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#include <sys/auxv.h>
+
+#ifndef HWCAP_ARM_NEON
+#define HWCAP_ARM_NEON (1 << 12)
+#endif
+#define NEON_HWCAP HWCAP_ARM_NEON
+
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+#include <sys/auxv.h>
+
+#define NEON_HWCAP HWCAP_NEON
+
+#elif defined(__ANDROID__)
+#include <stdio.h>
+#include <string.h>
+
+// Searches /proc/cpuinfo for the substring flag.
+//
+// Returns 1 as soon as a chunk read from the file contains flag, and 0 if
+// the file cannot be opened, the end of the file is reached without a match,
+// or seeking back inside an over-long line fails.
+static unsigned parse_proc_cpuinfo(const char *flag) {
+    FILE *file = fopen("/proc/cpuinfo", "r");
+    if (!file)
+        return 0;
+
+    const size_t flag_len = strlen(flag);
+    unsigned found = 0;
+    char line_buffer[120];
+    const char *line;
+
+    while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
+        if (strstr(line, flag)) {
+            found = 1;
+            break;
+        }
+        // if line is incomplete seek back to avoid splitting the search
+        // string into two buffers
+        if (!strchr(line, '\n') && strlen(line) > flag_len) {
+            // use fseek since the 64 bit fseeko is only available since
+            // Android API level 24 and meson defines _FILE_OFFSET_BITS=64
+            // by default.
+            // fseek() takes a signed long offset: convert the length to long
+            // before negating it. Negating the size_t directly wraps around
+            // to a huge unsigned value whose conversion to long is
+            // implementation-defined.
+            if (fseek(file, -(long)flag_len, SEEK_CUR))
+                break;
+        }
+    }
+
+    fclose(file);
+
+    return found;
+}
+#endif
+
+// Returns the DAV1D_ARM_CPU_FLAG_* bits for the running CPU.
+//
+// Exactly one of the preprocessor branches below is compiled in:
+// - targets where NEON is always available report it unconditionally;
+// - otherwise the HWCAP word of the auxiliary vector is queried through
+//   getauxval() or elf_aux_info(), whichever the build detected;
+// - on Android without either of those, /proc/cpuinfo is searched for "neon";
+// - if none of the above applies, no flag is reported.
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+    return DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+    const unsigned long hw_cap = getauxval(AT_HWCAP);
+    if (hw_cap & NEON_HWCAP)
+        return DAV1D_ARM_CPU_FLAG_NEON;
+    return 0;
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+    // hw_cap stays 0 if elf_aux_info() does not fill it in
+    unsigned long hw_cap = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+    if (hw_cap & NEON_HWCAP)
+        return DAV1D_ARM_CPU_FLAG_NEON;
+    return 0;
+#elif defined(__ANDROID__)
+    if (parse_proc_cpuinfo("neon"))
+        return DAV1D_ARM_CPU_FLAG_NEON;
+    return 0;
+#else
+    return 0;
+#endif
+}
diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h
new file mode 100644
index 0000000000..8c10a1b6b0
--- /dev/null
+++ b/third_party/dav1d/src/arm/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_CPU_H
+#define DAV1D_SRC_ARM_CPU_H
+
+// Bit flags for ARM CPU features. Each enumerator occupies its own bit so
+// that values can be OR-ed into, and tested against, an unsigned mask.
+enum CpuFlags {
+ DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+};
+
+// Returns the mask of CpuFlags bits detected for the current CPU.
+unsigned dav1d_get_cpu_flags_arm(void);
+
+#endif /* DAV1D_SRC_ARM_CPU_H */
diff --git a/third_party/dav1d/src/arm/filmgrain.h b/third_party/dav1d/src/arm/filmgrain.h
new file mode 100644
index 0000000000..9f51b0310f
--- /dev/null
+++ b/third_party/dav1d/src/arm/filmgrain.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+#include "asm-offsets.h"
+
+// Each CHECK_OFFSET line pairs a Dav1dFilmGrainData field with an FGD_*
+// constant. Presumably this is a compile-time check that the field offsets
+// used by the assembly (asm-offsets.h) match the C struct layout - confirm
+// against the macro definition.
+CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT);
+
+CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET);
+CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE);
+
+// NEON luma grain generator; installed as generate_grain_y by
+// film_grain_dsp_init_arm().
+void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX);
+
+// GEN_GRAIN_UV(suff) declares the NEON chroma grain generator
+// dav1d_generate_grain_uv_<suff>; one is declared per chroma layout suffix.
+#define GEN_GRAIN_UV(suff) \
+void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, \
+ const intptr_t uv \
+ HIGHBD_DECL_SUFFIX)
+
+GEN_GRAIN_UV(420);
+GEN_GRAIN_UV(422);
+GEN_GRAIN_UV(444);
+
+// NEON kernel that applies luma grain to a single block; it is called once
+// per block by fgy_32x32xn_neon().
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// same layout of parameters on the stack across platforms.
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
+ const pixel *const src,
+ const ptrdiff_t stride,
+ const uint8_t scaling[SCALING_SIZE],
+ const int scaling_shift,
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[][2],
+ const int h, const ptrdiff_t clip,
+ const ptrdiff_t type
+ HIGHBD_DECL_SUFFIX);
+
+// Applies luma film grain to one row of blocks by calling the NEON
+// dav1d_fgy_32x32 kernel once per FG_BLOCK_SIZE-wide column step.
+//
+// For every block a fresh random offset is drawn per seed row and handed to
+// the kernel together with a type bitmask: bit 0 = overlap y, bit 1 = overlap x.
+static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
+                             const ptrdiff_t stride,
+                             const Dav1dFilmGrainData *const data, const size_t pw,
+                             const uint8_t scaling[SCALING_SIZE],
+                             const entry grain_lut[][GRAIN_WIDTH],
+                             const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+    // One seed row normally; a second one (for the previous row) when
+    // overlap is enabled and this is not the first row.
+    const int n_rows = (data->overlap_flag && row_num > 0) ? 2 : 1;
+
+    // seed[0]: current row, seed[1]: previous row
+    unsigned seed[2];
+    for (int r = 0; r < n_rows; r++) {
+        const int row = row_num - r;
+        seed[r] = data->seed;
+        seed[r] ^= ((row * 37 + 178) & 0xFF) << 8;
+        seed[r] ^= (row * 173 + 105) & 0xFF;
+    }
+
+    // offsets[0] = current column, offsets[1] = previous column;
+    // second index = seed row
+    int offsets[2][2];
+
+    unsigned bx = 0;
+    while (bx < pw) {
+        const int overlap_x = data->overlap_flag && bx;
+
+        // keep the previous column's offsets before drawing new ones
+        if (overlap_x) {
+            for (int r = 0; r < n_rows; r++)
+                offsets[1][r] = offsets[0][r];
+        }
+
+        for (int r = 0; r < n_rows; r++)
+            offsets[0][r] = get_random_number(8, &seed[r]);
+
+        int type = 0;
+        if (data->overlap_flag && row_num)
+            type |= 1; /* overlap y */
+        if (overlap_x)
+            type |= 2; /* overlap x */
+
+        BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
+                                  scaling, data->scaling_shift,
+                                  grain_lut, offsets, bh,
+                                  data->clip_to_restricted_range, type
+                                  HIGHBD_TAIL_SUFFIX);
+
+        bx += FG_BLOCK_SIZE;
+    }
+}
+
+// FGUV(nm, sx, sy) does two things for the chroma layout suffix nm:
+// - declares the NEON kernel dav1d_fguv_32x32_<nm>;
+// - defines fguv_32x32xn_<nm>_neon(), which walks one row of chroma blocks
+// and calls that kernel once per block.
+// sx is used as a shift: the column step is FG_BLOCK_SIZE >> sx and the luma
+// pointer is advanced by bx << sx. sy is not referenced in the macro body.
+// The type bitmask handed to the kernel is: 1 = overlap y, 2 = overlap x,
+// 4 = data->chroma_scaling_from_luma is set.
+//
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// parameters on the stack with the same layout across platforms.
+#define FGUV(nm, sx, sy) \
+void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
+ const pixel *const src, \
+ const ptrdiff_t stride, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const Dav1dFilmGrainData *const data, \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, \
+ const int offsets[][2], \
+ const ptrdiff_t h, const ptrdiff_t uv, \
+ const ptrdiff_t is_id, \
+ const ptrdiff_t type \
+ HIGHBD_DECL_SUFFIX); \
+static void \
+fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
+ const size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], const int bh, \
+ const int row_num, const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, const int uv, const int is_id \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ const int rows = 1 + (data->overlap_flag && row_num > 0); \
+ \
+ /* seed[0] contains the current row, seed[1] contains the previous */ \
+ unsigned seed[2]; \
+ for (int i = 0; i < rows; i++) { \
+ seed[i] = data->seed; \
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \
+ } \
+ \
+ int offsets[2 /* col offset */][2 /* row offset */]; \
+ \
+ /* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \
+ for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \
+ if (data->overlap_flag && bx) { \
+ /* shift previous offsets left */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[1][i] = offsets[0][i]; \
+ } \
+ \
+ /* update current offsets */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[0][i] = get_random_number(8, &seed[i]); \
+ \
+ int type = 0; \
+ if (data->overlap_flag && row_num) \
+ type |= 1; /* overlap y */ \
+ if (data->overlap_flag && bx) \
+ type |= 2; /* overlap x */ \
+ if (data->chroma_scaling_from_luma) \
+ type |= 4; \
+ \
+ BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \
+ scaling, data, grain_lut, \
+ luma_row + (bx << sx), luma_stride, \
+ offsets, bh, uv, is_id, type \
+ HIGHBD_TAIL_SUFFIX); \
+ } \
+}
+
+// One kernel declaration plus row wrapper per chroma layout.
+FGUV(420, 1, 1);
+FGUV(422, 1, 0);
+FGUV(444, 0, 0);
+
+// Installs the NEON film grain routines into c: the grain generators (luma
+// plus one per chroma layout) and the row-wise grain application wrappers.
+// c is left untouched when the CPU flags do not report NEON.
+static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) {
+    const unsigned cpu_flags = dav1d_get_cpu_flags();
+
+    if (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON) {
+        // grain generation
+        c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
+        c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] =
+            BF(dav1d_generate_grain_uv_420, neon);
+        c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] =
+            BF(dav1d_generate_grain_uv_422, neon);
+        c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] =
+            BF(dav1d_generate_grain_uv_444, neon);
+
+        // grain application, one row of blocks per call
+        c->fgy_32x32xn = fgy_32x32xn_neon;
+        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
+        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
+        c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
+    }
+}
diff --git a/third_party/dav1d/src/arm/ipred.h b/third_party/dav1d/src/arm/ipred.h
new file mode 100644
index 0000000000..9c2aae748d
--- /dev/null
+++ b/third_party/dav1d/src/arm/ipred.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+// Declarations of the NEON intra prediction entry points that
+// intra_pred_dsp_init_arm() installs; the decl_*_fn macros come from
+// src/ipred.h.
+
+// Angular/directional-table predictors (c->intra_pred[]).
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+// CfL predictors (c->cfl_pred[]).
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+// CfL AC functions, one per chroma layout (c->cfl_ac[]).
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
+
+// Palette predictor (c->pal_pred).
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+#if ARCH_AARCH64
+// NEON helpers used by the Z1/Z3 wrappers below (edge preparation, constant
+// fill, and the actual prediction fill loops). Only declared here.
+void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
+ const pixel *const in,
+ const int end HIGHBD_DECL_SUFFIX);
+void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
+ const pixel *const in,
+ const int end, const int strength);
+void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
+ const int n);
+void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top, const int width,
+ const int height, const int dx,
+ const int max_base_x);
+void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top, const int width,
+ const int height, const int dx,
+ const int max_base_x);
+
+// Z1 directional prediction wrapper: prepares the top edge in a local
+// buffer, pads it, and hands it to one of the NEON fill routines.
+//
+// angle is a packed value: bits 0-8 are the angle itself, bit 9 is is_sm and
+// the bits from 10 upwards enable the intra edge filter.
+// The top edge is prepared in one of three ways:
+// - upsampled (dx is doubled, and fill2 is used instead of fill1);
+// - filtered with a non-zero filter strength;
+// - copied unchanged from topleft_in[1] onwards.
+// max_width and max_height are not used by this function.
+static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ int dx = dav1d_dr_intra_derivative[angle >> 1];
+ pixel top_out[64 + 64 + (64+15)*2 + 16];
+ int max_base_x;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, 90 - angle, is_sm) : 0;
+ if (upsample_above) {
+ BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height,
+ topleft_in,
+ width + imin(width, height)
+ HIGHBD_TAIL_SUFFIX);
+ max_base_x = 2 * (width + height) - 2;
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+ if (filter_strength) {
+ BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height,
+ topleft_in,
+ width + imin(width, height),
+ filter_strength);
+ max_base_x = width + height - 1;
+ } else {
+ max_base_x = width + imin(width, height) - 1;
+ memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel));
+ }
+ }
+ // base_inc is 2 when the edge was upsampled, 1 otherwise
+ const int base_inc = 1 + upsample_above;
+ int pad_pixels = width + 15; // max(dx >> 6) == 15
+ // replicate the last valid edge pixel into the entries after max_base_x
+ BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
+ top_out[max_base_x], pad_pixels * base_inc);
+ if (upsample_above)
+ BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
+ dx, max_base_x);
+ else
+ BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height,
+ dx, max_base_x);
+}
+
+// NEON helper used below to produce the flipped left edge from topleft_in.
+// NOTE(review): presumably it copies the n pixels preceding src in reverse
+// order - confirm against the assembly.
+void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src,
+ const int n);
+
+void BF(dav1d_ipred_z2_upsample_edge, neon)(pixel *out, const int sz,
+ const pixel *const in
+ HIGHBD_DECL_SUFFIX);
+
+// Z2 fill routines; ipred_z2_neon() selects fill1 when neither edge is
+// upsampled, fill2 when the top edge is, and fill3 when the left edge is.
+void BF(dav1d_ipred_z2_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top,
+ const pixel *const left,
+ const int width, const int height,
+ const int dx, const int dy);
+void BF(dav1d_ipred_z2_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top,
+ const pixel *const left,
+ const int width, const int height,
+ const int dx, const int dy);
+void BF(dav1d_ipred_z2_fill3, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top,
+ const pixel *const left,
+ const int width, const int height,
+ const int dx, const int dy);
+
+// Z2 directional prediction wrapper (asserts 90 < angle < 180): prepares
+// both the top and the left edge in a local buffer and hands them to one of
+// the NEON fill routines.
+//
+// angle is a packed value: bits 0-8 are the angle itself, bit 9 is is_sm and
+// the bits from 10 upwards enable the intra edge filter.
+// Each edge is independently either upsampled (its step dx/dy is doubled),
+// filtered, or copied; the function asserts that both edges are never
+// upsampled at the same time.
+static void ipred_z2_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 90 && angle < 180);
+ int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+ int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, 180 - angle, is_sm) : 0;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 90, is_sm) : 0;
+ // buf is split into three (64+1)-pixel segments: flipped, top, left
+ pixel buf[3*(64+1)];
+ pixel *left = &buf[2*(64+1)];
+ // The asm can underread below the start of top[] and left[]; to avoid
+ // surprising behaviour, make sure this is within the allocated stack space.
+ pixel *top = &buf[1*(64+1)];
+ pixel *flipped = &buf[0*(64+1)];
+
+ // --- top edge ---
+ if (upsample_above) {
+ BF(dav1d_ipred_z2_upsample_edge, neon)(top, width, topleft_in
+ HIGHBD_TAIL_SUFFIX);
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 90, is_sm) : 0;
+
+ if (filter_strength) {
+ BF(dav1d_ipred_z1_filter_edge, neon)(&top[1], imin(max_width, width),
+ topleft_in, width,
+ filter_strength);
+ // pixels beyond max_width are copied unfiltered
+ if (max_width < width)
+ memcpy(&top[1 + max_width], &topleft_in[1 + max_width],
+ (width - max_width) * sizeof(pixel));
+ } else {
+ pixel_copy(&top[1], &topleft_in[1], width);
+ }
+ }
+ // --- left edge (works on a reversed copy of the left pixels) ---
+ if (upsample_left) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height);
+ BF(dav1d_ipred_z2_upsample_edge, neon)(left, height, flipped
+ HIGHBD_TAIL_SUFFIX);
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 180 - angle, is_sm) : 0;
+
+ if (filter_strength) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height);
+ BF(dav1d_ipred_z1_filter_edge, neon)(&left[1], imin(max_height, height),
+ flipped, height,
+ filter_strength);
+ // pixels beyond max_height are copied unfiltered
+ if (max_height < height)
+ memcpy(&left[1 + max_height], &flipped[1 + max_height],
+ (height - max_height) * sizeof(pixel));
+ } else {
+ BF(dav1d_ipred_reverse, neon)(&left[1], &topleft_in[0],
+ height);
+ }
+ }
+ // both edges start with the top-left corner pixel
+ top[0] = left[0] = *topleft_in;
+
+ assert(!(upsample_above && upsample_left));
+ if (!upsample_above && !upsample_left) {
+ BF(dav1d_ipred_z2_fill1, neon)(dst, stride, top, left, width, height,
+ dx, dy);
+ } else if (upsample_above) {
+ BF(dav1d_ipred_z2_fill2, neon)(dst, stride, top, left, width, height,
+ dx, dy);
+ } else /*if (upsample_left)*/ {
+ BF(dav1d_ipred_z2_fill3, neon)(dst, stride, top, left, width, height,
+ dx, dy);
+ }
+}
+
+// Z3 fill routines; ipred_z3_neon() uses fill2 when the left edge was
+// upsampled and fill1 otherwise.
+void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const left, const int width,
+ const int height, const int dy,
+ const int max_base_y);
+void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const left, const int width,
+ const int height, const int dy,
+ const int max_base_y);
+
+// Z3 directional prediction wrapper (asserts angle > 180): prepares the left
+// edge in a local buffer, pads it, and hands it to one of the NEON fill
+// routines.
+//
+// angle is a packed value: bits 0-8 are the angle itself, bit 9 is is_sm and
+// the bits from 10 upwards enable the intra edge filter.
+// The left edge is taken from a reversed copy of the pixels at topleft_in
+// and is either upsampled (dy is doubled), filtered, or used as is.
+// max_width and max_height are not used by this function.
+static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 180);
+ int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+ pixel flipped[64 + 64 + 16];
+ pixel left_out[64 + 64 + (64+15)*2];
+ int max_base_y;
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 180, is_sm) : 0;
+ if (upsample_left) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height + imax(width, height));
+ BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height,
+ flipped,
+ height + imin(width, height)
+ HIGHBD_TAIL_SUFFIX);
+ max_base_y = 2 * (width + height) - 2;
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+ if (filter_strength) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height + imax(width, height));
+ BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height,
+ flipped,
+ height + imin(width, height),
+ filter_strength);
+ max_base_y = width + height - 1;
+ } else {
+ BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0],
+ height + imin(width, height));
+ max_base_y = height + imin(width, height) - 1;
+ }
+ }
+ // base_inc is 2 when the edge was upsampled, 1 otherwise
+ const int base_inc = 1 + upsample_left;
+ // The tbx based implementation needs left[] to have 64 bytes initialized,
+ // the other implementation can read height + max(dy >> 6) past the end.
+ int pad_pixels = imax(64 - max_base_y - 1, height + 15);
+
+ // replicate the last valid edge pixel into the entries after max_base_y
+ BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
+ left_out[max_base_y], pad_pixels * base_inc);
+ if (upsample_left)
+ BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
+ dy, max_base_y);
+ else
+ BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height,
+ dy, max_base_y);
+}
+#endif
+
+// Installs the NEON intra prediction routines into c: the per-mode
+// predictors, the CfL predictors, the CfL AC functions and the palette
+// predictor. The Z1/Z2/Z3 directional wrappers are only installed on
+// AArch64 builds. c is left untouched when the CPU flags do not report NEON.
+static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
+    const unsigned cpu_flags = dav1d_get_cpu_flags();
+
+    if (cpu_flags & DAV1D_ARM_CPU_FLAG_NEON) {
+        // per-mode predictors
+        c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
+        c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
+        c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
+        c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon);
+        c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon);
+        c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon);
+        c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon);
+        c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
+        c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+        c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+        c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon);
+#if ARCH_AARCH64
+        // directional modes go through the C wrappers defined in this header
+        c->intra_pred[Z1_PRED] = ipred_z1_neon;
+        c->intra_pred[Z2_PRED] = ipred_z2_neon;
+        c->intra_pred[Z3_PRED] = ipred_z3_neon;
+#endif
+
+        // chroma-from-luma predictors
+        c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon);
+        c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon);
+        c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon);
+        c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon);
+
+        // CfL AC extraction, one entry per chroma layout
+        c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+        c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+        c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+
+        c->pal_pred = BF(dav1d_pal_pred, neon);
+    }
+}
diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h
new file mode 100644
index 0000000000..2ecd086b3b
--- /dev/null
+++ b/third_party/dav1d/src/arm/itx.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+// The decl_itxN_fns(w, h, opt) macros declare N inverse-transform-add
+// functions for a w x h block, each set building on the previous one:
+// 2: dct_dct and identity_identity
+// 12: the above plus ten dct/adst/flipadst combinations (incl.
+// dct_identity and identity_dct)
+// 16: the above plus the four adst/flipadst-with-identity combinations
+// 17: the above plus wht_wht
+// The macros deliberately end without a semicolon; the user supplies it.
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+// NEON declarations per block size; these mirror the assignments made in
+// itx_dsp_init_arm().
+decl_itx17_fns( 4, 4, neon);
+decl_itx16_fns( 4, 8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8, 4, neon);
+decl_itx16_fns( 8, 8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16, 4, neon);
+decl_itx16_fns(16, 8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32, 8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+// Sizes with a 64-pixel dimension only get a dct_dct function.
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+// Installs the NEON inverse transform functions into c->itxfm_add[][].
+//
+// Nothing is installed when the CPU flags do not report NEON, or when
+// BITDEPTH is 16 and bpc is anything other than 10.
+//
+// The assign_itxN_fn helper macros mirror the decl_itxN_fns declaration
+// macros: each assigns N table entries for one block size. pfx is pasted in
+// front of TX_ to form the first table index, so it is either empty
+// (TX_<w>X<h>) or R (RTX_<w>X<h>).
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+// Note that the two transform names appear in opposite order in the function
+// name and in the enum name (e.g. dct_adst <-> ADST_DCT, and
+// dct_identity <-> H_DCT).
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ // in 16-bit builds only bpc == 10 gets the NEON functions
+ if (BITDEPTH == 16 && bpc != 10) return;
+
+ // square sizes use an empty prefix, all other sizes the R prefix
+ assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn(R, 4, 8, neon);
+ assign_itx16_fn(R, 4, 16, neon);
+ assign_itx16_fn(R, 8, 4, neon);
+ assign_itx16_fn( , 8, 8, neon);
+ assign_itx16_fn(R, 8, 16, neon);
+ assign_itx2_fn (R, 8, 32, neon);
+ assign_itx16_fn(R, 16, 4, neon);
+ assign_itx16_fn(R, 16, 8, neon);
+ assign_itx12_fn( , 16, 16, neon);
+ assign_itx2_fn (R, 16, 32, neon);
+ assign_itx1_fn (R, 16, 64, neon);
+ assign_itx2_fn (R, 32, 8, neon);
+ assign_itx2_fn (R, 32, 16, neon);
+ assign_itx2_fn ( , 32, 32, neon);
+ assign_itx1_fn (R, 32, 64, neon);
+ assign_itx1_fn (R, 64, 16, neon);
+ assign_itx1_fn (R, 64, 32, neon);
+ assign_itx1_fn ( , 64, 64, neon);
+}
diff --git a/third_party/dav1d/src/arm/loopfilter.h b/third_party/dav1d/src/arm/loopfilter.h
new file mode 100644
index 0000000000..9ac08d94d2
--- /dev/null
+++ b/third_party/dav1d/src/arm/loopfilter.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+/* Installs the NEON loop-filter entry points into c->loop_filter_sb.
+ * Returns without touching c when dav1d_get_cpu_flags() does not report
+ * DAV1D_ARM_CPU_FLAG_NEON. First index: [0] receives the sb_y functions and
+ * [1] the sb_uv ones; second index: [0] receives the lpf_h variant and [1]
+ * the lpf_v variant. */
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+}
diff --git a/third_party/dav1d/src/arm/looprestoration.h b/third_party/dav1d/src/arm/looprestoration.h
new file mode 100644
index 0000000000..1ac6d5fb5e
--- /dev/null
+++ b/third_party/dav1d/src/arm/looprestoration.h
@@ -0,0 +1,1113 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if ARCH_AARCH64
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+#else
+
+// The 8bpc version calculates things slightly differently than the reference
+// C version. The 8bpc version calculates roughly this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 1 << (bitdepth + 6 - round_bits_h);
+// Compared to the reference C version, this is the output of the first pass
+// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
+// with round_offset precompensated.
+// The 16bpc version calculates things pretty much the same way as the
+// reference C version, but with the end result subtracted by
+// 1 << (bitdepth + 6 - round_bits_h).
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[8], intptr_t w,
+ int h, enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+// This calculates things slightly differently than the reference C version.
+// This version calculates roughly this:
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[8], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+/* Two-pass Wiener filter built from the separate horizontal and vertical
+ * NEON routines; this definition sits in the #else branch of
+ * "#if ARCH_AARCH64".
+ * Pass 1 runs dav1d_wiener_filter_h with filter[0] into the stack buffer mid:
+ * the h rows read from dst are stored from mid row 2 onwards and, when the
+ * matching edge flag is set, 2 rows read from lpf are stored at mid rows 0-1
+ * (LR_HAVE_TOP) and 2 rows read from lpf + 6 rows are stored from mid row
+ * h + 2 (LR_HAVE_BOTTOM).
+ * Pass 2 runs dav1d_wiener_filter_v with filter[1], reading from mid row 2
+ * and writing back to dst. */
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+ ALIGN_STK_16(int16_t, mid, 68 * 384,);
+ int mid_stride = (w + 7) & ~7; // w rounded up to a multiple of 8, in elements
+
+ // Horizontal filter
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+ filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+ filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(stride),
+ stride, filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+
+ // Vertical filter
+ BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+ w, h, filter[1], edges,
+ mid_stride * sizeof(*mid) // the mid stride is passed in bytes here
+ HIGHBD_TAIL_SUFFIX);
+}
+#endif
+
+#if ARCH_ARM
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 3x3 box (radius=1)
+ *
+ * Runs the box3 stages in order and leaves the result in tmp:
+ * box3_h over the h rows of src; when LR_HAVE_TOP is set, box3_h over 2 rows
+ * of lpf stored 2 rows before sumsq/sum; when LR_HAVE_BOTTOM is set, box3_h
+ * over 2 rows starting at lpf + 6 rows stored at row h; then box3_v,
+ * calc_ab1 (with strength and BITDEPTH_MAX) and finish_filter1.
+ * Rows of the sumsq/sum scratch buffers are (384 + 16) elements apart. */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; // a aliases sumsq; both start 2 rows + 8 elements into sumsq_mem
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; // b aliases sum; both start 2 rows + 16 elements into sum_mem
+
+ BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 5x5 box (radius=2)
+ *
+ * Runs the box5 stages in order and leaves the result in tmp:
+ * box5_h over the h rows of src; when LR_HAVE_TOP is set, box5_h over 2 rows
+ * of lpf stored 2 rows before sumsq/sum; when LR_HAVE_BOTTOM is set, box5_h
+ * over 2 rows starting at lpf + 6 rows stored at row h; then box5_v,
+ * calc_ab2 (with strength and BITDEPTH_MAX) and finish_filter2.
+ * Rows of the sumsq/sum scratch buffers are (384 + 16) elements apart. */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; // a aliases sumsq; both start 2 rows + 8 elements into sumsq_mem
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; // b aliases sum; both start 2 rows + 16 elements into sum_mem
+
+ BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int w, const int h,
+ const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+/* SGR entry point using only the 5x5 box (ARCH_ARM branch).
+ * dav1d_sgr_filter2_neon() with strength params->sgr.s0 fills tmp from dst;
+ * dav1d_sgr_weighted1 then applies weight params->sgr.w0 with dst passed as
+ * both destination and source, so dst is updated in place. */
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+}
+/* SGR entry point using only the 3x3 box (ARCH_ARM branch).
+ * dav1d_sgr_filter1_neon() with strength params->sgr.s1 fills tmp from dst;
+ * dav1d_sgr_weighted1 then applies weight params->sgr.w1 with dst passed as
+ * both destination and source, so dst is updated in place. */
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+}
+/* SGR entry point combining both boxes (ARCH_ARM branch).
+ * The 5x5 pass (dav1d_sgr_filter2_neon, strength params->sgr.s0) fills tmp1
+ * and the 3x3 pass (dav1d_sgr_filter1_neon, strength params->sgr.s1) fills
+ * tmp2, both reading dst. dav1d_sgr_weighted2 then blends them using
+ * wt = { w0, w1 }, with dst passed as both destination and source. */
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
+ BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+ tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
+}
+
+#else
+static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) { // rotates both n-entry pointer arrays left by one position
+ int32_t *tmp32 = sumsq_ptrs[0]; // remember the heads; they are re-inserted at index n - 1 below
+ int16_t *tmp16 = sum_ptrs[0];
+ for (int i = 0; i < n - 1; i++) { // shift entries 1..n-1 down to 0..n-2
+ sumsq_ptrs[i] = sumsq_ptrs[i+1];
+ sum_ptrs[i] = sum_ptrs[i+1];
+ }
+ sumsq_ptrs[n - 1] = tmp32;
+ sum_ptrs[n - 1] = tmp16;
+}
+static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) { // rotates both 5-entry pointer arrays left by two positions
+ int32_t *tmp32[2];
+ int16_t *tmp16[2];
+ for (int i = 0; i < 2; i++) { // save entries 0 and 1
+ tmp32[i] = sumsq_ptrs[i];
+ tmp16[i] = sum_ptrs[i];
+ }
+ for (int i = 0; i < 3; i++) { // move entries 2..4 down to 0..2
+ sumsq_ptrs[i] = sumsq_ptrs[i+2];
+ sum_ptrs[i] = sum_ptrs[i+2];
+ }
+ for (int i = 0; i < 2; i++) { // the saved entries become entries 3 and 4
+ sumsq_ptrs[3 + i] = tmp32[i];
+ sum_ptrs[3 + i] = tmp16[i];
+ }
+}
+/* Rotates the 3-entry A/B pointer arrays left by one position via rotate(). */
+static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) {
+ rotate(A_ptrs, B_ptrs, 3);
+}
+/* Rotates the 2-entry A/B pointer arrays left by one position via rotate(),
+ * which for two entries amounts to swapping them. */
+static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) {
+ rotate(A_ptrs, B_ptrs, 2);
+}
+/* Rotates the 4-entry A/B pointer arrays left by one position via rotate(). */
+static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) {
+ rotate(A_ptrs, B_ptrs, 4);
+}
+
+void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const enum LrEdgeFlags edges);
+void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const enum LrEdgeFlags edges);
+void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
+ int32_t *sumsq5, int16_t *sum5,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const enum LrEdgeFlags edges);
+
+void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *AA, int16_t *BB,
+ const int w, const int s,
+ const int bitdepth_max);
+void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *AA, int16_t *BB,
+ const int w, const int s,
+ const int bitdepth_max);
+
+void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int w1
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int h,
+ const int w1 HIGHBD_DECL_SUFFIX);
+
+void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
+ const ptrdiff_t src_stride,
+ int32_t **A_ptrs,
+ int16_t **B_ptrs,
+ const int w, const int h);
+void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
+ const ptrdiff_t src_stride,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int h);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+/* Calls dav1d_sgr_box3_vert_neon() on the 3-entry sumsq/sum row-pointer
+ * arrays, with sumsq_out/sum_out as its output arguments, then rotates both
+ * pointer arrays left by one so the entry at index 0 moves to index 2. */
+static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *sumsq_out, int16_t *sum_out,
+ const int w, int s, int bitdepth_max) {
+ // box3_v + calc_ab1
+ dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
+ rotate(sumsq, sum, 3);
+}
+/* Calls dav1d_sgr_box5_vert_neon() on the 5-entry sumsq/sum row-pointer
+ * arrays, with sumsq_out/sum_out as its output arguments, then rotates both
+ * pointer arrays left by two (rotate5_x2) so entries 0 and 1 move to
+ * indices 3 and 4. */
+static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *sumsq_out, int16_t *sum_out,
+ const int w, int s, int bitdepth_max) {
+ // box5_v + calc_ab2
+ dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
+ rotate5_x2(sumsq, sum);
+}
+/* One-row step of the 3x3 path: dav1d_sgr_box3_row_h computes the row sums
+ * of src (with left and edges) into sumsq[2]/sum[2], then
+ * sgr_box3_vert_neon() runs the vertical stage with AA/BB as outputs and
+ * rotates the sumsq/sum pointer arrays. */
+static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
+ int32_t *AA, int16_t *BB,
+ const pixel (*left)[4],
+ const pixel *src, const int w,
+ const int s,
+ const enum LrEdgeFlags edges,
+ const int bitdepth_max) {
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
+ sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
+}
+
+/* Calls dav1d_sgr_finish_weighted1 for the row at *dst with weight w1, then
+ * advances *dst by one row (PXSTRIDE(stride)) and rotates the 3-entry A/B
+ * pointer arrays left by one. dst is passed by address so the caller sees
+ * the advance. */
+static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
+ int32_t **A_ptrs, int16_t **B_ptrs, const int w,
+ const int w1 HIGHBD_DECL_SUFFIX) {
+ BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
+ w, w1 HIGHBD_TAIL_SUFFIX);
+ *dst += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+}
+/* Calls dav1d_sgr_finish_weighted2 at *dst with row count h and weight w1,
+ * then advances *dst by two rows and rotates the 2-entry A/B pointer arrays.
+ * The advance is always two rows, whatever h is. */
+static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
+ int32_t **A_ptrs, int16_t **B_ptrs,
+ const int w, const int h, const int w1
+ HIGHBD_DECL_SUFFIX) {
+ BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
+ w, h, w1 HIGHBD_TAIL_SUFFIX);
+ *dst += 2*PXSTRIDE(stride);
+ rotate_ab_2(A_ptrs, B_ptrs);
+}
+/* Output step of the combined 5x5 + 3x3 path.
+ * dav1d_sgr_finish_filter2_2rows fills tmp5 from the A5/B5 rows and
+ * dav1d_sgr_finish_filter1_2rows fills tmp3 from the A3/B3 rows, both reading
+ * the pixels at *dst. dav1d_sgr_weighted2 then blends tmp5 and tmp3 with
+ * wt = { w0, w1 }, with *dst passed as both destination and source.
+ * Afterwards *dst advances by h rows, the 2-entry A5/B5 arrays are rotated by
+ * one and the 4-entry A3/B3 arrays are rotated by one. */
+static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
+ int32_t **A5_ptrs, int16_t **B5_ptrs,
+ int32_t **A3_ptrs, int16_t **B3_ptrs,
+ const int w, const int h,
+ const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
+#define FILTER_OUT_STRIDE 384
+ ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,);
+ ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,);
+
+ BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
+ A5_ptrs, B5_ptrs, w, h);
+ BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
+ A3_ptrs, B3_ptrs, w, h);
+ const int16_t wt[2] = { w0, w1 };
+ BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride,
+ tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
+ *dst += h*PXSTRIDE(stride);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+}
+
+/* SGR 3x3 entry point; this definition sits in the #else branch of
+ * "#if ARCH_ARM". It works one row at a time with two rotating windows:
+ * 3 rows of horizontal box sums (sumsq/sum) and 3 rows of A/B.
+ * Each source row goes through sgr_box3_hv_neon(); the first two rows only
+ * prime the A/B window, and from the third row on every row is followed by
+ * sgr_finish1_neon(), which writes one output row to dst.
+ * Top: with LR_HAVE_TOP two rows read from lpf fill window rows 0 and 1;
+ * without it all three window pointers start out on rows[0].
+ * Bottom: with LR_HAVE_BOTTOM two rows starting at lpf + 6 rows are fed in
+ * after the main loop; otherwise, and when h runs out during priming, the
+ * vert_2 / output_1 / vert_1 labels reuse the previous row pointer instead. */
+static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+#define BUF_STRIDE (384 + 16)
+ ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
+ ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,);
+ int32_t *sumsq_ptrs[3], *sumsq_rows[3];
+ int16_t *sum_ptrs[3], *sum_rows[3];
+ for (int i = 0; i < 3; i++) {
+ sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
+ sum_rows[i] = &sum_buf[i * BUF_STRIDE];
+ }
+
+ ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
+ ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,);
+ int32_t *A_ptrs[3];
+ int16_t *B_ptrs[3];
+ for (int i = 0; i < 3; i++) {
+ A_ptrs[i] = &A_buf[i * BUF_STRIDE];
+ B_ptrs[i] = &B_buf[i * BUF_STRIDE];
+ }
+ const pixel *src = dst; // input rows are read from dst itself; dst only advances inside sgr_finish1_neon()
+ const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); // the bottom rows start 6 rows into lpf
+
+ if (edges & LR_HAVE_TOP) {
+ sumsq_ptrs[0] = sumsq_rows[0];
+ sumsq_ptrs[1] = sumsq_rows[1];
+ sumsq_ptrs[2] = sumsq_rows[2];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[1];
+ sum_ptrs[2] = sum_rows[2];
+
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ NULL, lpf, w, edges);
+ lpf += PXSTRIDE(stride);
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
+ NULL, lpf, w, edges);
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+ } else {
+ sumsq_ptrs[0] = sumsq_rows[0]; // no rows above: all three window slots alias rows[0]
+ sumsq_ptrs[1] = sumsq_rows[0];
+ sumsq_ptrs[2] = sumsq_rows[0];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[0];
+ sum_ptrs[2] = sum_rows[0];
+
+ BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sumsq_ptrs[2] = sumsq_rows[1];
+ sum_ptrs[2] = sum_rows[1];
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+ rotate_ab_3(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ sumsq_ptrs[2] = sumsq_rows[2];
+ sum_ptrs[2] = sum_rows[2];
+ }
+
+ do { // steady state: one new source row in, one output row out
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+ } while (--h > 0);
+
+ if (!(edges & LR_HAVE_BOTTOM))
+ goto vert_2;
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
+ lpf_bottom += PXSTRIDE(stride);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+
+ sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_2: // no new row: slot 2 reuses the slot 1 pointer, one vertical pass, one output row, then falls through to output_1
+ sumsq_ptrs[2] = sumsq_ptrs[1];
+ sum_ptrs[2] = sum_ptrs[1];
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+
+output_1: // same steps once more for the final output row
+ sumsq_ptrs[2] = sumsq_ptrs[1];
+ sum_ptrs[2] = sum_ptrs[1];
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+
+ sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_1: // reached when h ran out after the first source row: one vertical pass without output, then join output_1
+ sumsq_ptrs[2] = sumsq_ptrs[1];
+ sum_ptrs[2] = sum_ptrs[1];
+ sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_3(A_ptrs, B_ptrs);
+ goto output_1;
+}
+/* SGR 5x5 entry point; this definition sits in the #else branch of
+ * "#if ARCH_ARM". It keeps a 5-entry window of horizontal box-sum rows
+ * (sumsq/sum) and a 2-entry window of A/B rows.
+ * Source rows are consumed in pairs: after each pair sgr_box5_vert_neon()
+ * writes one A/B row and rotates the sum window by two. The first such call
+ * only primes the A/B window (rotate_ab_2); later ones are followed by
+ * sgr_finish2_neon(), which writes two output rows (one at output_1).
+ * With LR_HAVE_BOTTOM two rows starting at lpf + 6 rows are fed in after the
+ * main loop. The labels at the end handle running out of rows: "odd" and
+ * "vert_1" pad with one copy of a row pointer, "vert_2" and "output_1"
+ * with two. */
+static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
+ ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,);
+ int32_t *sumsq_ptrs[5], *sumsq_rows[5];
+ int16_t *sum_ptrs[5], *sum_rows[5];
+ for (int i = 0; i < 5; i++) {
+ sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
+ sum_rows[i] = &sum_buf[i * BUF_STRIDE];
+ }
+
+ ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
+ ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,);
+ int32_t *A_ptrs[2];
+ int16_t *B_ptrs[2];
+ for (int i = 0; i < 2; i++) {
+ A_ptrs[i] = &A_buf[i * BUF_STRIDE];
+ B_ptrs[i] = &B_buf[i * BUF_STRIDE];
+ }
+ const pixel *src = dst; // input rows are read from dst itself; dst only advances inside sgr_finish2_neon()
+ const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); // the bottom rows start 6 rows into lpf
+
+ if (edges & LR_HAVE_TOP) {
+ sumsq_ptrs[0] = sumsq_rows[0];
+ sumsq_ptrs[1] = sumsq_rows[0]; // slots 0 and 1 both alias rows[0]
+ sumsq_ptrs[2] = sumsq_rows[1];
+ sumsq_ptrs[3] = sumsq_rows[2];
+ sumsq_ptrs[4] = sumsq_rows[3];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[0];
+ sum_ptrs[2] = sum_rows[1];
+ sum_ptrs[3] = sum_rows[2];
+ sum_ptrs[4] = sum_rows[3];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ NULL, lpf, w, edges);
+ lpf += PXSTRIDE(stride);
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
+ NULL, lpf, w, edges);
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq_ptrs[3] = sumsq_rows[4];
+ sum_ptrs[3] = sum_rows[4];
+ } else {
+ sumsq_ptrs[0] = sumsq_rows[0]; // no rows above: all five window slots alias rows[0]
+ sumsq_ptrs[1] = sumsq_rows[0];
+ sumsq_ptrs[2] = sumsq_rows[0];
+ sumsq_ptrs[3] = sumsq_rows[0];
+ sumsq_ptrs[4] = sumsq_rows[0];
+ sum_ptrs[0] = sum_rows[0];
+ sum_ptrs[1] = sum_rows[0];
+ sum_ptrs[2] = sum_rows[0];
+ sum_ptrs[3] = sum_rows[0];
+ sum_ptrs[4] = sum_rows[0];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sumsq_ptrs[4] = sumsq_rows[1];
+ sum_ptrs[4] = sum_rows[1];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A_ptrs, B_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ sumsq_ptrs[3] = sumsq_rows[2];
+ sumsq_ptrs[4] = sumsq_rows[3];
+ sum_ptrs[3] = sum_rows[2];
+ sum_ptrs[4] = sum_rows[3];
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq_ptrs[3] = sumsq_rows[4];
+ sum_ptrs[3] = sum_rows[4];
+ }
+
+ do { // steady state: two new source rows in (slots 3 and 4), two output rows out
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+ } while (--h > 0);
+
+ if (!(edges & LR_HAVE_BOTTOM))
+ goto vert_2;
+
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
+ NULL, lpf_bottom, w, edges);
+ lpf_bottom += PXSTRIDE(stride);
+ BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
+ NULL, lpf_bottom, w, edges);
+
+output_2: // final vertical pass followed by two output rows
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_2:
+ // Duplicate the last row twice more
+ sumsq_ptrs[3] = sumsq_ptrs[2];
+ sumsq_ptrs[4] = sumsq_ptrs[2];
+ sum_ptrs[3] = sum_ptrs[2];
+ sum_ptrs[4] = sum_ptrs[2];
+ goto output_2;
+
+odd: // reached when h ran out after the first row of a pair
+ // Copy the last row as padding once
+ sumsq_ptrs[4] = sumsq_ptrs[3];
+ sum_ptrs[4] = sum_ptrs[3];
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+
+output_1:
+ // Duplicate the last row twice more
+ sumsq_ptrs[3] = sumsq_ptrs[2];
+ sumsq_ptrs[4] = sumsq_ptrs[2];
+ sum_ptrs[3] = sum_ptrs[2];
+ sum_ptrs[4] = sum_ptrs[2];
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ // Output only one row
+ sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
+ w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_1: // reached when h ran out after the first source row, before any vertical pass
+ // Copy the last row as padding once
+ sumsq_ptrs[4] = sumsq_ptrs[3];
+ sum_ptrs[4] = sum_ptrs[3];
+
+ sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A_ptrs, B_ptrs);
+
+ goto output_1;
+}
+
+static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
+ ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,);
+ int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
+ int16_t *sum5_ptrs[5], *sum5_rows[5];
+ for (int i = 0; i < 5; i++) {
+ sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
+ sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
+ }
+ ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
+ ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,);
+ int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
+ int16_t *sum3_ptrs[3], *sum3_rows[3];
+ for (int i = 0; i < 3; i++) {
+ sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
+ sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
+ }
+
+ ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
+ ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,);
+ int32_t *A5_ptrs[2];
+ int16_t *B5_ptrs[2];
+ for (int i = 0; i < 2; i++) {
+ A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
+ B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
+ }
+ ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
+ ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,);
+ int32_t *A3_ptrs[4];
+ int16_t *B3_ptrs[4];
+ for (int i = 0; i < 4; i++) {
+ A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
+ B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
+ }
+ const pixel *src = dst;
+ const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
+
+ if (edges & LR_HAVE_TOP) {
+ sumsq5_ptrs[0] = sumsq5_rows[0];
+ sumsq5_ptrs[1] = sumsq5_rows[0];
+ sumsq5_ptrs[2] = sumsq5_rows[1];
+ sumsq5_ptrs[3] = sumsq5_rows[2];
+ sumsq5_ptrs[4] = sumsq5_rows[3];
+ sum5_ptrs[0] = sum5_rows[0];
+ sum5_ptrs[1] = sum5_rows[0];
+ sum5_ptrs[2] = sum5_rows[1];
+ sum5_ptrs[3] = sum5_rows[2];
+ sum5_ptrs[4] = sum5_rows[3];
+
+ sumsq3_ptrs[0] = sumsq3_rows[0];
+ sumsq3_ptrs[1] = sumsq3_rows[1];
+ sumsq3_ptrs[2] = sumsq3_rows[2];
+ sum3_ptrs[0] = sum3_rows[0];
+ sum3_ptrs[1] = sum3_rows[1];
+ sum3_ptrs[2] = sum3_rows[2];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
+ sumsq5_rows[0], sum5_rows[0],
+ NULL, lpf, w, edges);
+ lpf += PXSTRIDE(stride);
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
+ sumsq5_rows[1], sum5_rows[1],
+ NULL, lpf, w, edges);
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
+ sumsq5_rows[2], sum5_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_rows[3], sum5_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq5_ptrs[3] = sumsq5_rows[4];
+ sum5_ptrs[3] = sum5_rows[4];
+ } else {
+ sumsq5_ptrs[0] = sumsq5_rows[0];
+ sumsq5_ptrs[1] = sumsq5_rows[0];
+ sumsq5_ptrs[2] = sumsq5_rows[0];
+ sumsq5_ptrs[3] = sumsq5_rows[0];
+ sumsq5_ptrs[4] = sumsq5_rows[0];
+ sum5_ptrs[0] = sum5_rows[0];
+ sum5_ptrs[1] = sum5_rows[0];
+ sum5_ptrs[2] = sum5_rows[0];
+ sum5_ptrs[3] = sum5_rows[0];
+ sum5_ptrs[4] = sum5_rows[0];
+
+ sumsq3_ptrs[0] = sumsq3_rows[0];
+ sumsq3_ptrs[1] = sumsq3_rows[0];
+ sumsq3_ptrs[2] = sumsq3_rows[0];
+ sum3_ptrs[0] = sum3_rows[0];
+ sum3_ptrs[1] = sum3_rows[0];
+ sum3_ptrs[2] = sum3_rows[0];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
+ sumsq5_rows[0], sum5_rows[0],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_1;
+
+ sumsq5_ptrs[4] = sumsq5_rows[1];
+ sum5_ptrs[4] = sum5_rows[1];
+
+ sumsq3_ptrs[2] = sumsq3_rows[1];
+ sum3_ptrs[2] = sum3_rows[1];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
+ sumsq5_rows[1], sum5_rows[1],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ sumsq5_ptrs[3] = sumsq5_rows[2];
+ sumsq5_ptrs[4] = sumsq5_rows[3];
+ sum5_ptrs[3] = sum5_rows[2];
+ sum5_ptrs[4] = sum5_rows[3];
+
+ sumsq3_ptrs[2] = sumsq3_rows[2];
+ sum3_ptrs[2] = sum3_rows[2];
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
+ sumsq5_rows[2], sum5_rows[2],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_rows[3], sum5_rows[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+
+ if (--h <= 0)
+ goto vert_2;
+
+ // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
+ // one of them to point at the previously unused rows[4].
+ sumsq5_ptrs[3] = sumsq5_rows[4];
+ sum5_ptrs[3] = sum5_rows[4];
+ }
+
+ do {
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[3], sum5_ptrs[3],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ if (--h <= 0)
+ goto odd;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[4], sum5_ptrs[4],
+ left, src, w, edges);
+ left++;
+ src += PXSTRIDE(stride);
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+ } while (--h > 0);
+
+ if (!(edges & LR_HAVE_BOTTOM))
+ goto vert_2;
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[3], sum5_ptrs[3],
+ NULL, lpf_bottom, w, edges);
+ lpf_bottom += PXSTRIDE(stride);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
+ sumsq5_ptrs[4], sum5_ptrs[4],
+ NULL, lpf_bottom, w, edges);
+
+output_2:
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_2:
+ // Duplicate the last row twice more
+ sumsq5_ptrs[3] = sumsq5_ptrs[2];
+ sumsq5_ptrs[4] = sumsq5_ptrs[2];
+ sum5_ptrs[3] = sum5_ptrs[2];
+ sum5_ptrs[4] = sum5_ptrs[2];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ goto output_2;
+
+odd:
+ // Copy the last row as padding once
+ sumsq5_ptrs[4] = sumsq5_ptrs[3];
+ sum5_ptrs[4] = sum5_ptrs[3];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 2, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+
+output_1:
+ // Duplicate the last row twice more
+ sumsq5_ptrs[3] = sumsq5_ptrs[2];
+ sumsq5_ptrs[4] = sumsq5_ptrs[2];
+ sum5_ptrs[3] = sum5_ptrs[2];
+ sum5_ptrs[4] = sum5_ptrs[2];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+ // Output only one row
+ sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
+ w, 1, params->sgr.w0, params->sgr.w1
+ HIGHBD_TAIL_SUFFIX);
+ return;
+
+vert_1:
+ // Copy the last row as padding once
+ sumsq5_ptrs[4] = sumsq5_ptrs[3];
+ sum5_ptrs[4] = sum5_ptrs[3];
+
+ sumsq3_ptrs[2] = sumsq3_ptrs[1];
+ sum3_ptrs[2] = sum3_ptrs[1];
+
+ sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
+ w, params->sgr.s0, BITDEPTH_MAX);
+ rotate_ab_2(A5_ptrs, B5_ptrs);
+ sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
+ w, params->sgr.s1, BITDEPTH_MAX);
+ rotate_ab_4(A3_ptrs, B3_ptrs);
+
+ goto output_1;
+}
+
+#endif
+
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { /* Fill c's wiener[] and sgr[] slots with NEON entry points; does nothing unless the NEON CPU flag is set. */
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; // NEON flag clear: leave every entry of c exactly as the caller set it
+
+#if ARCH_AARCH64
+ c->wiener[0] = BF(dav1d_wiener_filter7, neon); // AArch64 build: slot 0 and slot 1 get distinct filter7 / filter5 symbols
+ c->wiener[1] = BF(dav1d_wiener_filter5, neon);
+#else
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon; // other ARM builds: both slots share the single wiener_filter_neon function
+#endif
+ if (BITDEPTH == 8 || bpc == 10) { // sgr[] is only overridden for BITDEPTH 8 or bpc == 10; for any other bpc c->sgr[] is not touched here
+ c->sgr[0] = sgr_filter_5x5_neon;
+ c->sgr[1] = sgr_filter_3x3_neon;
+ c->sgr[2] = sgr_filter_mix_neon;
+ }
+}
diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h
new file mode 100644
index 0000000000..06cd533a9b
--- /dev/null
+++ b/third_party/dav1d/src/arm/mc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); // dav1d_put_*: nine 8tap variants plus bilin; these are the symbols mc_dsp_init_arm stores into c->mc[]
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); // dav1d_prep_*: same ten variants, stored into c->mct[] by mc_dsp_init_arm
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon)); // remaining declarations map one-to-one onto the single-pointer fields set at the end of mc_dsp_init_arm
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon)); // three w_mask symbols (444 / 422 / 420), one per c->w_mask[] slot
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { /* Point every motion-compensation entry of c at its NEON symbol; does nothing unless the NEON CPU flag is set. */
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; // NEON flag clear: c keeps whatever the caller installed
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); // c->mc[FILTER_2D_*] = matching dav1d_put_* symbol (name pasted by the macro)
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); // c->mct[FILTER_2D_*] = matching dav1d_prep_* symbol, same index set as c->mc[]
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+
+ c->avg = BF(dav1d_avg, neon);
+ c->w_avg = BF(dav1d_w_avg, neon);
+ c->mask = BF(dav1d_mask, neon);
+ c->blend = BF(dav1d_blend, neon);
+ c->blend_h = BF(dav1d_blend_h, neon);
+ c->blend_v = BF(dav1d_blend_v, neon);
+ c->w_mask[0] = BF(dav1d_w_mask_444, neon); // w_mask[] index order used here: 0 = 444, 1 = 422, 2 = 420
+ c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+ c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+ c->emu_edge = BF(dav1d_emu_edge, neon);
+}
diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h
new file mode 100644
index 0000000000..9db0bf86ae
--- /dev/null
+++ b/third_party/dav1d/src/arm/msac.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_MSAC_H
+#define DAV1D_SRC_ARM_MSAC_H /* NOTE(review): this header has no #include of its own; MsacContext, uint16_t and size_t must already be visible to the includer */
+
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols); // the NEON prototypes are declared unconditionally; only the aliasing further down is conditional
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+
+#if ARCH_AARCH64 || defined(__ARM_NEON) // when true, the generic dav1d_msac_decode_* names are macro-aliased to the _neon symbols at compile time (no CPU-flag check in this header)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
+#endif
+
+#endif /* DAV1D_SRC_ARM_MSAC_H */
diff --git a/third_party/dav1d/src/arm/refmvs.h b/third_party/dav1d/src/arm/refmvs.h
new file mode 100644
index 0000000000..1c2dc704cf
--- /dev/null
+++ b/third_party/dav1d/src/arm/refmvs.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_save_tmvs_fn(dav1d_save_tmvs_neon); // NEON symbols installed by refmvs_dsp_init_arm; note they are named directly, without the BF() wrapper
+decl_splat_mv_fn(dav1d_splat_mv_neon);
+
+static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { /* Install the NEON save_tmvs / splat_mv entry points into c; does nothing unless the NEON CPU flag is set. */
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; // NEON flag clear: both fields of c are left as the caller set them
+
+ c->save_tmvs = dav1d_save_tmvs_neon;
+ c->splat_mv = dav1d_splat_mv_neon;
+}