summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/arm/32/looprestoration16.S
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/dav1d/src/arm/32/looprestoration16.S')
-rw-r--r--third_party/dav1d/src/arm/32/looprestoration16.S801
1 files changed, 801 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S
new file mode 100644
index 0000000000..d699617a87
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration16.S
@@ -0,0 +1,801 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf // 32 bytes of 0x00 placed directly in front of right_ext_mask
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask: // 32 bytes of 0xff; a 32-byte load from (right_ext_mask - 2*n) yields n clear halfwords followed by set ones
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1 // horizontal 7-tap pass; r0 = dst, r1 = left, r2 = src, r3 = stride; two rows per iteration
+ push {r4-r11,lr} // 9 core registers, 36 bytes
+ vpush {q4-q7} // 64 more bytes; stack arguments now start at sp+100
+ ldrd r4, r5, [sp, #100] // r4 = fh, r5 = w
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ ldr r8, [sp, #116] // bitdepth_max
+ vld1.16 {q0}, [r4, :128] // q0 = filter taps; only lanes d0[3] and d1[0..2] are used below
+ clz r8, r8
+ vmov.i32 q14, #1
+ sub r9, r8, #38 // -(bitdepth + 6)
+ sub r8, r8, #25 // -round_bits_h
+ neg r9, r9 // bitdepth + 6
+ vdup.32 q1, r9
+ vdup.32 q13, r8 // -round_bits_h
+ vmov.i16 q15, #8192 // bias subtracted from every result just before it is stored
+ vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
+ mov r8, r5 // r8 = w, used to reset r5 for each new pair of rows
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1 // w rounded up to a multiple of 8, in bytes
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10 // r12 = dst of the second row
+ lsl r10, r10, #1 // each pass writes two rows
+ add lr, r2, r3 // lr = src of the second row
+ lsl r3, r3, #1 // each pass reads two rows
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1 // r10 = dst advance applied after a finished row pair
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11, lsl #1 // r3 = src advance applied after a finished row pair
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #6 // start 3 pixels (6 bytes) to the left of src
+ sub lr, lr, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #6
+
+
+1: // Loop vertically
+ vld1.16 {q2, q3}, [r2]! // first row: 16 pixels
+ vld1.16 {q4, q5}, [lr]! // second row: 16 pixels
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d3}, [r1]! // 4 left pixels for the first row (high half of q1)
+ // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vld1.16 {d13}, [r1]! // 4 left pixels for the second row (high half of q6)
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10 // last 3 pixels of q1 now lead the row
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+ // and shift q2/q3 to have 3x the first pixel at the front.
+ vdup.16 q1, d4[0]
+ vdup.16 q6, d8[0]
+ // Move r2/lr back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ lsl r9, r9, #1 // byte offset of the padding pixel relative to r2/lr
+ ldrh r11, [r2, r9]
+ ldrh r9, [lr, r9]
+ // Fill q11/q12 with the right padding pixel
+ vdup.16 q11, r11
+ vdup.16 q12, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6 // r4 (fh) is free here: the taps already live in q0
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q9, q10}, [r4] // mask: zero for the valid halfwords, ones after them
+
+ vbit q2, q11, q9 // take the padding pixel wherever the mask is set
+ vbit q3, q11, q10
+ vbit q4, q12, q9
+ vbit q5, q12, q10
+
+4: // Loop horizontally
+ vext.8 q7, q2, q3, #4
+ vext.8 q8, q2, q3, #8
+ vext.8 q6, q2, q3, #2
+ vext.8 q9, q2, q3, #10
+ vadd.i16 q8, q8, q7 // pixel pair one step either side of the centre
+ vadd.i16 q9, q9, q6 // pixel pair two steps either side of the centre
+ vext.8 q6, q2, q3, #12
+ vext.8 q7, q2, q3, #6 // centre pixels (3 pixels into the window)
+ vadd.i16 q2, q2, q6 // pixel pair three steps either side; q2 is overwritten
+ vmull.s16 q6, d14, d0[3] // first row, output pixels 0-3
+ vmlal.s16 q6, d16, d1[0]
+ vmlal.s16 q6, d18, d1[1]
+ vmlal.s16 q6, d4, d1[2]
+ vmull.s16 q7, d15, d0[3] // first row, output pixels 4-7
+ vmlal.s16 q7, d17, d1[0]
+ vmlal.s16 q7, d19, d1[1]
+ vmlal.s16 q7, d5, d1[2]
+
+ vext.8 q8, q4, q5, #4
+ vext.8 q10, q4, q5, #8
+ vext.8 q9, q4, q5, #2
+ vext.8 q2, q4, q5, #10 // q2 is reused as scratch for the second row
+ vadd.i16 q10, q10, q8
+ vadd.i16 q2, q2, q9
+ vext.8 q8, q4, q5, #12
+ vext.8 q9, q4, q5, #6
+ vadd.i16 q4, q4, q8
+ vmull.s16 q8, d18, d0[3] // second row, output pixels 0-3
+ vmlal.s16 q8, d20, d1[0]
+ vmlal.s16 q8, d4, d1[1]
+ vmlal.s16 q8, d8, d1[2]
+ vmull.s16 q9, d19, d0[3] // second row, output pixels 4-7
+ vmlal.s16 q9, d21, d1[0]
+ vmlal.s16 q9, d5, d1[1]
+ vmlal.s16 q9, d9, d1[2]
+
+ vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+ vadd.i32 q6, q6, q14 // add 1 << (bitdepth + 6)
+ vadd.i32 q7, q7, q14
+ vadd.i32 q8, q8, q14
+ vadd.i32 q9, q9, q14
+ vrshl.s32 q6, q6, q13 // q13 is negative, so this is a rounding right shift
+ vrshl.s32 q7, q7, q13
+ vrshl.s32 q8, q8, q13
+ vrshl.s32 q9, q9, q13
+ vqmovun.s32 d12, q6 // narrow with unsigned saturation; q6 = first row
+ vqmovun.s32 d13, q7
+ vqmovun.s32 d14, q8 // q7 = second row
+ vqmovun.s32 d15, q9
+ vmin.u16 q6, q6, q10 // clamp to 0x7fff
+ vmin.u16 q7, q7, q10
+ vsub.i16 q6, q6, q15 // subtract the 8192 bias
+ vsub.i16 q7, q7, q15
+ subs r5, r5, #8 // 8 pixels per row done; the flags are consumed by the ble below
+ vst1.16 {q6}, [r0, :128]!
+ vst1.16 {q7}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q2, q3 // slide the 16-pixel window 8 pixels to the right
+ vmov q4, q5
+ vld1.16 {q3}, [r2]!
+ vld1.16 {q5}, [lr]!
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows finished
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8 // restore the width counter
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1 // vertical 7-tap pass; r0 = dst, r1 = stride, r2 = mid, r3 = w; 8 columns per slice
+ push {r4-r7,lr} // 5 core registers, 20 bytes
+ vpush {q4-q5} // 32 more bytes; stack arguments now start at sp+52
+ ldrd r4, r5, [sp, #52] // r4 = h, r5 = fv
+ ldrd r6, r7, [sp, #60] // r6 = edges, r7 = mid_stride
+ ldr lr, [sp, #68] // bitdepth_max
+ vld1.16 {q0}, [r5, :128] // q0 = filter taps, lanes d0[0]..d1[2] are used
+ vdup.16 q5, lr // q5 = bitdepth_max in every lane, the output clamp
+ clz lr, lr
+ sub lr, lr, #11 // round_bits_v
+ vdup.32 q4, lr
+ mov lr, r4 // lr = h, kept to rewind dst and to reset r4 per slice
+ vneg.s32 q4, q4 // -round_bits_v
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1 // start reading two rows above mid
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8 // only two rows exist above; repeat the topmost one
+ vld1.16 {q11}, [r2, :128], r7 // q11 = first content row
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1 // one output row; with compare == 0 these flags feed the ble below
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ vmull.s16 q2, d16, d0[0] // columns 0-3: taps applied to rows q8..q14
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d20, d0[2]
+ vmlal.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d24, d1[0]
+ vmlal.s16 q2, d26, d1[1]
+ vmlal.s16 q2, d28, d1[2]
+ vmull.s16 q3, d17, d0[0] // columns 4-7
+ vmlal.s16 q3, d19, d0[1]
+ vmlal.s16 q3, d21, d0[2]
+ vmlal.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d25, d1[0]
+ vmlal.s16 q3, d27, d1[1]
+ vmlal.s16 q3, d29, d1[2]
+ vrshl.s32 q2, q2, q4 // round_bits_v
+ vrshl.s32 q3, q3, q4
+ vqmovun.s32 d4, q2 // narrow with unsigned saturation
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q5 // bitdepth_max
+ vst1.16 {q2}, [r0, :128], r1
+.if \compare
+ cmp r4, #4 // the caller of the macro branches on this result
+.else
+ ble 9f
+.endif
+ vmov q8, q9 // slide the row window down by one
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q14, q13 // q12/q13 hold the two edge rows; repeat the last one as the seventh tap
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15 // feed the queued rows into the seventh tap
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8 // 8 columns finished
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0 // dst -= stride * h
+ mls r2, r7, r12, r2 // mid -= mid_stride * rows consumed
+ add r0, r0, #16 // step 8 pixels (16 bytes) to the right
+ add r2, r2, #16
+ mov r4, lr // restore the row counter
+ b 1b
+
+0:
+ vpop {q4-q5}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16) // row stride, in elements, of the sum (int16_t) and sumsq (int32_t) buffers used below
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1 // horizontal 3-pixel sums and sums of squares; r0 = sumsq, r1 = sum, r2 = left, r3 = src; two rows per iteration
+ push {r4-r11,lr} // 9 core registers, 36 bytes
+ vpush {q4-q7} // 64 more bytes; stack arguments now start at sp+100
+ ldrd r4, r5, [sp, #100] // r4 = stride, r5 = w
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1 // each pass reads two rows
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1 // r9 = sum advance after a row pair; sumsq uses r9 * 2
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #4 // start 2 pixels (4 bytes) to the left of src
+ sub r12, r12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #4
+
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]! // first row: 16 pixels
+ vld1.16 {q4, q5}, [r12]! // second row: 16 pixels
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]! // 4 left pixels for the first row (high half of q2)
+ // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vld1.16 {d13}, [r2]! // 4 left pixels for the second row (high half of q6)
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12 // last 2 pixels of q2 now lead the row
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 2x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3/r12 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ lsl lr, lr, #1 // byte offset of the padding pixel relative to r3/r12
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in q0/1.h[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr] // mask: zero for the valid halfwords, ones after them
+
+ vbit q0, q14, q12 // take the padding pixel wherever the mask is set
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2 // first row, shifted by 1 pixel
+ vext.8 q10, q4, q5, #2 // second row, shifted by 1 pixel
+ vext.8 q9, q0, q1, #4 // first row, shifted by 2 pixels
+ vext.8 q11, q4, q5, #4 // second row, shifted by 2 pixels
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9 // q2 = 3-pixel sums, first row
+ vadd.i16 q3, q3, q11 // q3 = 3-pixel sums, second row
+
+ vmull.u16 q6, d0, d0 // q6/q7 = sums of squares, first row (32 bit)
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8 // q12/q13 = sums of squares, second row (32 bit)
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+ subs r5, r5, #8 // 8 outputs per row done; the flags are consumed by the ble below
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1 // slide the 16-pixel window 8 pixels to the right
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows finished
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1 // sumsq elements are twice the size of sum elements
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8 // restore the width counter
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1 // horizontal 5-pixel sums and sums of squares; r0 = sumsq, r1 = sum, r2 = left, r3 = src; two rows per iteration
+ push {r4-r11,lr} // 9 core registers, 36 bytes
+ vpush {q4-q7} // 64 more bytes; stack arguments now start at sp+100
+ ldrd r4, r5, [sp, #100] // r4 = stride, r5 = w
+ ldrd r6, r7, [sp, #108] // r6 = h, r7 = edges
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1 // each pass reads two rows
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1 // r9 = sum advance after a row pair; sumsq uses r9 * 2
+ add lr, lr, #8 // the input side reads 8 pixels more than the aligned width
+ sub r4, r4, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #6 // start 3 pixels (6 bytes) to the left of src
+ sub r12, r12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #6
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]! // first row: 16 pixels
+ vld1.16 {q4, q5}, [r12]! // second row: 16 pixels
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]! // 4 left pixels for the first row (high half of q2)
+ // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vld1.16 {d13}, [r2]! // 4 left pixels for the second row (high half of q6)
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10 // last 3 pixels of q2 now lead the row
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 3x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3/r12 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ lsl lr, lr, #1 // byte offset of the padding pixel relative to r3/r12
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -2
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr] // mask: zero for the valid halfwords, ones after them
+
+ vbit q0, q14, q12 // take the padding pixel wherever the mask is set
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2 // first row, shifted by 1 pixel
+ vext.8 q10, q4, q5, #2 // second row, shifted by 1 pixel
+ vext.8 q9, q0, q1, #4 // first row, shifted by 2 pixels
+ vext.8 q11, q4, q5, #4 // second row, shifted by 2 pixels
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9 // q2 = sum of the first 3 pixels, first row
+ vadd.i16 q3, q3, q11 // q3 = sum of the first 3 pixels, second row
+
+ vmull.u16 q6, d0, d0 // q6/q7 = sums of squares, first row (32 bit)
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8 // q12/q13 = sums of squares, second row (32 bit)
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ vext.8 q8, q0, q1, #6 // first row, shifted by 3 pixels
+ vext.8 q10, q4, q5, #6 // second row, shifted by 3 pixels
+ vext.8 q9, q0, q1, #8 // first row, shifted by 4 pixels
+ vext.8 q11, q4, q5, #8 // second row, shifted by 4 pixels
+ vadd.i16 q2, q2, q8
+ vadd.i16 q3, q3, q10
+ vadd.i16 q2, q2, q9 // q2 = 5-pixel sums, first row
+ vadd.i16 q3, q3, q11 // q3 = 5-pixel sums, second row
+
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d1, d1 // d1 holds the same data as d18 (low half of the #8 extract)
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d9, d9 // d9 holds the same data as d22 (low half of the #8 extract)
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ subs r5, r5, #8 // 8 outputs per row done; the flags are consumed by the ble below
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1 // slide the 16-pixel window 8 pixels to the right
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2 // two rows finished
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1 // sumsq elements are twice the size of sum elements
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8 // restore the width counter
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 16 // expands a macro that is not defined in this file (presumably from looprestoration_tmpl.S - confirm) with the argument 16