summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/arm/32/filmgrain.S
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/dav1d/src/arm/32/filmgrain.S')
-rw-r--r--third_party/dav1d/src/arm/32/filmgrain.S2039
1 files changed, 2039 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S
new file mode 100644
index 0000000000..d1f83efb98
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain.S
@@ -0,0 +1,2039 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+// Size of a full-resolution grain buffer. GRAIN_WIDTH is also used as the
+// row stride for all grain buffer addressing in this file.
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+// Size of the subsampled chroma grain; SUB_GRAIN_HEIGHT is used for uv_420
+// (see set_height), rows still use the GRAIN_WIDTH stride.
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+// Advance the pseudo-random state held in r2 by \steps bits.
+// The new bits are the low \steps bits of
+// r ^ (r >> 3) ^ (r >> 12) ^ (r >> 1).
+// shift=1: the state is shifted right by \steps and the new bits are inserted
+// starting at bit (16 - \steps).
+// shift=0: the state is left unshifted and the new bits are placed at bit 16
+// and up, to be shifted in one at a time later (see read_shift_rand).
+// Clobbers r11, r12 and lr - callers must have saved lr beforehand.
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
+
+// Extract a \bits wide value from the state in r2 into \dest without
+// modifying the state. The field's top bit is bit (15 - \age), i.e. age 0
+// reads the field ending at bit 15 and each higher age reads one bit lower.
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+// Extract the \bits wide field ending at bit 16 of r2 into \dest, then shift
+// the state in r2 right by one bit. Used after increment_seed with shift=0,
+// where the freshly generated bits sit at bit 16 and up.
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// Fetch 8 entries from the gaussian sequence table.
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+// The seed is advanced twice by 4 steps; each step yields an 11 bit index
+// which is scaled by 2 (16 bit table entries) and loaded into one lane.
+// r5-r6 are used as scratch and restored; lr is saved since increment_seed
+// clobbers it.
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4 // second batch of 4 values; r6 was already read above
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+// Produce one full row of 82 grain entries (10 * 8 + 2) as signed bytes in
+// the eleven d registers passed as arguments; only the two lowest bytes of
+// the last register are meaningful.
+// Each batch of gaussian values is shifted with vrshl by q15 and narrowed to
+// 8 bit; the callers in this file set q15 to -(4 + grain_scale_shift), making
+// that a rounding right shift.
+// Note: the macro parameters are named r0-r10, but only the backslash-prefixed
+// forms are substituted; plain r3/r11/r12 below are the core registers.
+// Expects r2 = seed, r3 = gaussian table. Clobbers q0, r11, r12 and lr.
+.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r5, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r6, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r7, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r8, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r9, q0
+ // The last two entries of the row (80 and 81)
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r10, q0
+.endm
+
+// Store one row produced by get_grain_row to [r0], advancing r0 by
+// 32 + 32 + 16 + 2 = 82 bytes; only one 16 bit lane (two entries) of \r10 is
+// written.
+.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5, \r6, \r7}, [r0]!
+ vst1.16 {\r8, \r9}, [r0]!
+ vst1.16 {\r10[0]}, [r0]!
+.endm
+
+// Produce one row of 44 grain entries (5 * 8 + 4) as signed bytes in the six
+// d registers passed as arguments; only the four lowest bytes of \r5 are
+// freshly generated values.
+// Expects r2 = seed, r3 = gaussian table, q15 = shift for vrshl.
+// Clobbers q0, r11, r12 and lr.
+.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ // The last four entries of the row (40-43)
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[]}, [r11] // load to all lanes; lanes 1-3 are overwritten below
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ read_rand r12, 11, 0
+ vld1.16 {d0[2]}, [r11]
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r5, q0
+.endm
+
+// Store one row produced by get_grain_row_44 to [r0]. 48 bytes are written
+// (32 with writeback, then 16 without), and r0 ends up advanced by
+// GRAIN_WIDTH in total, i.e. at the start of the next row.
+.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5}, [r0]
+ add r0, r0, #GRAIN_WIDTH-32
+.endm
+
+// Produce two grain entries as signed bytes in the two lowest bytes of d0.
+// Expects r2 = seed, r3 = gaussian table, d30 = shift for vrshl.
+// Preserves r11; clobbers r12 and q0.
+function get_grain_2_neon
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 d0, q0
+ pop {r11,pc}
+endfunc
+
+// Call get_grain_2_neon and move the result into \dst, unless \dst already
+// is d0.
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+// Defines output_lag\n\()_neon, which produces r1 output entries, one at a
+// time, combining the precomputed sums from the rows above with the previous
+// outputs on the same row.
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+// Further registers, as used below:
+// r2 = seed (with new bits at bit 16 and up), r3 = gaussian table
+// r5 = upper clamp value, r7 = ar_coeff_shift, r9 = 4 + grain_scale_shift
+// lag1: r4 = coefficient, r6 = previous output, r8 and r10 = rounding
+// constants for the two shifts
+// lag2: r4, r10 = coefficients, r8 and r6 = previous outputs (r6 newest)
+// lag3: r4 = three signed 8 bit coefficients packed in bits 0-23,
+// r10, r8 and r6 = previous outputs (r6 newest)
+// For lag2/lag3, r8/r10 are occupied, so the rounding constants are
+// rebuilt into r0 and lr on entry; lr is spilled around the clamp since it
+// is also used for the lower clamp value -128.
+// Each produced entry is shifted into the top byte of q0 (q0 is rotated down
+// by one byte first) and q1 is rotated by one 32 bit lane.
+// Clobbers r11, r12 and flags; r1 is counted down to zero.
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mov lr, #-128
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7
+ lsl lr, lr, r9
+ add r7, r7, #1
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0]
+ lsl r12, r12, #1
+ vext.8 q0, q0, q0, #1
+ ldrsh r12, [r3, r12]
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.else
+ push {r1-r3}
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.endif
+ // Clamp r6 to [lr, r5]; the flags from the cmp above are still valid here.
+ it gt
+ movgt r6, r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4
+ vmov.8 d1[7], r6
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+// Sum the contributions from the row above for lag 1.
+// In: q0, q3, q1 = three 16 byte vectors of the row above (as set up by the
+// sum_lag1 macro: shifted by -1, unshifted and shifted by +1),
+// d27, d28, d29 = the coefficients applied to q0, q3 and q1.
+// Out: q2, q3, q4, q5 = 32 bit sums for elements 0-3, 4-7, 8-11, 12-15.
+// Clobbers q0 and q1.
+function sum_lag1_above_neon
+ vmull.s8 q2, d6, d28
+ vmull.s8 q3, d7, d28
+ vmull.s8 q4, d0, d27
+ vmull.s8 q5, d1, d27
+
+ vaddl.s16 q0, d4, d8
+ vaddl.s16 q2, d5, d9
+ vaddl.s16 q4, d6, d10
+ vaddl.s16 q5, d7, d11
+
+ vmull.s8 q3, d3, d29
+ vmull.s8 q1, d2, d29
+
+ vaddw.s16 q4, q4, d6
+ vaddw.s16 q5, q5, d7
+ vaddw.s16 q3, q2, d3
+ vaddw.s16 q2, q0, d2
+ bx lr
+endfunc
+
+// Body shared by all sum_<type>_<lag>_<edge>_neon functions; produces one
+// 16 byte block of output grain in q0. The enclosing function is expected to
+// have done "push {r1, lr}" (matched by the final "pop {r1, pc}").
+// lag: lag1, lag2 or lag3
+// type: y, uv_444, uv_422 or uv_420
+// uv_layout: 0 for luma, otherwise 444, 422 or 420
+// edge: left, mid or right
+// elems: number of filtered+plain entries in this block (16, 15 or 9)
+// store: if nonzero, the block is stored to [r0] with writeback
+// uv_coeff: optional lane to broadcast into d13 as the luma coefficient
+// Steps:
+// 1. The sums from the rows above are computed into q2-q5.
+// 2. For chroma, the co-located luma grain is loaded from [r11] (with
+// writeback), averaged if subsampled, multiplied by d13 and accumulated.
+// 3. Chroma variants whose remaining code is identical to an already
+// emitted variant branch into that variant's _start label instead of
+// duplicating it.
+// 4. The entries are produced via output_<lag>_neon; at the left edge the
+// first three entries are plain (unfiltered) grain, and at the right
+// edge the block is padded with plain grain.
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ // Average 2x2 luma entries: two rows of 32 bytes each
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vpaddl.s8 q6, q6
+ vpaddl.s8 q7, q7
+ vadd.i16 q0, q0, q6
+ vadd.i16 q1, q1, q7
+ vpop {q6-q7}
+ vrshrn.s16 d0, q0, #2
+ vrshrn.s16 d1, q1, #2
+.endif
+.ifc \type, uv_422
+ // Average horizontal pairs of luma entries
+ vld1.8 {q0, q1}, [r11]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vrshrn.s16 d0, q0, #1
+ vrshrn.s16 d1, q1, #1
+.endif
+.ifc \type, uv_444
+ vld1.8 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+.endif
+ // Accumulate luma * coefficient onto the sums
+ vmull.s8 q1, d0, d13
+ vmull.s8 q0, d1, d13
+ vaddw.s16 q2, q2, d2
+ vaddw.s16 q3, q3, d3
+ vaddw.s16 q4, q4, d0
+ vaddw.s16 q5, q5, d1
+.endif
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.ifc \edge, left
+ // Three plain grain entries, ending up in the top three bytes of d1
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vmovn.i16 d1, q0
+ vext.8 q2, q2, q2, #12 // move the sum for element 3 into lane 0
+ // Seed the previous output registers from the plain entries
+.ifc \lag, lag3
+ vmov.s8 r10, d1[5]
+.endif
+.ifnc \lag, lag1
+ vmov.s8 r8, d1[6]
+.endif
+ vmov.s8 r6, d1[7]
+
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q4
+.if \elems == 9
+ // One more filtered entry, then three plain ones from the same seed step
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vmovn.i16 d2, q1
+ vext.8 q0, q0, q1, #7 // 9 filtered entries followed by the plain ones
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q5
+
+.ifc \edge, right
+ // Three filtered entries and one plain entry
+ mov r1, #3
+ bl output_\lag\()_neon
+ read_shift_rand r11, 11
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #1 // appends the low byte of d2[0]
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ vst1.8 {q0}, [r0]!
+.endif
+ pop {r11}
+ pop {r1, pc}
+.endif
+.endm
+
+// Defines sum_<type>_lag1_<edge>_neon. The result is returned in q0 and not
+// stored (store=0); the caller stores whole rows. r1 is preserved.
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+// The y variants must be instantiated first, and uv_420 right before
+// uv_422 right would be required if reordered, since sum_lag_n_body branches
+// to labels emitted by the y and uv_420 instantiations.
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+// Produce one 16 byte block of lag 1 output in \dst.
+// \left, \mid and \right are three consecutive 16 byte blocks of the row
+// above; q0 and q1 are set to \mid shifted by one entry in either direction
+// and q3 to \mid itself, as inputs for sum_lag1_above_neon.
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ vmov q3, \mid
+ vext.8 q0, \left, \mid, #15
+ vext.8 q1, \mid, \right, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ vmov \dst, q0
+.endm
+
+// Per-type wrappers, so that callers can expand sum_\type\()_lag1.
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+// Sum the contributions from the two rows above for lag 2.
+// In: r0 = pointer to the current output position,
+// q8, q9 = previous and current 16 byte blocks of the row two above,
+// q11, q12 = previous and current 16 byte blocks of the row right above,
+// d28-d29 = coefficients (lanes d28[0-7], d29[0-1] are used here).
+// The next blocks to the right of both rows are loaded into q10 and q13.
+// Out: q2, q3, q4, q5 = 32 bit sums for elements 0-3, 4-7, 8-11, 12-15;
+// q8/q9 and q11/q12 are advanced by one block for the next call.
+// Clobbers q0, q1, q6, q7, q10, q13 and r12.
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH - 16
+ sub lr, r0, #1*GRAIN_WIDTH - 16
+ vld1.8 {q10}, [r12] // load top right
+ vld1.8 {q13}, [lr]
+
+ // Row -2, offsets -2 and -1
+ vext.8 q6, q8, q9, #14 // top left, top mid
+ vdup.8 d14, d28[0]
+ vext.8 q8, q8, q9, #15
+ vdup.8 d15, d28[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d16
+ vaddl.s16 q5, d3, d17
+
+ // Row -2, offsets +1 and +2
+ vext.8 q6, q9, q10, #1 // top mid, top right
+ vdup.8 d14, d28[3]
+ vext.8 q8, q9, q10, #2
+ vdup.8 d15, d28[4]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ // Row -1, offsets -2 and -1
+ vext.8 q6, q11, q12, #14 // top left, top mid
+ vdup.8 d14, d28[5]
+ vext.8 q8, q11, q12, #15
+ vdup.8 d15, d28[6]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ // Row -1, offsets +1 and +2
+ vext.8 q6, q12, q13, #1 // top mid, top right
+ vdup.8 d14, d29[0]
+ vext.8 q8, q12, q13, #2
+ vdup.8 d15, d29[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ // Offset 0 of both rows
+ vdup.8 d14, d28[2]
+ vdup.8 d15, d28[7]
+
+ vmull.s8 q0, d18, d14
+ vmull.s8 q1, d19, d14
+ vmull.s8 q6, d24, d15
+ vmull.s8 q8, d25, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ // Advance the row -2 window by one block
+ vmov q8, q9
+ vmov q9, q10
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ // Advance the row -1 window by one block
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+// Defines sum_<type>_lag2_<edge>_neon, producing and storing one 16 byte
+// block at [r0] (with writeback). At the left edge, the first blocks of the
+// two rows above are loaded into q9 and q12 as the "current" blocks for
+// sum_lag2_above_neon. For chroma, the luma coefficient is taken from d29[4].
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH
+ sub lr, r0, #1*GRAIN_WIDTH
+ vld1.8 {q9}, [r12] // load the previous block right above
+ vld1.8 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+// Left edge entry point for the lag 3 sums from above. Only the first row
+// (three rows up) is loaded differently; the two other rows are loaded from
+// 3 bytes before the row start within sum_lag3_above_neon.
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH
+ vld1.8 {q11, q12}, [r12]
+ // Shift the data up by 3 bytes, as if it had been loaded from r12 - 3
+ vext.8 q12, q11, q12, #13
+ vext.8 q11, q11, q11, #13
+ b sum_lag3_above_start
+endfunc
+
+// Sum the contributions from the three rows above for lag 3.
+// In: r0 = pointer to the current output position,
+// d26-d28 = coefficients; d26[0-6] apply to the row three above,
+// d26[7], d27[0-5] to the row two above and d27[6-7], d28[0-4]
+// to the row right above, each for horizontal offsets -3 to +3.
+// For each row, 32 bytes starting 3 bytes left of the current position are
+// loaded into q11-q12 and the shifted variants are formed with vext.
+// Out: q2, q3, q4, q5 = 32 bit sums for elements 0-3, 4-7, 8-11, 12-15.
+// Clobbers q0, q1, q6-q12 and r12.
+function sum_lag3_above_neon
+ sub r12, r0, #3*GRAIN_WIDTH + 3
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d20, d26[0]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d26[1]
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vext.8 q8, q11, q12, #2
+ vdup.8 d20, d26[2]
+ vext.8 q9, q11, q12, #3
+ vdup.8 d21, d26[3]
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d14
+ vaddl.s16 q5, d3, d15
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #4
+ vdup.8 d20, d26[4]
+ vext.8 q7, q11, q12, #5
+ vdup.8 d21, d26[5]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ sub r12, r0, #2*GRAIN_WIDTH + 3
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ // Last offset of the row three above; then switch to the row two above
+ vext.8 q8, q11, q12, #6
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d20, d26[6]
+ vdup.8 d21, d26[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d22, d21
+ vmull.s8 q7, d23, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #1
+ vdup.8 d20, d27[0]
+ vext.8 q7, q11, q12, #2
+ vdup.8 d21, d27[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #3
+ vdup.8 d20, d27[2]
+ vext.8 q9, q11, q12, #4
+ vdup.8 d21, d27[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ sub r12, r0, #1*GRAIN_WIDTH + 3
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #5
+ vdup.8 d20, d27[4]
+ vext.8 q7, q11, q12, #6
+ vdup.8 d21, d27[5]
+
+ // Switch to the row right above
+ vld1.8 {q11, q12}, [r12]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vdup.8 d20, d27[6]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d27[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #2
+ vdup.8 d20, d28[0]
+ vext.8 q7, q11, q12, #3
+ vdup.8 d21, d28[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #4
+ vdup.8 d20, d28[2]
+ vext.8 q9, q11, q12, #5
+ vdup.8 d21, d28[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ // The final, unpaired term: row right above, last offset
+ vext.8 q6, q11, q12, #6
+ vdup.8 d20, d28[4]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+
+ vaddw.s16 q2, q2, d0
+ vaddw.s16 q3, q3, d1
+ vaddw.s16 q4, q4, d2
+ vaddw.s16 q5, q5, d3
+
+ bx lr
+endfunc
+
+// Defines sum_<type>_lag3_<edge>_neon, producing and storing one 16 byte
+// block at [r0] (with writeback). For chroma, the luma coefficient is taken
+// from d29[0].
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+// Generate and store r1 full rows (82 entries each) of plain grain at [r0].
+// On return, q8-q12 and d26 still hold the last generated row.
+// Expects r2 = seed, r3 = gaussian table, q15 = shift for vrshl.
+// r0 is advanced past the stored rows, r1 is counted down to zero.
+function generate_grain_rows_neon
+ push {r11,lr}
+1:
+ get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ subs r1, r1, #1
+ store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+// Generate and store r1 rows of 44 entries each of plain grain at [r0],
+// using a row stride of GRAIN_WIDTH. On return, q8-q10 still hold the last
+// generated row.
+// Expects r2 = seed, r3 = gaussian table, q15 = shift for vrshl.
+function generate_grain_rows_44_neon
+ push {r11,lr}
+1:
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ subs r1, r1, #1
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+// Produce and store 16 entries of lag 0 chroma grain for 4:4:4.
+// In: r0 = output pointer, r11 = luma grain pointer (both advanced by 16),
+// q1 = byte mask applied to the luma grain, d22 = luma coefficient,
+// q12 = shift applied with vrshl to the luma product (the caller sets
+// it to -ar_coeff_shift), q15 = shift for the gaussian values.
+// Output = saturate(round_shift(luma * coeff) + grain).
+// Clobbers q0, q2, q3, q8, q9, r12.
+function gen_grain_uv_444_lag0_neon
+ vld1.8 {q3}, [r11]!
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q8, q0, q15
+ bl get_gaussian_neon
+ vrshl.s16 q9, q0, q15
+ vqmovn.s16 d0, q8
+ vqmovn.s16 d1, q9
+
+ vand q3, q3, q1
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2
+ vqmovn.s16 d5, q3
+ vst1.8 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+// Function wrapper around get_grain_row_44; returns one row of 44 plain
+// grain entries in q8-q10 (d16-d21). Preserves r11.
+function get_grain_row_44_neon
+ push {r11,lr}
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ pop {r11,pc}
+endfunc
+
+// Lag 0 chroma for 4:2:0: average 2x2 luma grain entries from the two rows
+// at [r11] and [r12] (32 bytes each, both pointers advanced by 32) into 16
+// entries in q2, then continue in add_coeff_lag0_start.
+// Clobbers q3-q5.
+function add_uv_420_coeff_lag0_neon
+ vld1.16 {q2, q3}, [r11]!
+ vld1.16 {q4, q5}, [r12]!
+ vpaddl.s8 q2, q2
+ vpaddl.s8 q3, q3
+ vpaddl.s8 q4, q4
+ vpaddl.s8 q5, q5
+ vadd.i16 q2, q2, q4
+ vadd.i16 q3, q3, q5
+ vrshrn.s16 d4, q2, #2
+ vrshrn.s16 d5, q3, #2
+ b add_coeff_lag0_start
+endfunc
+
+// Lag 0 chroma for 4:2:2: average horizontal pairs of luma grain entries
+// from [r11] (32 bytes, pointer advanced by 32) into 16 entries in q2.
+// The shared tail at add_coeff_lag0_start then computes
+// q2 = saturate(round_shift((luma & q1) * d22) + q0)
+// where q0 holds the plain chroma grain, q1 a byte mask, d22 the luma
+// coefficient and q12 the shift passed to vrshl.
+// Clobbers q3.
+function add_uv_422_coeff_lag0_neon
+ vld1.16 {q2, q3}, [r11]!
+ vpaddl.s8 q2, q2
+ vpaddl.s8 q3, q3
+ vrshrn.s16 d4, q2, #1
+ vrshrn.s16 d5, q3, #1
+
+add_coeff_lag0_start:
+ vand q3, q2, q1
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2
+ vqmovn.s16 d5, q3
+ bx lr
+endfunc
+
+// Instantiates the exported generate_grain_<type>_8bpc_neon for the types
+// producing a full GRAIN_WIDTH x GRAIN_HEIGHT grain buffer (y and uv_444).
+// Register arguments, as used by the code below:
+// y: r0 = output grain buffer, r1 = film grain data struct
+// uv_444: r0 = output grain buffer, r1 = luma grain buffer,
+// r2 = film grain data struct,
+// r3 = index multiplied by 28 to select the ar_coeffs_uv row
+// (presumably the chroma plane index - confirm against the
+// C prototype)
+// Register roles set up before dispatching on ar_coeff_lag:
+// r2 = seed, r3 = gaussian table, r4 = pointer to the AR coefficients,
+// r7 = ar_coeff_shift, r8 = 1 << (ar_coeff_shift - 1),
+// r9 = 4 + grain_scale_shift, r10 = 1 << (4 + grain_scale_shift - 1),
+// q15 = -(4 + grain_scale_shift), r11 = luma grain pointer (uv_444 only),
+// starting at row 3.
+// The first 3 rows are always plain grain; the remaining rows are filtered
+// according to the lag.
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH
+ mov r1, r2
+ mul r12, r12, lr
+.endif
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+.ifc \type, uv_444
+ // Pick the constant to xor into the seed: 0xb524 if r12 == 0,
+ // otherwise 0x49d8.
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset into ar_coeffs_uv (r12 = r3 * 28)
+ eor r2, r2, r10
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+ // Offsets of the per-lag code paths, relative to the table itself
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ // Luma with lag 0 is plain grain only
+ mov r1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13 // mask clearing the first 3 bytes
+ vext.8 q14, q1, q0, #1 // mask clearing the last byte
+ vneg.s16 q12, q12
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 16
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 64
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #2 // skip the last 2 luma entries of the row
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp for output_lag1_neon
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ // q8-q12, d26 hold the row above; the new row is built in q7-q10, q12
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q9, q9, q10, q11
+ sum_\type\()_lag1 q10, q10, q11, q12
+ sum_\type\()_lag1 q12, q11, q12, q13, right
+ get_grain_2 d26
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
+ // Move the new row back into q8-q11 (q12 is already in place)
+ vmov q11, q10
+ vmov q10, q9
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp for output_lag2_neon
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ // The two coefficients for the previous outputs on the same row
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp for output_lag3_neon
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ // Pack the three same-row coefficients into r4, 8 bits each
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ // d26 holds coefficients but is overwritten by generate_grain_rows_neon
+ vpush {d26}
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+// Load the number of rows left to filter (after the 3 initial plain rows)
+// into \dst: the full grain height for every type except uv_420, which uses
+// the subsampled height.
+.macro set_height dst, type
+.ifnc \type, uv_420
+ mov \dst, #GRAIN_HEIGHT-3
+.else
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.endif
+.endm
+
+// Move the luma grain pointer \reg to the start of the next luma row to
+// read, after 3*32 bytes have been consumed from the current one: one row
+// down normally, two rows down for uv_420.
+.macro increment_y_ptr reg, type
+.ifnc \type, uv_420
+ sub \reg, \reg, #3*32-GRAIN_WIDTH
+.else
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.endif
+.endm
+
+// Instantiates the exported generate_grain_<type>_8bpc_neon for the
+// horizontally subsampled chroma types (uv_420 and uv_422), producing rows
+// of 44 entries with a row stride of GRAIN_WIDTH.
+// Register arguments, as used by the code below:
+// r0 = output grain buffer, r1 = luma grain buffer,
+// r2 = film grain data struct,
+// r3 = index multiplied by 28 to select the ar_coeffs_uv row (presumably
+// the chroma plane index - confirm against the C prototype)
+// Register roles set up before dispatching on ar_coeff_lag:
+// r2 = seed, r3 = gaussian table, r4 = pointer to the AR coefficients,
+// r7 = ar_coeff_shift, r8 = 1 << (ar_coeff_shift - 1),
+// r9 = 4 + grain_scale_shift, r10 = 1 << (4 + grain_scale_shift - 1),
+// q15 = -(4 + grain_scale_shift),
+// r11 = luma grain pointer, starting at row 3, 3 bytes before the row start.
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH-3
+ mov r1, r2
+ mul r12, r12, lr
+
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+ // Pick the constant to xor into the seed: 0xb524 if r12 == 0,
+ // otherwise 0x49d8.
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid armv8 deprecated it instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset into ar_coeffs_uv (r12 = r3 * 28)
+ eor r2, r2, r10
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+ // Offsets of the per-lag code paths, relative to the table itself
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ // add_uv_420_coeff_lag0_neon clobbers q4-q5
+ vpush {q4-q5}
+.endif
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13 // mask clearing the first 3 bytes
+ vext.8 q14, q1, q0, #7 // mask keeping only the first 9 bytes
+ vneg.s16 q12, q12
+
+1:
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ add r12, r11, #GRAIN_WIDTH
+.endif
+ vmov q1, q13
+ vmov q0, q8
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, #255
+ vmov q0, q9
+ vmov q8, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, q14
+ vmov q0, q10
+ vmov q9, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov q10, q2
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp for output_lag1_neon
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ // q8-q10 hold the row above; the new row is built in q7, q8, q10
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q10, q9, q10, q11, right
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d14, d15, d16, d17, d20, d21
+ // Move the new row back into q8-q9 (q10 is already in place)
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp for output_lag2_neon
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ // The two coefficients for the previous outputs on the same row
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48 // 3*16 bytes were stored; go to the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127 // upper clamp for output_lag3_neon
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_uv[0-24]
+
+ // Pack the three same-row coefficients into r4, 8 bits each
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48 // 3*16 bytes were stored; go to the next row
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+// Table lookup for every second byte lane of two d registers:
+// for i in {0, 2, 4, 6} + \off:
+// \dst1[i] = byte at r3 + \src1[i], \dst2[i] = byte at r3 + \src2[i]
+// The indices are read as unsigned bytes. The loads for the two register
+// pairs are interleaved, using r11, r12 and lr as address registers.
+// Clobbers r11, r12 and lr.
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ vmov.u8 r11, \src1[0+\off]
+ vmov.u8 r12, \src2[0+\off]
+ add r11, r11, r3
+ vmov.u8 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11]
+ vmov.u8 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u8 r12, \src1[4+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u8 lr, \src2[4+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u8 r11, \src1[6+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u8 r12, \src2[6+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+// Full byte-wise table lookup of four d registers (32 bytes):
+// \dstN[i] = byte at r3 + \srcN[i] for all 8 lanes of each register.
+// Clobbers r11, r12 and lr.
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
+ gather_interleaved \dst1, \dst3, \src1, \src3, 0
+ gather_interleaved \dst1, \dst3, \src1, \src3, 1
+ gather_interleaved \dst2, \dst4, \src2, \src4, 0
+ gather_interleaved \dst2, \dst4, \src2, \src4, 1
+.endm
+
+// Look up 32 bytes: d8-d11 = table[d0-d3], with the table base in r3.
+// Preserves r11-r12.
+function gather32_neon
+ push {r11-r12,lr}
+ gather d8, d9, d10, d11, d0, d1, d2, d3
+ pop {r11-r12,pc}
+endfunc
+
+// Look up 16 bytes: d8-d9 = table[d0-d1], with the table base in r3.
+// Preserves r11-r12.
+function gather16_neon
+ push {r11-r12,lr}
+ gather_interleaved d8, d9, d0, d1, 0
+ gather_interleaved d8, d9, d0, d1, 1
+ pop {r11-r12,pc}
+endfunc
+
+// Overlap blend weights used without horizontal subsampling (loaded by
+// fgy_32x32 and by the fguv variant with sx=0). The first row is loaded
+// into d24 and the second into d25.
+//
+// In the x overlap code d24 multiplies the neighbouring ("old") grain and
+// d25 the current grain, and the sum is narrowed with a rounding shift by 5.
+// Only lanes 0 and 1 actually blend; the weights 0 and 32 in the other lanes
+// reproduce the current grain unchanged.
+//
+// The y overlap code reuses lanes 0 and 1 of these rows as (top, current)
+// weight pairs: fgy takes the pair for the first overlapped row from d24,
+// and both fgy and the sx=0 chroma loop take the pair for the second row
+// from d25.
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0
+ .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+// Overlap blend weights used with horizontal subsampling (selected by the
+// fguv macro as overlap_coeffs_<sx> with sx=1). The first row is loaded into
+// d24 and multiplies the neighbouring ("old") grain; the second row is
+// loaded into d25 and multiplies the current grain. Only lane 0 blends
+// (23 * old + 22 * current); the weights 0 and 32 in the other lanes
+// reproduce the current grain unchanged after the rounding shift by 5.
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0
+ .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
+
+// Split a packed random value into grain offsets.
+//
+// offy = src & 0xF, offx = src >> 4. Each offset is doubled when the
+// matching subsampling flag (sy for offy, sx for offx) is 0.
+//
+// offy is written before src is read for the last time, so offy must be a
+// different register from src. offx may be the same register as src, and
+// the callers in this file rely on that.
+// sx and sy must be assemble-time constants.
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+// dst = src + stride * offy + offx
+//
+// Turns a base pointer and an (offx, offy) pair into a pointer inside the
+// grain table. The mla reads all of its inputs before writing dst, so dst
+// may alias offy or src (both cases occur at the call sites in this file).
+// dst must not alias offx, which is read after dst has been written.
+.macro add_offset dst, offx, offy, src, stride
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx // grain_lut += offx
+.endm
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
+//
+// Entry point for applying grain to a block of luma rows, 32 pixels wide.
+// This function only performs the setup; it ends with a computed branch
+// into one of the loops of fgy_loop_neon, and that loop also executes the
+// epilogue (vpop/pop) on behalf of this function.
+//
+// type selects the loop through L(fgy_loop_tbl): bit 0 enables y overlap
+// (tested below); the table order (loop_00, loop_01, loop_10, loop_11, named
+// loop_<ox><oy>) makes bit 1 the x overlap flag.
+//
+// Register state when the loop is entered:
+// r0 = dst, r1 = src, r2 = stride, r3 = scaling (register arguments,
+// left untouched here)
+// r4 = grain_lut pointer for the "old" block (offsets[1][0], +32 columns)
+// r5 = grain_lut pointer for the current block (offsets[0][0])
+// r6 = grain_lut pointer for the top block (offsets[0][1], +32 rows)
+// r8 = grain_lut pointer for the top old block (offsets[1][1],
+// +32 rows, +32 columns)
+// r7 = row count (2 when y overlap is active)
+// r9 = GRAIN_WIDTH (grain_lut row stride)
+// r10 = the real h, only when y overlap is active
+// q12 = overlap coefficients (d24, d25)
+// q13 = -scaling_shift in every 16 bit lane
+// q14 = lower clamp, q15 = upper clamp
+// d14 = top weight, d15 = current weight for the first y overlap row
+function fgy_32x32_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ // The stack arguments start at sp + 100: 36 bytes of pushed core
+ // registers plus 64 bytes of pushed q4-q7.
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH // grain_lut stride
+
+ // The shift amount is negated so that the loops can apply it as a
+ // rounding right shift through vrshl.
+ neg r4, r4
+ vdup.16 q13, r4 // -scaling_shift
+ // The flags set here are consumed by the beq below.
+ cmp r8, #0
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i8 q14, #16
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ // Skip 9 columns and 9 rows of the grain table.
+ add r5, r5, #9 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ // Build the four grain pointers. calc_offset is called with
+ // sx = sy = 0, so both offsets are doubled. r5 is the common base and
+ // is therefore updated last.
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ // The flags from this tst stay live until the beq below; none of the
+ // instructions in between sets flags.
+ tst r10, #1
+ // Fetch the table entry for this type (offset of the loop relative to
+ // the table).
+ ldr r10, [r11, r10, lsl #2]
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
+
+ // r11 = address of the selected loop
+ add r11, r11, r10
+
+ beq 1f
+ // y overlap
+ // First overlapped row: top weight from d24[0], current from d24[1].
+ vdup.8 d14, d24[0]
+ vdup.8 d15, d24[1]
+ mov r10, r7 // backup actual h
+ mov r7, #2
+1:
+ bx r11
+endfunc
+
+// Row loops for fgy_32x32_8bpc_neon. This is not called in the usual sense:
+// the setup code branches to one of the four loops through L(fgy_loop_tbl)
+// with its registers pushed, and the loop pops them and returns to the
+// original caller.
+//
+// Each table entry is the offset of a loop relative to the table itself
+// (plus CONFIG_THUMB). The entries are ordered loop_00, loop_01, loop_10,
+// loop_11, where the digits are the ox and oy flags.
+//
+// Registers on entry are as prepared by fgy_32x32_8bpc_neon: r0 = dst,
+// r1 = src, r2 = stride, r3 = scaling, r4 = grain old, r5 = grain,
+// r6 = grain top, r8 = grain top old, r7 = rows, r9 = grain stride,
+// r10 = real h (oy only), q12 = overlap coefficients, q13 = -scaling_shift,
+// q14/q15 = clamp limits, d14/d15 = top/current weights (oy only).
+function fgy_loop_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+// Emits one loop variant. ox enables blending with the block on the left,
+// oy blending with the block above. Every iteration handles one row of 32
+// pixels.
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q2, q3}, [r6], r9 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r8], r9 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+ vld1.8 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ // x overlap on the first 8 grain values: old * d24 + current * d25
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ // Apply the same x overlap to the top row before blending vertically.
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d4, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d4, q5, #5
+.endif
+
+ // y overlap on all 32 values: current * d15 + top * d14, followed by a
+ // saturating rounding shift right by 5.
+ vmull.s8 q4, d20, d15
+ vmull.s8 q5, d21, d15
+ vmull.s8 q8, d22, d15
+ vmull.s8 q9, d23, d15
+ vmlal.s8 q4, d4, d14
+ vmlal.s8 q5, d5, d14
+ vmlal.s8 q8, d6, d14
+ vmlal.s8 q9, d7, d14
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q8, #5
+ vqrshrn.s16 d23, q9, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+
+ // d8-d11 = bytes at [r3 + src pixel] for the 32 source pixels in d0-d3
+ bl gather32_neon
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q2, d8 // scaling
+ vmovl.u8 q3, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q2 // scaling * grain
+ vmul.i16 q9, q9, q3
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ // q13 holds a negative count, so this is a rounding right shift.
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ // Saturate to unsigned 8 bit.
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ // Clamp to the limits chosen by the setup code.
+ vmax.u8 q0, q0, q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ // The flags from this subs are consumed by the bgt after the store;
+ // the NEON instructions in between leave them alone.
+ subs r7, r7, #1
+.if \oy
+ // Switch to the weights for the second overlapped row.
+ vdup.8 d14, d25[0]
+ vdup.8 d15, d25[1]
+.endif
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ // Both overlapped rows are done. If more rows remain, continue in the
+ // variant without y overlap but with the same ox.
+ cmp r10, #2
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0)
+.endif
+ // Epilogue matching the prologue of fgy_32x32_8bpc_neon.
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
+//
+// The fguv macro emits the exported setup function for one chroma layout.
+// sx and sy are the horizontal and vertical subsampling flags of that
+// layout. The function only performs the setup and then branches through
+// L(fguv_loop_sx<sx>_tbl) into one of the row loops, which also executes the
+// epilogue.
+//
+// type is the index into that table. Bit 0 enables y overlap (tested below);
+// the table order (csfl0_00, csfl0_01, csfl0_10, csfl0_11, csfl1_00, ...,
+// named csfl<csfl>_<ox><oy>) makes bit 1 the x overlap flag and bit 2 the
+// csfl flag.
+//
+// Register state when the loop is entered:
+// r0 = dst, r1 = src, r2 = stride, r3 = scaling (register arguments,
+// left untouched here)
+// r4 = grain_lut pointer for the "old" block (offsets[1][0])
+// r5 = grain_lut pointer for the current block (offsets[0][0])
+// r8 = grain_lut pointer for the top block (offsets[0][1])
+// r11 = grain_lut pointer for the top old block (offsets[1][1])
+// r6 = luma_row, r7 = luma_stride (doubled when sy is set)
+// r9 = row count (2 >> sy when y overlap is active)
+// r10 = GRAIN_WIDTH (grain_lut row stride)
+// lr = rows remaining after the overlapped ones (y overlap only)
+// d4 = lane 0 uv_luma_mult, lane 1 uv_mult, lane 2 uv_offset
+// d6 = top weight, d7 = current weight for the first y overlap row
+// q12 = overlap coefficients (d24, d25)
+// q13 = -scaling_shift in every 16 bit lane
+// q14 = lower clamp, q15 = upper clamp
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ // The stack arguments start at sp + 100: 36 bytes of pushed core
+ // registers plus 64 bytes of pushed q4-q7.
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+ ldrd r8, r9, [sp, #116] // offsets, h
+ ldrd r10, r11, [sp, #124] // uv, is_id
+
+ // !csfl
+ // The three values loaded into d4 here are only read by the loop
+ // variants assembled with csfl == 0. Each field is indexed by uv with a
+ // 4 byte element stride.
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ add r10, r10, #FGD_UV_OFFSET
+ // The first load fills every lane; lanes 2 and 1 are then overwritten.
+ vld1.16 {d4[]}, [r12] // uv_luma_mult
+ vld1.16 {d4[2]}, [r10] // uv_offset
+ vld1.16 {d4[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg lr, lr // -scaling_shift
+
+ cmp r12, #0
+ vdup.16 q13, lr // -scaling_shift
+
+ beq 1f
+ // clip
+ // The upper limit is 240, or 235 when is_id is non-zero.
+ cmp r11, #0
+ vmov.i8 q14, #16
+ vmov.i8 q15, #240
+ beq 2f
+ // is_id
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ mov r10, #GRAIN_WIDTH // grain_lut stride
+
+ // Skip the first columns and rows of the grain table: 9 columns and 9
+ // rows without subsampling, 6 with subsampling in that direction.
+ add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+
+ // Build the four grain pointers. r5 is the common base and is therefore
+ // updated last.
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ // The flags from this tst stay live until the beq below; the ldr and
+ // add in between do not set flags.
+ tst lr, #1
+ // Fetch the table entry for this type (offset of the loop relative to
+ // the table).
+ ldr lr, [r12, lr, lsl #2]
+
+ // r12 = address of the selected loop
+ add r12, r12, lr
+
+ beq 1f
+ // y overlap
+ // Run 2 >> sy overlapped rows first; lr keeps the rows left after that.
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy)
+
+1:
+
+ // Vertical blend weights for the first overlapped row: d6 is applied to
+ // the top grain and d7 to the current grain by the loops.
+.if \sy
+ vmov.i8 d6, #23
+ vmov.i8 d7, #22
+.else
+ vmov.i8 d6, #27
+ vmov.i8 d7, #17
+.endif
+
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+
+ bx r12
+endfunc
+.endm
+
+// Instantiate the setup function for each layout: (layout, sx, sy).
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+// Row loops for the fguv setup function assembled with sx=0 (no horizontal
+// subsampling, 32 pixels per row). This is not called in the usual sense:
+// the setup code branches to one of the eight loops through
+// L(fguv_loop_sx0_tbl) with its registers pushed, and the shared epilogue at
+// label 9 pops them and returns to the original caller.
+//
+// Each table entry is the offset of a loop relative to the table itself
+// (plus CONFIG_THUMB). The entries are ordered by csfl, then ox, then oy.
+//
+// Registers on entry are as prepared by the fguv macro: r0 = dst, r1 = src,
+// r2 = stride, r3 = scaling, r4 = grain old, r5 = grain, r8 = grain top,
+// r11 = grain top old, r6 = luma, r7 = luma stride, r9 = rows,
+// r10 = grain stride, lr = rows remaining after the overlap (oy only),
+// d4 = uv_luma_mult / uv_mult / uv_offset in lanes 0 / 1 / 2,
+// d6/d7 = top/current weights, q12 = overlap coefficients,
+// q13 = -scaling_shift, q14/q15 = clamp limits.
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+// Emits one loop variant. With csfl set, the luma pixel is used directly as
+// the index into the scaling table; otherwise the index is computed from
+// luma and src using the values in d4. ox and oy enable blending with the
+// block on the left and the block above.
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+.if \oy
+ // lr is overwritten by the bl below, so keep the remaining row count
+ // in r12.
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8, q9}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ // x overlap on the first 8 grain values: old * d24 + current * d25
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ // Apply the same x overlap to the top row before blending vertically.
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ // y overlap on all 32 values: current * d7 + top * d6, followed by a
+ // saturating rounding shift right by 5.
+ vmull.s8 q4, d20, d7
+ vmull.s8 q5, d21, d7
+ vmull.s8 q6, d22, d7
+ vmull.s8 q7, d23, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vmlal.s8 q6, d18, d6
+ vmlal.s8 q7, d19, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q6, #5
+ vqrshrn.s16 d23, q7, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if !\csfl
+ // Scaling index =
+ // sat_u8(((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset).
+ // src is loaded without post-increment here; it is loaded again, with
+ // post-increment, after the gather. q8/q9 can be reused because the top
+ // grain rows have already been consumed by the blend above.
+ vld1.8 {q8, q9}, [r1, :128] // src
+ vmovl.u8 q4, d0
+ vmovl.u8 q5, d1
+ vmovl.u8 q6, d2
+ vmovl.u8 q7, d3
+ vmovl.u8 q0, d16
+ vmovl.u8 q1, d17
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmul.i16 q4, q4, d4[0]
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q6, q6, d4[0]
+ vmul.i16 q7, q7, d4[0]
+ vmul.i16 q0, q0, d4[1]
+ vmul.i16 q1, q1, d4[1]
+ vmul.i16 q8, q8, d4[1]
+ vmul.i16 q9, q9, d4[1]
+ vqadd.s16 q4, q4, q0
+ vqadd.s16 q5, q5, q1
+ vqadd.s16 q6, q6, q8
+ vqadd.s16 q7, q7, q9
+ vdup.16 q0, d4[2]
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vshr.s16 q6, q6, #6
+ vshr.s16 q7, q7, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vadd.i16 q6, q6, q0
+ vadd.i16 q7, q7, q0
+ vqmovun.s16 d0, q4
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6
+ vqmovun.s16 d3, q7
+.endif
+
+ // d8-d11 = bytes at [r3 + index] for the 32 indices in d0-d3
+ bl gather32_neon
+
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ // q13 holds a negative count, so this is a rounding right shift.
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ // Saturate to unsigned 8 bit.
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ // Clamp to the limits chosen by the setup code.
+ vmax.u8 q0, q0, q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ // The flags from this subs are consumed by the bgt after the store;
+ // the NEON instructions in between leave them alone.
+ subs r9, r9, #1
+.if \oy
+ // Switch to the weights for the second overlapped row.
+ vdup.8 d6, d25[0]
+ vdup.8 d7, d25[1]
+.endif
+
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ // The overlapped rows are done. If more rows remain, continue in the
+ // variant without y overlap but with the same csfl and ox.
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+// Shared epilogue, matching the prologue of the fguv setup function.
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// Row loops for the fguv setup functions assembled with sx=1 (horizontal
+// subsampling). Every iteration reads 32 luma pixels and produces 16 output
+// pixels. This is not called in the usual sense: the setup code branches to
+// one of the eight loops through L(fguv_loop_sx1_tbl) with its registers
+// pushed, and the shared epilogue at label 9 pops them and returns to the
+// original caller.
+//
+// Each table entry is the offset of a loop relative to the table itself
+// (plus CONFIG_THUMB). The entries are ordered by csfl, then ox, then oy.
+//
+// Registers on entry are as prepared by the fguv macro: r0 = dst, r1 = src,
+// r2 = stride, r3 = scaling, r4 = grain old, r5 = grain, r8 = grain top,
+// r11 = grain top old, r6 = luma, r7 = luma stride, r9 = rows,
+// r10 = grain stride, lr = rows remaining after the overlap (oy only),
+// d4 = uv_luma_mult / uv_mult / uv_offset in lanes 0 / 1 / 2,
+// d6/d7 = top/current weights, q12 = overlap coefficients,
+// q13 = -scaling_shift, q14/q15 = clamp limits.
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+// Emits one loop variant. With csfl set, the averaged luma is used directly
+// as the index into the scaling table; otherwise the index is computed from
+// the averaged luma and src using the values in d4. ox and oy enable
+// blending with the block on the left and the block above.
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ // lr is overwritten by the bl below, so keep the remaining row count
+ // in r12.
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10}, [r5], r10 // grain_lut
+ vld1.8 {q11}, [r1, :128], r2 // src
+
+.if \ox
+ // x overlap on the first 8 grain values: old * d24 + current * d25
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+ // Sum horizontally adjacent luma pixels into 16 bit lanes; the rounding
+ // shift by 1 further down turns the sums into averages.
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+.if \oy
+.if \ox
+ // Apply the same x overlap to the top row before blending vertically.
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ // y overlap on all 16 values: current * d7 + top * d6, followed by a
+ // saturating rounding shift right by 5.
+ vmull.s8 q4, d20, d7
+ vmull.s8 q5, d21, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if \csfl
+ // The averaged luma is the scaling index.
+ vrshrn.u16 d0, q0, #1
+ vrshrn.u16 d1, q1, #1
+.else
+ // Scaling index =
+ // sat_u8(((avg_luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset).
+ // The src pixels stay in q11 for the final addition.
+ vrshr.u16 q4, q0, #1
+ vrshr.u16 q5, q1, #1
+ vmovl.u8 q0, d22
+ vmovl.u8 q1, d23
+ vmul.i16 q4, q4, d4[0]
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q0, q0, d4[1]
+ vmul.i16 q1, q1, d4[1]
+ vqadd.s16 q4, q4, q0
+ vqadd.s16 q5, q5, q1
+ vdup.16 q0, d4[2]
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vqmovun.s16 d0, q4
+ vqmovun.s16 d1, q5
+.endif
+
+ // d8-d9 = bytes at [r3 + index] for the 16 indices in d0-d1
+ bl gather16_neon
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+
+ // q13 holds a negative count, so this is a rounding right shift.
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+
+ vaddw.u8 q8, q8, d22 // *src + noise
+ vaddw.u8 q9, q9, d23
+
+ // Saturate to unsigned 8 bit.
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+
+ // Clamp to the limits chosen by the setup code.
+ vmax.u8 q0, q0, q14
+ vmin.u8 q0, q0, q15
+
+ // The flags from this subs are consumed by the bgt after the store;
+ // the NEON instructions in between leave them alone.
+ subs r9, r9, #1
+.if \oy
+ // Exchange the top and current weights for the next overlapped row.
+ vswp d6, d7
+.endif
+ vst1.8 {q0}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ // The overlapped rows are done. If more rows remain, continue in the
+ // variant without y overlap but with the same csfl and ox.
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+// Shared epilogue, matching the prologue of the fguv setup function.
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc