diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 01:47:29 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 01:47:29 +0000 |
commit | 0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d (patch) | |
tree | a31f07c9bcca9d56ce61e9a1ffd30ef350d513aa /third_party/dav1d/src/arm/64/filmgrain16.S | |
parent | Initial commit. (diff) | |
download | firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.tar.xz firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.zip |
Adding upstream version 115.8.0esr (tag: upstream/115.8.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/arm/64/filmgrain16.S')
-rw-r--r-- | third_party/dav1d/src/arm/64/filmgrain16.S | 1997 |
1 file changed, 1997 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S new file mode 100644 index 0000000000..7c4ff6dda9 --- /dev/null +++ b/third_party/dav1d/src/arm/64/filmgrain16.S @@ -0,0 +1,1997 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b,\r3\().16b}, [x0], #32 + st1 {\r4\().16b}, [x0], #16 + st1 {\r5\().h}[0], [x0], #2 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] 
+ ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +function get_grain_4_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #2 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.h[7], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub x12, x0, 
#1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + + ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid + ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right + + smull v4.4s, v17.4h, v28.4h + smlal v4.4s, v0.4h, v27.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v17.8h, v28.8h + smlal2 v5.4s, v0.8h, v27.8h + smlal2 v5.4s, v1.8h, v29.8h + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff + bl sum_\lag\()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH*2 + ld1 {v22.8h, v23.8h}, [x19], #32 + ld1 {v24.8h, v25.8h}, [x12] + addp v22.8h, v22.8h, v23.8h + addp v23.8h, v24.8h, v25.8h + add v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.8h, v23.8h}, [x19], #32 + addp v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.8h}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.8b, \uv_coeff + sxtl v1.8h, v1.8b + smlal v4.4s, v0.4h, v1.4h + smlal2 v5.4s, v0.8h, v1.8h +.else + smlal v4.4s, v0.4h, v30.4h + smlal2 v5.4s, v0.8h, v30.8h +.endif +.endif +.if \uv_layout && \elems == 8 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.h[5] +.endif +.ifnc \lag, lag1 + smov w16, v0.h[6] +.endif + smov w14, v0.h[7] + + mov v1.16b, v4.16b + mov w15, #1 + bl 
output_\lag\()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b +.ifc \edge, right + mov w15, #3 + bl output_\lag\()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #2 +.else + mov w15, #4 + bl output_\lag\()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #14 +.endif + st1 {v0.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag1_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+.ifc \edge, left + sub x12, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + ld1 {v21.8h}, [x13] + + dup v26.8b, v30.b[0] + ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid + dup v27.8b, v30.b[1] + ext v23.16b, v16.16b, v17.16b, #14 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[3] + ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.8b, v30.b[4] + ext v1.16b, v17.16b, v18.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smull v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[5] + ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid + dup v27.16b, v30.b[6] + ext v23.16b, v19.16b, v20.16b, #14 + sxtl v26.8h, v26.8b + dup v28.16b, v30.b[8] + ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.16b, v30.b[9] + ext v1.16b, v20.16b, v21.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smlal v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smlal2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, 
v30.b[2] + dup v27.16b, v30.b[7] + sxtl v26.8h, v26.8b + sxtl v27.8h, v27.8b + + smlal v4.4s, v17.4h, v26.4h + smlal v4.4s, v20.4h, v27.4h + smlal2 v5.4s, v17.8h, v26.8h + smlal2 v5.4s, v20.8h, v27.8h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag2_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH*2 - 16 + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v15.8h}, [x11] // load top right + ld1 {v18.8h}, [x12] + ld1 {v21.8h}, [x13] + + dup v22.8b, v29.b[0] + ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid + dup v23.8b, v29.b[1] + ext v9.16b, v13.16b, v14.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[2] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[3] + ext v10.16b, v13.16b, v14.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[4] + ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[5] + ext v12.16b, v14.16b, v15.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[6] + ext v13.16b, v14.16b, v15.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smull v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h 
+ smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v14.4h, v25.4h + smull2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v14.8h, v25.8h + + dup v22.8b, v29.b[7] + ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid + dup v23.8b, v29.b[8] + ext v9.16b, v16.16b, v17.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[9] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[10] + ext v10.16b, v16.16b, v17.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[11] + ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[12] + ext v12.16b, v17.16b, v18.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[13] + ext v13.16b, v17.16b, v18.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v17.4h, v25.4h + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v17.8h, v25.8h + + dup v22.8b, v29.b[14] + ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid + dup v23.8b, v29.b[15] + ext v9.16b, v19.16b, v20.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v30.b[0] + sxtl v23.8h, v23.8b + dup v25.8b, v30.b[1] + ext v10.16b, v19.16b, v20.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v30.b[2] + ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v30.b[3] + ext v12.16b, v20.16b, v21.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[4] + ext v13.16b, v20.16b, v21.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, 
v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v20.4h, v25.4h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v20.8h, v25.8h + + mov v13.16b, v14.16b + mov v14.16b, v15.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag3_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH*2 + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v14.8h}, [x11] // load the previous block right above + ld1 {v17.8h}, [x12] + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #80 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_2 v0 + subs w1, w1, #1 + st1 {v0.s}[0], [x0], #4 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+1: + mov w16, #40 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_4 v0 + subs w1, w1, #1 + st1 {v0.4h}, [x0] + add x0, x0, #GRAIN_WIDTH*2-80 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_444_lag0_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v4.8h}, [x19], #16 +gen_grain_uv_lag0_8_start: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h +gen_grain_uv_lag0_8_add: + and v4.16b, v4.16b, v1.16b + smull v2.4s, v4.4h, v27.4h + smull2 v3.4s, v4.8h, v27.8h + srshl v2.4s, v2.4s, v28.4s + srshl v3.4s, v3.4s, v28.4s + sqxtn v2.4h, v2.4s + sqxtn2 v2.8h, v3.4s + sqadd v2.8h, v2.8h, v0.8h + smin v2.8h, v2.8h, v25.8h + smax v2.8h, v2.8h, v26.8h + st1 {v2.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_420_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + add x12, x19, #GRAIN_WIDTH*2 + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + ld1 {v18.8h, v19.8h}, [x12] + addp v16.8h, v16.8h, v17.8h + addp v17.8h, v18.8h, v19.8h + add v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + addp v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add x12, x19, #GRAIN_WIDTH*2 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.4h, v17.4h}, [x19] + ld1 {v18.4h, v19.4h}, [x12] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + addp v17.4h, v18.4h, v19.4h + add v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #2 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+ ld1 {v16.4h, v17.4h}, [x19] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #1 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! + +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 +.else + clz w15, w2 +.endif + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #2 + neg 
v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + mov v1.16b, v29.16b + bl gen_grain_uv_444_lag0_neon // 8 + movi v1.16b, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + mov v1.16b, v30.16b + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 + add x19, x19, #4 + st1 {v16.s}[0], [x0], #4 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.8b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b +.ifc \type, uv_444 + sxtl v30.8h, v30.8b +.endif + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_mid_neon // 48 + bl sum_\type\()_lag1_mid_neon // 56 + bl sum_\type\()_lag1_mid_neon // 64 + bl sum_\type\()_lag1_mid_neon // 72 + bl sum_\type\()_lag1_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] 
+ + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_mid_neon // 48 + bl sum_\type\()_lag2_mid_neon // 56 + bl sum_\type\()_lag2_mid_neon // 64 + bl sum_\type\()_lag2_mid_neon // 72 + bl sum_\type\()_lag2_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_mid_neon // 48 + bl sum_\type\()_lag3_mid_neon // 56 + bl sum_\type\()_lag3_mid_neon // 64 + bl sum_\type\()_lag3_mid_neon // 72 + bl sum_\type\()_lag3_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) 
- L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! + + mov w13, w3 + mov w14, #28 + add x19, x1, #(3*GRAIN_WIDTH-3)*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 + + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, 
v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #14 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + mov v1.16b, v29.16b + bl gen_grain_\type\()_lag0_8_neon // 8 + movi v1.16b, #255 + bl gen_grain_\type\()_lag0_8_neon // 16 + bl gen_grain_\type\()_lag0_8_neon // 24 + bl gen_grain_\type\()_lag0_8_neon // 32 + bl gen_grain_\type\()_lag0_8_neon // 40 + mov v1.16b, v30.16b + bl gen_grain_\type\()_lag0_4_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.8b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + sxtl v30.8h, v30.8b + set_height w1, \type +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add 
x0,  x0,  #GRAIN_WIDTH*2-6*16           // (operands of the preceding add) step to the next grain row
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, [x4]               // ar_coeffs_uv[0-15]
        ldr             q30, [x4, #16]          // ar_coeffs_uv[16-24]
        // v8-v15 are callee-saved (low 64 bits, AAPCS64); the lag3
        // helpers below use them, so spill before calling.
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        // Keep three of the AR coefficients in GPRs for the helpers.
        smov            w4,  v30.b[5]
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1, \type
1:
        // One iteration emits one 44-wide grain row; the comments give
        // the running x position after each helper call.
        bl              sum_\type\()_lag3_left_neon   // 8
        bl              sum_\type\()_lag3_mid_neon    // 16
        bl              sum_\type\()_lag3_mid_neon    // 24
        bl              sum_\type\()_lag3_mid_neon    // 32
        bl              sum_\type\()_lag3_mid_neon    // 40
        bl              sum_\type\()_lag3_right_neon  // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16 // step to the next grain row
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

        // Relative-offset jump table, indexed by the AR lag (0-3).
L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

// Scaling-LUT gather: each 16-bit pixel lane of \src1/\src2 is used as a
// byte index into the scaling table pointed to by x3, and the looked-up
// byte is inserted into lane N+\off of \dst1/\dst2. Even lanes are taken
// from \src1 (into \dst1), odd lanes from \src2 (into \dst2); address
// computation and single-lane loads are interleaved to hide load latency.
// Clobbers w14-w17.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0]
        umov            w15, \src2[1]
        umov            w16, \src1[2]
        add             x14, x14, x3            // &scaling[pixel]
        umov            w17, \src2[3]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4]
        add             x16, x16, x3
        ld1             {\dst2}[1+\off], [x15]
        umov            w15, \src2[5]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6]
        add             x14, x14, x3
        ld1             {\dst2}[3+\off], [x17]
        umov            w17, \src2[7]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[5+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[7+\off], [x17]
.endm

// Gather scaling[] bytes for 32 pixels held in four 8h vectors
// (\src1-\src4) into the two byte-vector destinations \dst1/\dst2.
.macro gather dst1, dst2, src1, src2, src3, src4
        gather_interleaved \dst1, \dst2, \src1, \src3, 0
        gather_interleaved \dst2, \dst1, \src3, \src1, 0
        gather_interleaved \dst1, \dst2, \src2, \src4, 8
        gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm

// Look up scaling[] for the 32 pixels in v0-v3; result bytes in v6-v7.
function gather32_neon
        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
        ret
endfunc

// Look up scaling[] for the 16 pixels in v0-v1; result bytes in v6.
function gather16_neon
        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
        ins             v6.d[1], v7.d[0]        // merge the two 8-byte halves
        ret
endfunc

// Overlap blend weights (old-grain row first, new-grain row second),
// selected by the subsampling of the plane being blended.
const overlap_coeffs_0, align=4
        .short 27, 17, 0,  0
        .short 17, 27, 32, 32
endconst

const overlap_coeffs_1, align=4
        .short 23, 0,  0,  0
        .short 22, 32, 32, 32
endconst

// Split a packed random value into x/y grain_lut offsets; with no
// subsampling (\sx/\sy == 0) the offsets are doubled.
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF      // randval & 0xF
        lsr             \offx, \src,  #4        // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy     // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx     // 2 * (randval >> 4)
.endif
.endm

// \dst = \src + \stride * \offy + 2 * \offx (entries are 2 bytes wide).
.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
.endm

// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-80]!
        stp             d8,  d9,  [sp, #16]     // v8-v14 (low halves) are callee-saved
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        str             d14, [sp, #64]
        eor             w4,  w4,  #15           // 15 - scaling_shift
        ldr             w11, [x6, #8]           // offsets[1][0]
        ldr             w13, [x6, #4]           // offsets[0][1]
        ldr             w15, [x6, #12]          // offsets[1][1]
        ldr             w10, [sp, #96]          // bitdepth_max
        ldr             w6,  [x6]               // offsets[0][0]
        dup             v26.8h,  w10            // bitdepth_max
        clz             w10, w10
        ldr             w8,  [sp, #80]          // clip
        sub             w10, w10, #24           // -bitdepth_min_8
        mov             x9,  #GRAIN_WIDTH*2     // grain_lut stride
        neg             w10, w10                // bitdepth_min_8

        dup             v29.8h,  w4             // 15 - scaling_shift
        dup             v27.8h,  w10            // bitdepth_min_8

        movrel          x16, overlap_coeffs_0

        cbz             w8,  1f
        // clip to video range, scaled by bitdepth_min_8
        movi            v30.8h,  #16
        movi            v31.8h,  #235
        sshl            v30.8h,  v30.8h,  v27.8h
        sshl            v31.8h,  v31.8h,  v27.8h
        b               2f
1:
        // no clip: full range [0, bitdepth_max]
        movi            v30.8h,  #0
        mov             v31.16b, v26.16b        // bitdepth_max
2:

        ushr            v26.8h,  v26.8h,  #1    // grain_max
        not             v25.16b, v26.16b        // grain_min

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs

        // Position grain_lut at the (9, 9) origin of the luma grain image.
        add             x5,  x5,  #18           // grain_lut += 9
        add             x5,  x5,  x9,  lsl #3   // grain_lut += 8 * grain_stride
        add             x5,  x5,  x9            // grain_lut += grain_stride

        // Turn the four random offsets into grain_lut pointers:
        // x5 = cur, x12/x14/x16 = the three neighbouring blocks.
        calc_offset     w11, w12, w11, 0, 0
        calc_offset     w13, w14, w13, 0, 0
        calc_offset     w15, w16, w15, 0, 0
        calc_offset     w6,  w10, w6,  0, 0

        add_offset      x12, w11, x12, x5, x9
        add_offset      x14, w13, x14, x5, x9
        add_offset      x16, w15, x16, x5, x9
        add_offset      x5,  w6,  x10, x5, x9

        ldr             w11, [sp, #88]          // type
        adr             x13, L(fgy_loop_tbl)

        add             x4,  x12, #32*2         // grain_lut += BLOCK_SIZE * bx
        add             x6,  x14, x9,  lsl #5   // grain_lut += grain_stride * BLOCK_SIZE * by

        tst             w11, #1                 // type bit 0 selects y overlap
        ldrh            w11, [x13, w11, uxtw #1]

        add             x8,  x16, x9,  lsl #5   // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x8,  x8,  #32*2         // grain_lut += BLOCK_SIZE * bx

        sub             x11, x13, w11, uxtw     // resolve jump-table entry

        b.eq            1f
        // y overlap: run the first 2 rows with the row-0 blend weights.
        dup             v8.8h,   v27.h[0]
        dup             v9.8h,   v27.h[1]
        mov             w10, w7                 // backup actual h
        mov             w7,  #2
1:
        br              x11
endfunc

function fgy_loop_neon
// One specialization of the luma inner loop; \ox/\oy select whether
// horizontal/vertical block-overlap blending is compiled in.
.macro fgy ox, oy
L(loop_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},     [x1], x2 // src
.if \ox
        ld1             {v20.4h},                         [x4], x9 // grain_lut old
.endif
.if \oy
        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v14.4h},                         [x8], x9 // grain_lut top old
.endif
        mvni            v4.8h,   #0xf0, lsl #8  // 0x0fff
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b,  v0.16b,  v4.16b
        and             v1.16b,  v1.16b,  v4.16b
        and             v2.16b,  v2.16b,  v4.16b
        and             v3.16b,  v3.16b,  v4.16b
        bl              gather32_neon

.if \ox
        // Horizontal overlap: blend old/new left-edge grain with the
        // v27/v28 weights, round with #5, then clamp to grain_min/max.
        smull           v20.4s,  v20.4h,  v27.4h
        smlal           v20.4s,  v16.4h,  v28.4h
.endif

.if \oy
.if \ox
        smull           v14.4s,  v14.4h,  v27.4h
        smlal           v14.4s,  v21.4h,  v28.4h
        sqrshrn         v20.4h,  v20.4s,  #5
        sqrshrn         v14.4h,  v14.4s,  #5
        smin            v20.4h,  v20.4h,  v26.4h
        smin            v14.4h,  v14.4h,  v26.4h
        smax            v20.4h,  v20.4h,  v25.4h
        smax            v14.4h,  v14.4h,  v25.4h
.endif

        // Vertical overlap: cur * v9 + top * v8, then narrow and clamp.
.if \ox
        smull           v10.4s,  v20.4h,  v9.4h
.else
        smull           v10.4s,  v16.4h,  v9.4h
.endif
        smull2          v11.4s,  v16.8h,  v9.8h
        smull           v12.4s,  v17.4h,  v9.4h
        smull2          v13.4s,  v17.8h,  v9.8h
        smull           v16.4s,  v18.4h,  v9.4h
        smull2          v17.4s,  v18.8h,  v9.8h
        smull           v18.4s,  v19.4h,  v9.4h
        smull2          v19.4s,  v19.8h,  v9.8h
.if \ox
        smlal           v10.4s,  v14.4h,  v8.4h
.else
        smlal           v10.4s,  v21.4h,  v8.4h
.endif
        smlal2          v11.4s,  v21.8h,  v8.8h
        smlal           v12.4s,  v22.4h,  v8.4h
        smlal2          v13.4s,  v22.8h,  v8.8h
        smlal           v16.4s,  v23.4h,  v8.4h
        smlal2          v17.4s,  v23.8h,  v8.8h
        smlal           v18.4s,  v24.4h,  v8.4h
        smlal2          v19.4s,  v24.8h,  v8.8h
        sqrshrn         v10.4h,  v10.4s,  #5
        sqrshrn2        v10.8h,  v11.4s,  #5
        sqrshrn         v11.4h,  v12.4s,  #5
        sqrshrn2        v11.8h,  v13.4s,  #5
        sqrshrn         v12.4h,  v16.4s,  #5
        sqrshrn2        v12.8h,  v17.4s,  #5
        sqrshrn         v13.4h,  v18.4s,  #5
        sqrshrn2        v13.8h,  v19.4s,  #5
        smin            v16.8h,  v10.8h,  v26.8h
        smin            v17.8h,  v11.8h,  v26.8h
        smin            v18.8h,  v12.8h,  v26.8h
        smin            v19.8h,  v13.8h,  v26.8h
        smax            v16.8h,  v16.8h,  v25.8h
        smax            v17.8h,  v17.8h,  v25.8h
        smax            v18.8h,  v18.8h,  v25.8h
        smax            v19.8h,  v19.8h,  v25.8h
.endif

        // Widen the gathered scaling bytes; in the \ox-only case the
        // pending left-edge blend is finished in the gaps between uxtls.
        uxtl            v4.8h,   v6.8b          // scaling
.if \ox && !\oy
        sqrshrn         v20.4h,  v20.4s,  #5
.endif
        uxtl2           v5.8h,   v6.16b
.if \ox && !\oy
        smin            v20.4h,  v20.4h,  v26.4h
.endif
        uxtl            v6.8h,   v7.8b
.if \ox && !\oy
        smax            v20.4h,  v20.4h,  v25.4h
.endif
        uxtl2           v7.8h,   v7.16b
.if \ox && !\oy
        ins             v16.d[0], v20.d[0]
.endif
        ushl            v4.8h,   v4.8h,   v29.8h // scaling << (15 - scaling_shift)
        ushl            v5.8h,   v5.8h,   v29.8h
        ushl            v6.8h,   v6.8h,   v29.8h
        ushl            v7.8h,   v7.8h,   v29.8h

        sqrdmulh        v20.8h,  v16.8h,  v4.8h  // round2(scaling << (15 - scaling_shift) * grain, 15)
        sqrdmulh        v21.8h,  v17.8h,  v5.8h
        sqrdmulh        v22.8h,  v18.8h,  v6.8h
        sqrdmulh        v23.8h,  v19.8h,  v7.8h

        usqadd          v0.8h,   v20.8h          // *src + noise (saturating signed add)
        usqadd          v1.8h,   v21.8h
        usqadd          v2.8h,   v22.8h
        usqadd          v3.8h,   v23.8h

        // Clamp to the [v30, v31] output range selected at setup.
        umax            v0.8h,   v0.8h,   v30.8h
        umax            v1.8h,   v1.8h,   v30.8h
        umax            v2.8h,   v2.8h,   v30.8h
        umax            v3.8h,   v3.8h,   v30.8h
        umin            v0.8h,   v0.8h,   v31.8h
        umin            v1.8h,   v1.8h,   v31.8h
        umin            v2.8h,   v2.8h,   v31.8h
        umin            v3.8h,   v3.8h,   v31.8h

        subs            w7,  w7,  #1
.if \oy
        // After the first pass, switch to the second-row y-overlap weights.
        dup             v8.8h,   v28.h[0]
        dup             v9.8h,   v28.h[1]
.endif
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // The 2 overlap rows are done; continue with the non-overlap loop
        // for the remaining rows.
        cmp             w10, #2
        sub             w7,  w10, #2            // restore actual remaining h
        b.gt            L(loop_\ox\()0)
.endif
        ldr             d14, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1

        // Relative-offset jump table indexed by the overlap type bits.
L(fgy_loop_tbl):
        .hword L(fgy_loop_tbl) - L(loop_00)
        .hword L(fgy_loop_tbl) - L(loop_01)
        .hword L(fgy_loop_tbl) - L(loop_10)
        .hword L(fgy_loop_tbl) - L(loop_11)
endfunc

// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
//                                      const pixel *const src,
//                                      const ptrdiff_t stride,
//                                      const uint8_t scaling[SCALING_SIZE],
//                                      const Dav1dFilmGrainData *const data,
//                                      const entry grain_lut[][GRAIN_WIDTH],
//                                      const pixel *const luma_row,
//                                      const ptrdiff_t luma_stride,
//
//                                      const int offsets[][2],
//                                      const ptrdiff_t h, const ptrdiff_t uv,
//                                      const ptrdiff_t is_id,
//                                      const ptrdiff_t type,
//                                      const int bitdepth_max);
// Chroma entry point; \sx/\sy are the horizontal/vertical chroma
// subsampling flags of the \layout (420/422/444) variant.
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-80]!
        stp             d8,  d9,  [sp, #16]     // v8-v15 (low halves) are callee-saved
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]

        ldp             x8,  x9,  [sp, #80]     // offsets, h
        ldp             x10, x11, [sp, #96]     // uv, is_id
        ldr             w16, [sp, #120]         // bitdepth_max

        ldr             w13, [x4, #FGD_SCALING_SHIFT]
        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        dup             v23.8h,  w16            // bitdepth_max
        clz             w16, w16
        eor             w13, w13, #15           // 15 - scaling_shift
        sub             w16, w16, #24           // -bitdepth_min_8

        // !csfl: load the per-plane luma->chroma mapping parameters.
        add             x10, x4,  x10, lsl #2   // + 4*uv
        add             x14, x10, #FGD_UV_LUMA_MULT
        add             x15, x10, #FGD_UV_MULT
        add             x10, x10, #FGD_UV_OFFSET
        neg             w16, w16                // bitdepth_min_8
        ld1r            {v8.8h},  [x14]         // uv_luma_mult
        ld1r            {v24.8h}, [x10]         // uv_offset
        ld1r            {v9.8h},  [x15]         // uv_mult

        dup             v29.8h,  w13            // 15 - scaling_shift
        dup             v27.8h,  w16            // bitdepth_min_8

        cbz             w12, 1f
        // clip to restricted (video) range, scaled by bitdepth_min_8
        movi            v30.8h,  #16
        movi            v31.8h,  #240
        sshl            v30.8h,  v30.8h,  v27.8h
        sshl            v31.8h,  v31.8h,  v27.8h
        cbz             w11, 2f
        // is_id: identity matrix coefficients, tighter upper bound
        movi            v31.8h,  #235
        sshl            v31.8h,  v31.8h,  v27.8h
        b               2f
1:
        // no clip: full range [0, bitdepth_max]
        movi            v30.8h,  #0
        mov             v31.16b, v23.16b        // bitdepth_max
2:

        ushr            v15.8h,  v23.8h,  #1    // grain_max
        sshl            v24.8h,  v24.8h,  v27.8h // uv_offset << bitdepth_min_8
        not             v14.16b, v15.16b        // grain_min

        ldr             w12, [x8, #8]           // offsets[1][0]
        ldr             w14, [x8, #4]           // offsets[0][1]
        ldr             w16, [x8, #12]          // offsets[1][1]
        ldr             w8,  [x8]               // offsets[0][0]

        mov             x10, #GRAIN_WIDTH*2     // grain_lut stride

        // Position grain_lut at the chroma grain origin; the offsets
        // shrink with the plane's subsampling.
        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             x5,  x5,  x10, lsl #2   // grain_lut += 4 * grain_stride
        add             x5,  x5,  x10, lsl #1   // grain_lut += 2 * grain_stride
.else
        add             x5,  x5,  x10, lsl #3   // grain_lut += 8 * grain_stride
        add             x5,  x5,  x10           // grain_lut += grain_stride
.endif

        // Turn the four random offsets into grain_lut pointers:
        // x5 = cur, x13/x15/x17 = the three neighbouring blocks.
        calc_offset     w12, w13, w12, \sx, \sy
        calc_offset     w14, w15, w14, \sx, \sy
        calc_offset     w16, w17, w16, \sx, \sy
        calc_offset     w8,  w11, w8,  \sx, \sy

        add_offset      x13, w12, x13, x5, x10
        add_offset      x15, w14, x15, x5, x10
        add_offset      x17, w16, x17, x5, x10
        add_offset      x5,  w8,  x11, x5, x10

        add             x4,  x13, #2*(32 >> \sx)      // grain_lut += BLOCK_SIZE * bx
        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x11, x11, #2*(32 >> \sx)      // grain_lut += BLOCK_SIZE * bx

        ldr             w13, [sp, #112]         // type

        movrel          x16, overlap_coeffs_\sx
        adr             x14, L(fguv_loop_sx\sx\()_tbl)

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
        tst             w13, #1                 // type bit 0 selects y overlap
        ldrh            w13, [x14, w13, uxtw #1]

        b.eq            1f
        // y overlap: run the first 1-2 rows with the row-0 blend weights.
        sub             w12, w9,  #(2 >> \sy)   // backup remaining h
        mov             w9,  #(2 >> \sy)

1:
        sub             x13, x14, w13, uxtw     // resolve jump-table entry

        // First-row vertical overlap weights (broadcast as immediates).
.if \sy
        movi            v25.8h,  #23
        movi            v26.8h,  #22
.else
        movi            v25.8h,  #27
        movi            v26.8h,  #17
.endif

.if \sy
        add             x7,  x7,  x7            // luma_stride *= 2
.endif

        br              x13
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0

function fguv_loop_sx0_neon
// Chroma inner loop without horizontal subsampling (444); \csfl selects
// chroma-scaled-from-luma, \ox/\oy select the overlap-blend variants.
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
.if \ox
        ld1             {v4.4h},  [x4],  x10    // grain_lut old
.endif
.if \oy
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},     [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v5.4h},  [x11], x10    // grain_lut top old
.endif
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x10 // grain_lut

.if \ox
        // Horizontal overlap: blend old/new left-edge grain (v27/v28
        // weights), round by #5 and clamp to grain_min/max below.
        smull           v4.4s,   v4.4h,   v27.4h
        smlal           v4.4s,   v16.4h,  v28.4h
.endif

.if \oy
.if \ox
        smull           v5.4s,   v5.4h,   v27.4h
        smlal           v5.4s,   v0.4h,   v28.4h
        sqrshrn         v4.4h,   v4.4s,   #5
        sqrshrn         v5.4h,   v5.4s,   #5
        smin            v4.4h,   v4.4h,   v15.4h
        smin            v5.4h,   v5.4h,   v15.4h
        smax            v4.4h,   v4.4h,   v14.4h
        smax            v5.4h,   v5.4h,   v14.4h
        ins             v16.d[0], v4.d[0]
        ins             v0.d[0],  v5.d[0]
.endif

        // Vertical overlap: cur * v26 + top * v25, narrow and clamp.
        smull           v6.4s,   v16.4h,  v26.4h
        smull2          v7.4s,   v16.8h,  v26.8h
        smull           v10.4s,  v17.4h,  v26.4h
        smull2          v11.4s,  v17.8h,  v26.8h
        smull           v16.4s,  v18.4h,  v26.4h
        smull2          v17.4s,  v18.8h,  v26.8h
        smull           v18.4s,  v19.4h,  v26.4h
        smull2          v19.4s,  v19.8h,  v26.8h
        smlal           v6.4s,   v0.4h,   v25.4h
        smlal2          v7.4s,   v0.8h,   v25.8h
        smlal           v10.4s,  v1.4h,   v25.4h
        smlal2          v11.4s,  v1.8h,   v25.8h
        smlal           v16.4s,  v2.4h,   v25.4h
        smlal2          v17.4s,  v2.8h,   v25.8h
        smlal           v18.4s,  v3.4h,   v25.4h
        smlal2          v19.4s,  v3.8h,   v25.8h
        sqrshrn         v6.4h,   v6.4s,   #5
        sqrshrn2        v6.8h,   v7.4s,   #5
        sqrshrn         v7.4h,   v10.4s,  #5
        sqrshrn2        v7.8h,   v11.4s,  #5
        sqrshrn         v10.4h,  v16.4s,  #5
        sqrshrn2        v10.8h,  v17.4s,  #5
        sqrshrn         v11.4h,  v18.4s,  #5
        sqrshrn2        v11.8h,  v19.4s,  #5
.endif

.if \ox && !\oy
        sqrshrn         v4.4h,   v4.4s,   #5
        smin            v4.4h,   v4.4h,   v15.4h
.endif
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h},     [x6],  x7 // luma
.if \oy
        smin            v16.8h,  v6.8h,   v15.8h
        smin            v17.8h,  v7.8h,   v15.8h
        smin            v18.8h,  v10.8h,  v15.8h
        smin            v19.8h,  v11.8h,  v15.8h
        smax            v16.8h,  v16.8h,  v14.8h
        smax            v17.8h,  v17.8h,  v14.8h
        smax            v18.8h,  v18.8h,  v14.8h
        smax            v19.8h,  v19.8h,  v14.8h
.endif

.if \ox && !\oy
        smax            v4.4h,   v4.4h,   v14.4h
.endif
        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1],  x2 // src
.if \ox && !\oy
        ins             v16.d[0], v4.d[0]
.endif

.if !\csfl
        // val = clip(luma * uv_luma_mult + src * uv_mult >> 6 + uv_offset)
        smull           v4.4s,   v0.4h,   v8.4h
        smull2          v5.4s,   v0.8h,   v8.8h
        smull           v6.4s,   v1.4h,   v8.4h
        smull2          v7.4s,   v1.8h,   v8.8h
        smull           v0.4s,   v2.4h,   v8.4h
        smull2          v1.4s,   v2.8h,   v8.8h
        smull           v2.4s,   v3.4h,   v8.4h
        smull2          v3.4s,   v3.8h,   v8.8h
        smlal           v4.4s,   v10.4h,  v9.4h
        smlal2          v5.4s,   v10.8h,  v9.8h
        smlal           v6.4s,   v11.4h,  v9.4h
        smlal2          v7.4s,   v11.8h,  v9.8h
        smlal           v0.4s,   v12.4h,  v9.4h
        smlal2          v1.4s,   v12.8h,  v9.8h
        smlal           v2.4s,   v13.4h,  v9.4h
        smlal2          v3.4s,   v13.8h,  v9.8h
        shrn            v4.4h,   v4.4s,   #6
        shrn2           v4.8h,   v5.4s,   #6
        shrn            v5.4h,   v6.4s,   #6
        shrn2           v5.8h,   v7.4s,   #6
        shrn            v6.4h,   v0.4s,   #6
        shrn2           v6.8h,   v1.4s,   #6
        shrn            v7.4h,   v2.4s,   #6
        shrn2           v7.8h,   v3.4s,   #6
        add             v0.8h,   v4.8h,   v24.8h
        add             v1.8h,   v5.8h,   v24.8h
        add             v2.8h,   v6.8h,   v24.8h
        add             v3.8h,   v7.8h,   v24.8h
        movi            v20.8h,  #0
        smin            v0.8h,   v0.8h,   v23.8h
        smin            v1.8h,   v1.8h,   v23.8h
        smin            v2.8h,   v2.8h,   v23.8h
        smin            v3.8h,   v3.8h,   v23.8h
        smax            v0.8h,   v0.8h,   v20.8h
        smax            v1.8h,   v1.8h,   v20.8h
        smax            v2.8h,   v2.8h,   v20.8h
        smax            v3.8h,   v3.8h,   v20.8h
.else
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b,  v0.16b,  v23.16b
        and             v1.16b,  v1.16b,  v23.16b
        and             v2.16b,  v2.16b,  v23.16b
        and             v3.16b,  v3.16b,  v23.16b
.endif

        bl              gather32_neon

        uxtl            v4.8h,   v6.8b          // scaling
        uxtl2           v5.8h,   v6.16b
        uxtl            v6.8h,   v7.8b
        uxtl2           v7.8h,   v7.16b

        ushl            v4.8h,   v4.8h,   v29.8h // scaling << (15 - scaling_shift)
        ushl            v5.8h,   v5.8h,   v29.8h
        ushl            v6.8h,   v6.8h,   v29.8h
        ushl            v7.8h,   v7.8h,   v29.8h

        sqrdmulh        v16.8h,  v16.8h,  v4.8h  // round2(scaling << (15 - scaling_shift) * grain, 15)
        sqrdmulh        v17.8h,  v17.8h,  v5.8h
        sqrdmulh        v18.8h,  v18.8h,  v6.8h
        sqrdmulh        v19.8h,  v19.8h,  v7.8h

        usqadd          v10.8h,  v16.8h          // *src + noise (saturating signed add)
        usqadd          v11.8h,  v17.8h
        usqadd          v12.8h,  v18.8h
        usqadd          v13.8h,  v19.8h

        // Clamp to the [v30, v31] output range selected at setup.
        umax            v0.8h,   v10.8h,  v30.8h
        umax            v1.8h,   v11.8h,  v30.8h
        umax            v2.8h,   v12.8h,  v30.8h
        umax            v3.8h,   v13.8h,  v30.8h
        umin            v0.8h,   v0.8h,   v31.8h
        umin            v1.8h,   v1.8h,   v31.8h
        umin            v2.8h,   v2.8h,   v31.8h
        umin            v3.8h,   v3.8h,   v31.8h

        subs            w9,  w9,  #1
.if \oy
        // After the first pass, switch to the second-row overlap weights.
        dup             v25.8h,  v28.h[0]
        dup             v26.8h,  v28.h[1]
.endif
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // Overlap rows done; continue with the non-overlap loop.
        cmp             w12, #0
        mov             w9,  w12                // restore actual remaining h
        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

9:
        // Shared epilogue for all specializations above.
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret

        // Relative-offset jump table indexed by csfl and the overlap bits.
L(fguv_loop_sx0_tbl):
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
endfunc

function fguv_loop_sx1_neon
// Chroma inner loop with horizontal subsampling (420/422): 16 chroma
// pixels per row, luma pairs averaged with addp+urshr before use.
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
.if \ox
        ld1             {v18.4h}, [x4],  x10    // grain_lut old
.endif
.if \oy
        ld1             {v20.8h, v21.8h}, [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v19.4h}, [x11], x10    // grain_lut top old
.endif
        ld1             {v16.8h, v17.8h}, [x5],  x10 // grain_lut

.if \ox
        // Horizontal overlap: blend old/new left-edge grain (v27/v28
        // weights), round by #5 and clamp to grain_min/max below.
        smull           v18.4s,  v18.4h,  v27.4h
        smlal           v18.4s,  v16.4h,  v28.4h
.endif

.if \oy
.if \ox
        smull           v19.4s,  v19.4h,  v27.4h
        smlal           v19.4s,  v20.4h,  v28.4h
        sqrshrn         v18.4h,  v18.4s,  #5
        sqrshrn         v19.4h,  v19.4s,  #5
        smin            v18.4h,  v18.4h,  v15.4h
        smin            v19.4h,  v19.4h,  v15.4h
        smax            v18.4h,  v18.4h,  v14.4h
        smax            v19.4h,  v19.4h,  v14.4h
        ins             v16.d[0], v18.d[0]
        ins             v20.d[0], v19.d[0]
.endif

        // Vertical overlap: cur * v26 + top * v25, narrow and clamp.
        smull           v0.4s,   v16.4h,  v26.4h
        smull2          v1.4s,   v16.8h,  v26.8h
        smull           v2.4s,   v17.4h,  v26.4h
        smull2          v3.4s,   v17.8h,  v26.8h
        smlal           v0.4s,   v20.4h,  v25.4h
        smlal2          v1.4s,   v20.8h,  v25.8h
        smlal           v2.4s,   v21.4h,  v25.4h
        smlal2          v3.4s,   v21.8h,  v25.8h
        sqrshrn         v16.4h,  v0.4s,   #5
        sqrshrn2        v16.8h,  v1.4s,   #5
        sqrshrn         v17.4h,  v2.4s,   #5
        sqrshrn2        v17.8h,  v3.4s,   #5
.endif

.if \ox && !\oy
        sqrshrn         v18.4h,  v18.4s,  #5
        smin            v18.4h,  v18.4h,  v15.4h
.endif
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6],  x7 // luma
.if \oy
        smin            v16.8h,  v16.8h,  v15.8h
        smin            v17.8h,  v17.8h,  v15.8h
        smax            v16.8h,  v16.8h,  v14.8h
        smax            v17.8h,  v17.8h,  v14.8h
.endif

.if \ox && !\oy
        smax            v18.4h,  v18.4h,  v14.4h
.endif
        ld1             {v10.8h, v11.8h}, [x1],  x2 // src
.if \ox && !\oy
        ins             v16.d[0], v18.d[0]
.endif
        // Average horizontal luma pairs down to the chroma resolution.
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v1.8h,   v2.8h,   v3.8h
        urshr           v0.8h,   v0.8h,   #1
        urshr           v1.8h,   v1.8h,   #1
.if !\csfl
        // val = clip(luma * uv_luma_mult + src * uv_mult >> 6 + uv_offset)
        smull           v2.4s,   v0.4h,   v8.4h
        smull2          v3.4s,   v0.8h,   v8.8h
        smull           v0.4s,   v1.4h,   v8.4h
        smull2          v1.4s,   v1.8h,   v8.8h
        smlal           v2.4s,   v10.4h,  v9.4h
        smlal2          v3.4s,   v10.8h,  v9.8h
        smlal           v0.4s,   v11.4h,  v9.4h
        smlal2          v1.4s,   v11.8h,  v9.8h
        shrn            v2.4h,   v2.4s,   #6
        shrn2           v2.8h,   v3.4s,   #6
        shrn            v3.4h,   v0.4s,   #6
        shrn2           v3.8h,   v1.4s,   #6
        add             v0.8h,   v2.8h,   v24.8h
        add             v1.8h,   v3.8h,   v24.8h
        movi            v2.8h,   #0
        smin            v0.8h,   v0.8h,   v23.8h
        smin            v1.8h,   v1.8h,   v23.8h
        smax            v0.8h,   v0.8h,   v2.8h
        smax            v1.8h,   v1.8h,   v2.8h
.else
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b,  v0.16b,  v23.16b
        and             v1.16b,  v1.16b,  v23.16b
.endif

        bl              gather16_neon

        uxtl            v4.8h,   v6.8b          // scaling
        uxtl2           v5.8h,   v6.16b

        ushl            v4.8h,   v4.8h,   v29.8h // scaling << (15 - scaling_shift)
        ushl            v5.8h,   v5.8h,   v29.8h

        sqrdmulh        v16.8h,  v16.8h,  v4.8h  // round2(scaling << (15 - scaling_shift) * grain, 15)
        sqrdmulh        v17.8h,  v17.8h,  v5.8h

        usqadd          v10.8h,  v16.8h          // *src + noise (saturating signed add)
        usqadd          v11.8h,  v17.8h

        // Clamp to the [v30, v31] output range selected at setup.
        umax            v0.8h,   v10.8h,  v30.8h
        umax            v1.8h,   v11.8h,  v30.8h
        umin            v0.8h,   v0.8h,   v31.8h
        umin            v1.8h,   v1.8h,   v31.8h

.if \oy
        mov             v16.16b, v25.16b        // v16 is free here; use as scratch
.endif
        subs            w9,  w9,  #1
.if \oy
        // Swap v25/v26 to move on to the next row's overlap weights.
        mov             v25.16b, v26.16b
        mov             v26.16b, v16.16b
.endif
        st1             {v0.8h, v1.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // Overlap rows done; continue with the non-overlap loop.
        cmp             w12, #0
        mov             w9,  w12                // restore actual remaining h
        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

9:
        // Shared epilogue for all specializations above.
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret

        // Relative-offset jump table indexed by csfl and the overlap bits.
L(fguv_loop_sx1_tbl):
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
endfunc