/* * Copyright © 2021, VideoLAN and dav1d authors * Copyright © 2021, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

// Advance the pseudo-random number state in w2 by \steps bits.
// With shift=0 the consumed bits are left in place so the caller can
// read them out afterwards (see read_rand/read_shift_rand).
.macro increment_seed steps, shift=1
        lsr             w11, w2,  #3
        lsr             w12, w2,  #12
        lsr             w13, w2,  #1
        eor             w11, w2,  w11      // (r >> 0) ^ (r >> 3)
        eor             w12, w12, w13      // (r >> 12) ^ (r >> 1)
        eor             w11, w11, w12      // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             w2,  w2,  #\steps
.endif
        and             w11, w11, #((1 << \steps) - 1) // bit
.if \shift
        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
.else
        orr             w2,  w2,  w11, lsl #16            // *state
.endif
.endm

// Extract \bits bits from the random state in x2, \age steps back.
.macro read_rand dest, bits, age
        ubfx            \dest, x2, #16 - \bits - \age, #\bits
.endm

// Extract \bits bits from the random state and shift the state by one.
.macro read_shift_rand dest, bits
        ubfx            \dest, x2, #17 - \bits, #\bits
        lsr             w2,  w2,  #1
.endm

// special calling convention:
// w2 holds seed
// x3 holds dav1d_gaussian_sequence
// clobbers x11-x15
// returns in v0.8h
function get_gaussian_neon
        increment_seed  4
        read_rand       x14, 11,  3
        read_rand       x15, 11,  2
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11,  1
        ld1             {v0.h}[1], [x15]
        add             x14, x3,  x14, lsl #1
        read_rand       x15, 11,  0
        increment_seed  4
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[2], [x14]
        read_rand       x14, 11,  3
        ld1             {v0.h}[3], [x15]
        add             x14, x3,  x14, lsl #1
        read_rand       x15, 11,  2
        ld1             {v0.h}[4], [x14]
        add             x15, x3,  x15, lsl #1
        read_rand       x14, 11,  1
        ld1             {v0.h}[5], [x15]
        read_rand       x15, 11,  0
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[6], [x14]
        ld1             {v0.h}[7], [x15]
        ret
endfunc

// Store one 82-entry (8-bit layout) grain row from the given registers.
.macro store_grain_row r0, r1, r2, r3, r4, r5
        st1             {\r0\().16b,\r1\().16b}, [x0], #32
        st1             {\r2\().16b,\r3\().16b}, [x0], #32
        st1             {\r4\().16b},  [x0], #16
        st1             {\r5\().h}[0], [x0], #2
.endm

// Produce 2 grain values in v0.4h; same register conventions as
// get_gaussian_neon (seed in w2, table in x3, shift in v31).
function get_grain_2_neon
        increment_seed  2
        read_rand       x14, 11,  1
        read_rand       x15, 11,  0
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[0], [x14]
        ld1             {v0.h}[1], [x15]
        srshl           v0.4h,  v0.4h,  v31.4h
        ret
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// Produce 4 grain values in v0.4h; same register conventions as above.
function get_grain_4_neon
        increment_seed  4
        read_rand       x14, 11,  3
        read_rand       x15, 11,  2
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11,  1
        ld1             {v0.h}[1], [x15]
        add             x14, x3,  x14, lsl #1
        read_rand       x15, 11,  0
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[2], [x14]
        ld1             {v0.h}[3], [x15]
        srshl           v0.4h,  v0.4h,  v31.4h
        ret
endfunc

.macro get_grain_4 dst
        bl              get_grain_4_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// w15 holds the number of entries to produce
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
.macro output_lag n
function output_lag\n\()_neon
1:
        read_shift_rand x13, 11
        mov             w11, v1.s[0]
        ldrsh           w12, [x3, x13, lsl #1]
        ext             v0.16b, v0.16b, v0.16b, #2
.if \n == 1
        madd            w11, w14, w4,  w11 // sum (above) + *coeff * prev output
.elseif \n == 2
        madd            w11, w16, w4,  w11 // sum (above) + *coeff * prev output 1
        madd            w11, w14, w17, w11 // += *coeff * prev output 2
        mov             w16, w14
.else
        madd            w11, w17, w4,  w11 // sum (above) + *coeff * prev output 1
        madd            w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
        madd            w11, w14, w21, w11 // += *coeff * prev output 3
        mov             w17, w16
        mov             w16, w14
.endif
        add             w14, w11, w8       // 1 << (ar_coeff_shift - 1)
        add             w12, w12, w10      // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             w14, w14, w7       // >> ar_coeff_shift
        asr             w12, w12, w9       // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             w14, w14, w12
        cmp             w14, w5
        csel            w14, w14, w5,  le  // clamp to grain_max
        cmp             w14, w6
        csel            w14, w14, w6,  ge  // clamp to grain_min
        subs            w15, w15, #1
        ext             v1.16b, v1.16b, v1.16b, #4
        ins             v0.h[7], w14
        b.gt            1b
        ret
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3


// Sum the lag-1 contributions from the row above into v4/v5 (.4s each);
// v16/v17 hold the top-left/top rows and are rotated on return.
function sum_lag1_above_neon
        sub             x12, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right

        ext             v0.16b,  v16.16b, v17.16b, #14 // top left, top mid
        ext             v1.16b,  v17.16b, v18.16b, #2  // top mid, top right

        smull           v4.4s,   v17.4h,  v28.4h
        smlal           v4.4s,   v0.4h,   v27.4h
        smlal           v4.4s,   v1.4h,   v29.4h
        smull2          v5.4s,   v17.8h,  v28.8h
        smlal2          v5.4s,   v0.8h,   v27.8h
        smlal2          v5.4s,   v1.8h,   v29.8h

        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        ret
endfunc

// Shared body for all sum_lag* functions: adds the (optionally subsampled)
// luma contribution for uv types, then produces \elems output entries via
// the scalar output_lag* loop. Falls through into a *_start label so the
// uv_444/uv_422 variants can reuse the y/uv_420 tails.
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
        bl              sum_\lag\()_above_neon
.ifc \type, uv_420
        add             x12, x19, #GRAIN_WIDTH*2
        ld1             {v22.8h, v23.8h}, [x19], #32
        ld1             {v24.8h, v25.8h}, [x12]
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v23.8h,  v24.8h,  v25.8h
        add             v22.8h,  v22.8h,  v23.8h
        srshr           v0.8h,   v22.8h,  #2
.endif
.ifc \type, uv_422
        ld1             {v22.8h, v23.8h}, [x19], #32
        addp            v22.8h,  v22.8h,  v23.8h
        srshr           v0.8h,   v22.8h,  #1
.endif
.ifc \type, uv_444
        ld1             {v0.8h}, [x19], #16
.endif
.if \uv_layout
.ifnb \uv_coeff
        dup             v1.8b,   \uv_coeff
        sxtl            v1.8h,   v1.8b
        smlal           v4.4s,   v0.4h,   v1.4h
        smlal2          v5.4s,   v0.8h,   v1.8h
.else
        smlal           v4.4s,   v0.4h,   v30.4h
        smlal2          v5.4s,   v0.8h,   v30.8h
.endif
.endif
.if \uv_layout && \elems == 8
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 7
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 1
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
.if \elems > 4
.ifc \edge, left
        increment_seed  4
        read_rand       x12, 11,  3
        read_rand       x13, 11,  2
        read_rand       x14, 11,  1
        add             x12, x3,  x12, lsl #1
        add             x13, x3,  x13, lsl #1
        add             x14, x3,  x14, lsl #1
        ld1             {v0.h}[5], [x12]
        ld1             {v0.h}[6], [x13]
        ld1             {v0.h}[7], [x14]
        lsl             x2,  x2,  #1 // shift back the state as if we'd done increment_seed with shift=0
        srshl           v0.8h,   v0.8h,   v31.8h
        ext             v4.16b,  v4.16b,  v4.16b,  #12
.ifc \lag, lag3
        smov            w17, v0.h[5]
.endif
.ifnc \lag, lag1
        smov            w16, v0.h[6]
.endif
        smov            w14, v0.h[7]

        mov             v1.16b,  v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        mov             v1.16b,  v4.16b
        mov             w15, #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        mov             v1.16b,  v5.16b
.ifc \edge, right
        mov             w15, #3
        bl              output_\lag\()_neon
        read_shift_rand x15, 11
        add             x15, x3,  x15, lsl #1
        ld1             {v1.h}[0], [x15]
        srshl           v1.4h,   v1.4h,   v31.4h
        ext             v0.16b,  v0.16b,  v1.16b,  #2
.else
        mov             w15, #4
        bl              output_\lag\()_neon
.endif
.else
        // elems == 1
        increment_seed  4, shift=0
        mov             v1.16b,  v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
        lsr             w2,  w2,  #3

        read_rand       x12, 11,  2
        read_rand       x13, 11,  1
        read_rand       x14, 11,  0
        add             x12, x3,  x12, lsl #1
        add             x13, x3,  x13, lsl #1
        add             x14, x3,  x14, lsl #1
        ld1             {v1.h}[0], [x12]
        ld1             {v1.h}[1], [x13]
        ld1             {v1.h}[2], [x14]
        srshl           v1.4h,   v1.4h,   v31.4h
        ext             v0.16b,  v0.16b,  v1.16b,  #14
.endif
        st1             {v0.8h}, [x0], #16
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endif
.endm

.macro sum_lag1_func type, uv_layout, edge, elems=8
function sum_\type\()_lag1_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0,  #1*GRAIN_WIDTH*2
        ld1             {v17.8h}, [x12] // load the previous block right above
.endif
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
endfunc
.endm

sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 7
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 7
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 1
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 1


// Sum the lag-2 contributions from the two rows above into v4/v5;
// coefficients come from v30 (as bytes), rows from v16-v21.
function sum_lag2_above_neon
        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right
        ld1             {v21.8h}, [x13]

        dup             v26.8b,  v30.b[0]
        ext             v22.16b, v16.16b, v17.16b, #12 // top left, top mid
        dup             v27.8b,  v30.b[1]
        ext             v23.16b, v16.16b, v17.16b, #14
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v30.b[3]
        ext             v0.16b,  v17.16b, v18.16b, #2  // top mid, top right
        sxtl            v27.8h,  v27.8b
        dup             v29.8b,  v30.b[4]
        ext             v1.16b,  v17.16b, v18.16b, #4
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b

        smull           v4.4s,   v22.4h,  v26.4h
        smlal           v4.4s,   v23.4h,  v27.4h
        smlal           v4.4s,   v0.4h,   v28.4h
        smlal           v4.4s,   v1.4h,   v29.4h
        smull2          v5.4s,   v22.8h,  v26.8h
        smlal2          v5.4s,   v23.8h,  v27.8h
        smlal2          v5.4s,   v0.8h,   v28.8h
        smlal2          v5.4s,   v1.8h,   v29.8h

        dup             v26.16b, v30.b[5]
        ext             v22.16b, v19.16b, v20.16b, #12 // top left, top mid
        dup             v27.16b, v30.b[6]
        ext             v23.16b, v19.16b, v20.16b, #14
        sxtl            v26.8h,  v26.8b
        dup             v28.16b, v30.b[8]
        ext             v0.16b,  v20.16b, v21.16b, #2  // top mid, top right
        sxtl            v27.8h,  v27.8b
        dup             v29.16b, v30.b[9]
        ext             v1.16b,  v20.16b, v21.16b, #4
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b

        smlal           v4.4s,   v22.4h,  v26.4h
        smlal           v4.4s,   v23.4h,  v27.4h
        smlal           v4.4s,   v0.4h,   v28.4h
        smlal           v4.4s,   v1.4h,   v29.4h
        smlal2          v5.4s,   v22.8h,  v26.8h
        smlal2          v5.4s,   v23.8h,  v27.8h
        smlal2          v5.4s,   v0.8h,   v28.8h
        smlal2          v5.4s,   v1.8h,   v29.8h

        dup             v26.16b, v30.b[2]
        dup             v27.16b, v30.b[7]
        sxtl            v26.8h,  v26.8b
        sxtl            v27.8h,  v27.8b

        smlal           v4.4s,   v17.4h,  v26.4h
        smlal           v4.4s,   v20.4h,  v27.4h
        smlal2          v5.4s,   v17.8h,  v26.8h
        smlal2          v5.4s,   v20.8h,  v27.8h

        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b

        ret
endfunc

.macro sum_lag2_func type, uv_layout, edge, elems=8
function sum_\type\()_lag2_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0,  #2*GRAIN_WIDTH*2
        sub             x13, x0,  #1*GRAIN_WIDTH*2
        ld1             {v17.8h}, [x12] // load the previous block right above
        ld1             {v20.8h}, [x13]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 7
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 7
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 1
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 1


// Sum the lag-3 contributions from the three rows above into v4/v5;
// coefficients come from v29/v30 (as bytes), rows from v13-v21.
function sum_lag3_above_neon
        sub             x11, x0,  #3*GRAIN_WIDTH*2 - 16
        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v15.8h}, [x11] // load top right
        ld1             {v18.8h}, [x12]
        ld1             {v21.8h}, [x13]

        dup             v22.8b,  v29.b[0]
        ext             v8.16b,  v13.16b, v14.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[1]
        ext             v9.16b,  v13.16b, v14.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v29.b[2]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v29.b[3]
        ext             v10.16b, v13.16b, v14.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v29.b[4]
        ext             v11.16b, v14.16b, v15.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v29.b[5]
        ext             v12.16b, v14.16b, v15.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v29.b[6]
        ext             v13.16b, v14.16b, v15.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smull           v4.4s,   v8.4h,   v22.4h
        smlal           v4.4s,   v9.4h,   v23.4h
        smlal           v4.4s,   v10.4h,  v24.4h
        smlal           v4.4s,   v11.4h,  v26.4h
        smlal           v4.4s,   v12.4h,  v27.4h
        smlal           v4.4s,   v13.4h,  v28.4h
        smlal           v4.4s,   v14.4h,  v25.4h
        smull2          v5.4s,   v8.8h,   v22.8h
        smlal2          v5.4s,   v9.8h,   v23.8h
        smlal2          v5.4s,   v10.8h,  v24.8h
        smlal2          v5.4s,   v11.8h,  v26.8h
        smlal2          v5.4s,   v12.8h,  v27.8h
        smlal2          v5.4s,   v13.8h,  v28.8h
        smlal2          v5.4s,   v14.8h,  v25.8h

        dup             v22.8b,  v29.b[7]
        ext             v8.16b,  v16.16b, v17.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[8]
        ext             v9.16b,  v16.16b, v17.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v29.b[9]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v29.b[10]
        ext             v10.16b, v16.16b, v17.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v29.b[11]
        ext             v11.16b, v17.16b, v18.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v29.b[12]
        ext             v12.16b, v17.16b, v18.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v29.b[13]
        ext             v13.16b, v17.16b, v18.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smlal           v4.4s,   v8.4h,   v22.4h
        smlal           v4.4s,   v9.4h,   v23.4h
        smlal           v4.4s,   v10.4h,  v24.4h
        smlal           v4.4s,   v11.4h,  v26.4h
        smlal           v4.4s,   v12.4h,  v27.4h
        smlal           v4.4s,   v13.4h,  v28.4h
        smlal           v4.4s,   v17.4h,  v25.4h
        smlal2          v5.4s,   v8.8h,   v22.8h
        smlal2          v5.4s,   v9.8h,   v23.8h
        smlal2          v5.4s,   v10.8h,  v24.8h
        smlal2          v5.4s,   v11.8h,  v26.8h
        smlal2          v5.4s,   v12.8h,  v27.8h
        smlal2          v5.4s,   v13.8h,  v28.8h
        smlal2          v5.4s,   v17.8h,  v25.8h

        dup             v22.8b,  v29.b[14]
        ext             v8.16b,  v19.16b, v20.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[15]
        ext             v9.16b,  v19.16b, v20.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v30.b[0]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v30.b[1]
        ext             v10.16b, v19.16b, v20.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v30.b[2]
        ext             v11.16b, v20.16b, v21.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v30.b[3]
        ext             v12.16b, v20.16b, v21.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v30.b[4]
        ext             v13.16b, v20.16b, v21.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smlal           v4.4s,   v8.4h,   v22.4h
        smlal           v4.4s,   v9.4h,   v23.4h
        smlal           v4.4s,   v10.4h,  v24.4h
        smlal           v4.4s,   v11.4h,  v26.4h
        smlal           v4.4s,   v12.4h,  v27.4h
        smlal           v4.4s,   v13.4h,  v28.4h
        smlal           v4.4s,   v20.4h,  v25.4h
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        smlal2          v5.4s,   v8.8h,   v22.8h
        smlal2          v5.4s,   v9.8h,   v23.8h
        smlal2          v5.4s,   v10.8h,  v24.8h
        smlal2          v5.4s,   v11.8h,  v26.8h
        smlal2          v5.4s,   v12.8h,  v27.8h
        smlal2          v5.4s,   v13.8h,  v28.8h
        smlal2          v5.4s,   v20.8h,  v25.8h

        mov             v13.16b, v14.16b
        mov             v14.16b, v15.16b
        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        ret
endfunc

.macro sum_lag3_func type, uv_layout, edge, elems=8
function sum_\type\()_lag3_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x11, x0,  #3*GRAIN_WIDTH*2
        sub             x12, x0,  #2*GRAIN_WIDTH*2
        sub             x13, x0,  #1*GRAIN_WIDTH*2
        ld1             {v14.8h}, [x11] // load the previous block right above
        ld1             {v17.8h}, [x12]
        ld1             {v20.8h}, [x13]
.endif
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
endfunc
.endm

sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 7
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 7
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 1
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 1

// Generate w1 rows of 82 grain entries (80 gaussians + 2 extra values).
function generate_grain_rows_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        mov             w16, #80
2:
        bl              get_gaussian_neon
        srshl           v0.8h,   v0.8h,   v31.8h
        subs            w16, w16, #8
        st1             {v0.8h}, [x0], #16
        b.gt            2b
        get_grain_2     v0
        subs            w1,  w1,  #1
        st1             {v0.s}[0], [x0], #4
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Generate w1 rows of 44 grain entries (40 gaussians + 4 extra values),
// stepping x0 by a full GRAIN_WIDTH stride per row.
function generate_grain_rows_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        mov             w16, #40
2:
        bl              get_gaussian_neon
        srshl           v0.8h,   v0.8h,   v31.8h
        subs            w16, w16, #8
        st1             {v0.8h}, [x0], #16
        b.gt            2b
        get_grain_4     v0
        subs            w1,  w1,  #1
        st1             {v0.4h}, [x0]
        add             x0,  x0,  #GRAIN_WIDTH*2-80
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Produce 8 lag-0 chroma entries: luma (v4, masked by v1) * coeff (v27),
// scaled by v28, added to fresh gaussian noise and clamped to v25/v26.
function gen_grain_uv_444_lag0_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v4.8h}, [x19], #16
gen_grain_uv_lag0_8_start:
        bl              get_gaussian_neon
        srshl           v0.8h,   v0.8h,   v31.8h
gen_grain_uv_lag0_8_add:
        and             v4.16b,  v4.16b,  v1.16b
        smull           v2.4s,   v4.4h,   v27.4h
        smull2          v3.4s,   v4.8h,   v27.8h
        srshl           v2.4s,   v2.4s,   v28.4s
        srshl           v3.4s,   v3.4s,   v28.4s
        sqxtn           v2.4h,   v2.4s
        sqxtn2          v2.8h,   v3.4s
        sqadd           v2.8h,   v2.8h,   v0.8h
        smin            v2.8h,   v2.8h,   v25.8h
        smax            v2.8h,   v2.8h,   v26.8h
        st1             {v2.8h}, [x0], #16
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// 420: average a 2x2 luma block before entering the shared lag-0 path.
function gen_grain_uv_420_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        add             x12, x19, #GRAIN_WIDTH*2
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        ld1             {v18.8h, v19.8h}, [x12]
        addp            v16.8h,  v16.8h,  v17.8h
        addp            v17.8h,  v18.8h,  v19.8h
        add             v16.8h,  v16.8h,  v17.8h
        srshr           v4.8h,   v16.8h,  #2
        b               gen_grain_uv_lag0_8_start
endfunc

// 422: average horizontal luma pairs before the shared lag-0 path.
function gen_grain_uv_422_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        addp            v16.8h,  v16.8h,  v17.8h
        srshr           v4.8h,   v16.8h,  #1
        b               gen_grain_uv_lag0_8_start
endfunc

// 4-entry tail variants of the above for the 44-wide layouts.
function gen_grain_uv_420_lag0_4_neon
        add             x12, x19, #GRAIN_WIDTH*2
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        ld1             {v18.4h, v19.4h}, [x12]
        add             x19, x19, #32
        addp            v16.4h,  v16.4h,  v17.4h
        addp            v17.4h,  v18.4h,  v19.4h
        add             v16.4h,  v16.4h,  v17.4h
        srshr           v4.4h,   v16.4h,  #2
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

function gen_grain_uv_422_lag0_4_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        add             x19, x19, #32
        addp            v16.4h,  v16.4h,  v17.4h
        srshr           v4.4h,   v16.4h,  #1
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

// Full-width (82 entry) grain generation, for y and uv_444 planes.
.macro gen_grain_82 type
function generate_grain_\type\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

.ifc \type, uv_444
        mov             w13, w3
        mov             w14, #28
        add             x19, x1,  #3*GRAIN_WIDTH*2
        mov             x1,  x2
        mul             w13, w13, w14
        clz             w15, w4
.else
        clz             w15, w2
.endif
        movrel          x3,  X(gaussian_sequence)
        sub             w15, w15, #24 // -bitdepth_min_8
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             x4,  x1,  #FGD_AR_COEFFS_Y
.else
        add             x4,  x1,  #FGD_AR_COEFFS_UV
.endif
        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
        adr             x16, L(gen_grain_\type\()_tbl)
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrh            w17, [x16, w17, uxtw #1]
        dup             v31.8h,  w9   // 4 - bitdepth_min_8 + data->grain_scale_shift
        sub             x16, x16, w17, uxtw
        neg             v31.8h,  v31.8h

.ifc \type, uv_444
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne
.endif

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        neg             w15, w15      // bitdepth_min_8
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7  // 1 << ar_coeff_shift
        lsl             w10, w10, w9  // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1  // 1 << (4 + data->grain_scale_shift - 1)

        mov             w5,  #128
        lsl             w5,  w5,  w15 // 128 << bitdepth_min_8
        neg             w6,  w5       // -(128 << bitpdeth_min_8)
        sub             w5,  w5,  #1  // (128 << bitdepth_min_8) - 1

.ifc \type, uv_444
        eor             w2,  w2,  w11
.endif

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
.ifc \type, y
        mov             w1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        dup             v28.4s,  w7
        ld1r            {v27.8b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        dup             v25.8h,  w5
        dup             v26.8h,  w6
        ext             v29.16b, v0.16b,  v1.16b, #10
        ext             v30.16b, v1.16b,  v0.16b, #2
        neg             v28.4s,  v28.4s
        sxtl            v27.8h,  v27.8b

        mov             w1,  #3
        bl              generate_grain_rows_neon
        mov             w1,  #GRAIN_HEIGHT-3
1:
        mov             v1.16b,  v29.16b
        bl              gen_grain_uv_444_lag0_neon // 8
        movi            v1.16b,  #255
        bl              gen_grain_uv_444_lag0_neon // 16
        bl              gen_grain_uv_444_lag0_neon // 24
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 40
        bl              gen_grain_uv_444_lag0_neon // 48
        bl              gen_grain_uv_444_lag0_neon // 56
        bl              gen_grain_uv_444_lag0_neon // 64
        bl              gen_grain_uv_444_lag0_neon // 72
        mov             v1.16b,  v30.16b
        bl              gen_grain_uv_444_lag0_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
        add             x19, x19, #4
        st1             {v16.s}[0], [x0], #4
        b.gt            1b
.endif
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.8b}, [x4], #1 // ar_coeffs_y[0]
        ld1r            {v28.8b}, [x4], #1 // ar_coeffs_y[1]
        ld1r            {v29.8b}, [x4]     // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           w4,  [x4, #1]      // ar_coeffs_y[3]
.else
        add             x4,  x4,  #2
.endif

        mov             w1,  #3
.ifc \type, uv_444
        ld1r            {v30.8b}, [x4]     // ar_coeffs_uv[4]
        ldursb          w4,  [x4, #-1]     // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b
.ifc \type, uv_444
        sxtl            v30.8h,  v30.8b
.endif

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_mid_neon   // 48
        bl              sum_\type\()_lag1_mid_neon   // 56
        bl              sum_\type\()_lag1_mid_neon   // 64
        bl              sum_\type\()_lag1_mid_neon   // 72
        bl              sum_\type\()_lag1_right_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #4
.endif
        st1             {v16.s}[0], [x0], #4
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_mid_neon   // 48
        bl              sum_\type\()_lag2_mid_neon   // 56
        bl              sum_\type\()_lag2_mid_neon   // 64
        bl              sum_\type\()_lag2_mid_neon   // 72
        bl              sum_\type\()_lag2_right_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #4
.endif
        st1             {v16.s}[0], [x0], #4
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_mid_neon   // 48
        bl              sum_\type\()_lag3_mid_neon   // 56
        bl              sum_\type\()_lag3_mid_neon   // 64
        bl              sum_\type\()_lag3_mid_neon   // 72
        bl              sum_\type\()_lag3_right_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #4
.endif
        st1             {v16.s}[0], [x0], #4
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_82 y
gen_grain_82 uv_444

.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst, #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst, #GRAIN_HEIGHT-3
.endif
.endm

.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
.else
        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
.endif
.endm

// Subsampled-width (44 entry) grain generation, for uv_420 and uv_422.
.macro gen_grain_44 type
function generate_grain_\type\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

        mov             w13, w3
        mov             w14, #28
        add             x19, x1,  #(3*GRAIN_WIDTH-3)*2
        mov             x1,  x2
        mul             w13, w13, w14
        clz             w15, w4

        movrel          x3,  X(gaussian_sequence)
        sub             w15, w15, #24 // -bitdepth_min_8
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
        add             x4,  x1,  #FGD_AR_COEFFS_UV
        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
        adr             x16, L(gen_grain_\type\()_tbl)
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrh            w17, [x16, w17, uxtw #1]
        dup             v31.8h,  w9   // 4 - bitdepth_min_8 + data->grain_scale_shift
        sub             x16, x16, w17, uxtw
        neg             v31.8h,  v31.8h

        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        neg             w15, w15      // bitdepth_min_8
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7  // 1 << ar_coeff_shift
        lsl             w10, w10, w9  // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1  // 1 << (4 + data->grain_scale_shift - 1)

        mov             w5,  #128
        lsl             w5,  w5,  w15 // 128 << bitdepth_min_8
        neg             w6,  w5       // -(128 << bitpdeth_min_8)
        sub             w5,  w5,  #1  // (128 << bitdepth_min_8) - 1

        eor             w2,  w2,  w11

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
        dup             v28.4s,  w7
        ld1r            {v27.8b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        dup             v25.8h,  w5
        dup             v26.8h,  w6
        ext             v29.16b, v0.16b,  v1.16b, #10
        ext             v30.16b, v1.16b,  v0.16b, #14
        neg             v28.4s,  v28.4s
        sxtl            v27.8h,  v27.8b

        mov             w1,  #3
        bl              generate_grain_rows_44_neon
        set_height      w1,  \type
1:
        mov             v1.16b,  v29.16b
        bl              gen_grain_\type\()_lag0_8_neon // 8
        movi            v1.16b,  #255
        bl              gen_grain_\type\()_lag0_8_neon // 16
        bl              gen_grain_\type\()_lag0_8_neon // 24
        bl              gen_grain_\type\()_lag0_8_neon // 32
        bl              gen_grain_\type\()_lag0_8_neon // 40
        mov             v1.16b,  v30.16b
        bl              gen_grain_\type\()_lag0_4_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
        ld1r            {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
        ld1r            {v29.8b}, [x4]     // ar_coeffs_uv[2]
        add             x4,  x4,  #2

        mov             w1,  #3
        ld1r            {v30.8b}, [x4]     // ar_coeffs_u4[4]
        ldursb          w4,  [x4, #-1]     // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b
        sxtl            v30.8h,  v30.8b
        set_height      w1,  \type
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_right_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon
        set_height      w1,  \type
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_right_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, [x4]      // ar_coeffs_uv[0-15]
        ldr             q30, [x4, #16] // ar_coeffs_uv[16-24]
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon
        set_height      w1,  \type
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_right_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(gen_grain_\type\()_tbl):
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

// Gather 8 scaling bytes (table base x3) indexed by two source vectors,
// interleaving the loads to hide latency.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0]
        umov            w15, \src2[1]
        umov            w16, \src1[2]
        add             x14, x14, x3
        umov            w17, \src2[3]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4]
        add             x16, x16, x3
        ld1             {\dst2}[1+\off], [x15]
        umov            w15, \src2[5]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6]
        add             x14, x14, x3
        ld1             {\dst2}[3+\off], [x17]
        umov            w17, \src2[7]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[5+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[7+\off], [x17]
.endm

.macro gather dst1, dst2, src1, src2, src3, src4
        gather_interleaved \dst1, \dst2, \src1, \src3, 0
        gather_interleaved \dst2, \dst1, \src3, \src1, 0
        gather_interleaved \dst1, \dst2, \src2, \src4, 8
        gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm

// Look up 32 scaling entries for the pixels in v0-v3; results in v6/v7.
function gather32_neon
        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
        ret
endfunc

// Look up 16 scaling entries for the pixels in v0/v1; result in v6.
function gather16_neon
        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
        ins             v6.d[1], v7.d[0]
        ret
endfunc

const overlap_coeffs_0, align=4
        .short          27, 17, 0,  0
        .short          17, 27, 32, 32
endconst

const overlap_coeffs_1, align=4
        .short          23, 0,  0,  0
        .short          22, 32, 32, 32
endconst

// Split a random value into x/y grain offsets; \sx/\sy select subsampling.
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF  // randval & 0xF
        lsr             \offx, \src,  #4    // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx // 2 * (randval >> 4)
.endif
.endm

.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
.endm

// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-80]!
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        str             d14,      [sp, #64]
        eor             w4,  w4,  #15      // 15 - scaling_shift
        ldr             w11, [x6, #8]      // offsets[1][0]
        ldr             w13, [x6, #4]      // offsets[0][1]
        ldr             w15, [x6, #12]     // offsets[1][1]
        ldr             w10, [sp, #96]     // bitdepth_max
        ldr             w6,  [x6]          // offsets[0][0]
        dup             v26.8h,  w10       // bitdepth_max
        clz             w10, w10
        ldr             w8,  [sp, #80]     // clip
        sub             w10, w10, #24      // -bitdepth_min_8
        mov             x9,  #GRAIN_WIDTH*2 // grain_lut stride
        neg             w10, w10           // bitdepth_min_8

        dup             v29.8h,  w4        // 15 - scaling_shift
        dup             v27.8h,  w10       // bitdepth_min_8

        movrel          x16, overlap_coeffs_0

        cbz             w8,  1f
        // clip
        movi            v30.8h,  #16
        movi            v31.8h,  #235
        sshl            v30.8h,  v30.8h,  v27.8h
        sshl            v31.8h,  v31.8h,  v27.8h
        b               2f
1:
        // no clip
        movi            v30.8h,  #0
        mov             v31.16b, v26.16b   // bitdepth_max
2:
        ushr            v26.8h,  v26.8h,  #1 // grain_max
        not             v25.16b, v26.16b     // grain_min

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs

        add             x5,  x5,  #18         // grain_lut += 9
        add             x5,  x5,  x9,  lsl #3 // grain_lut += 8 * grain_stride
        add             x5,  x5,  x9          // grain_lut += grain_stride

        calc_offset     w11, w12, w11, 0,  0
        calc_offset     w13, w14, w13, 0,  0
        calc_offset     w15, w16, w15, 0,  0
        calc_offset     w6,  w10, w6,  0,  0

        add_offset      x12, w11, x12, x5, x9
        add_offset      x14, w13, x14, x5, x9
        add_offset      x16, w15, x16, x5, x9
        add_offset      x5,  w6,  x10, x5, x9

        ldr             w11, [sp, #88]        // type
        adr             x13, L(fgy_loop_tbl)

        add             x4,  x12, #32*2       // grain_lut += BLOCK_SIZE * bx
        add             x6,  x14, x9,  lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by

        tst             w11, #1
        ldrh            w11, [x13, w11, uxtw #1]

        add             x8,  x16, x9,  lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
        add             x8,  x8,  #32*2       // grain_lut += BLOCK_SIZE * bx

        sub             x11, x13, w11, uxtw

        b.eq            1f
        // y overlap
        dup             v8.8h,   v27.h[0]
        dup             v9.8h,   v27.h[1]
        mov             w10, w7            // backup actual h
        mov             w7,  #2
1:
        br              x11
endfunc

function fgy_loop_neon
.macro fgy ox, oy
L(loop_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1], x2 // src
.if \ox
        ld1             {v20.4h}, [x4], x9 // grain_lut old
.endif
.if \oy
        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v14.4h}, [x8], x9 // grain_lut top old
.endif
        mvni            v4.8h,   #0xf0, lsl #8 // 0x0fff
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b,  v0.16b,  v4.16b
        and             v1.16b,  v1.16b,  v4.16b
        and             v2.16b,  v2.16b,  v4.16b
        and             v3.16b,  v3.16b,  v4.16b

        bl              gather32_neon

.if \ox
        smull           v20.4s,  v20.4h,  v27.4h
        smlal           v20.4s,  v16.4h,  v28.4h
.endif

.if \oy
.if \ox
        smull           v14.4s,  v14.4h,  v27.4h
        smlal           v14.4s,  v21.4h,  v28.4h
        sqrshrn         v20.4h,  v20.4s,  #5
        sqrshrn         v14.4h,  v14.4s,  #5
        smin            v20.4h,  v20.4h,  v26.4h
        smin            v14.4h,  v14.4h,  v26.4h
        smax            v20.4h,  v20.4h,  v25.4h
        smax            v14.4h,  v14.4h,  v25.4h
.endif

.if \ox
        smull           v10.4s,  v20.4h,  v9.4h
.else
        smull           v10.4s,  v16.4h,  v9.4h
.endif
        smull2          v11.4s,  v16.8h,  v9.8h
        smull           v12.4s,  v17.4h,  v9.4h
        smull2          v13.4s,  v17.8h,  v9.8h
        smull           v16.4s,  v18.4h,  v9.4h
        smull2          v17.4s,  v18.8h,  v9.8h
        smull           v18.4s,  v19.4h,  v9.4h
        smull2          v19.4s,  v19.8h,  v9.8h
.if \ox
        smlal           v10.4s,  v14.4h,  v8.4h
.else
        smlal           v10.4s,  v21.4h,  v8.4h
.endif
        smlal2          v11.4s,  v21.8h,  v8.8h
        smlal           v12.4s,  v22.4h,  v8.4h
        smlal2          v13.4s,  v22.8h,  v8.8h
        smlal           v16.4s,  v23.4h,  v8.4h
        smlal2          v17.4s,  v23.8h,  v8.8h
        smlal           v18.4s,  v24.4h,  v8.4h
        smlal2          v19.4s,  v24.8h,  v8.8h
        sqrshrn         v10.4h,  v10.4s,  #5
        sqrshrn2        v10.8h,  v11.4s,  #5
        sqrshrn         v11.4h,  v12.4s,  #5
        sqrshrn2        v11.8h,  v13.4s,  #5
        sqrshrn         v12.4h,  v16.4s,  #5
        sqrshrn2        v12.8h,  v17.4s,  #5
        sqrshrn         v13.4h,  v18.4s,  #5
        sqrshrn2        v13.8h,  v19.4s,  #5
        smin            v16.8h,  v10.8h,  v26.8h
        smin            v17.8h,  v11.8h,  v26.8h
        smin            v18.8h,  v12.8h,  v26.8h
        smin            v19.8h,  v13.8h,  v26.8h
        smax            v16.8h,  v16.8h,  v25.8h
        smax            v17.8h,  v17.8h,  v25.8h
        smax            v18.8h,  v18.8h,  v25.8h
        smax            v19.8h,  v19.8h,  v25.8h
.endif

        uxtl            v4.8h,   v6.8b  // scaling
.if \ox && !\oy
        sqrshrn         v20.4h,  v20.4s,  #5
.endif
        uxtl2           v5.8h,   v6.16b
.if \ox && !\oy
        smin            v20.4h,  v20.4h,  v26.4h
.endif
        uxtl            v6.8h,   v7.8b
.if \ox && !\oy
        smax            v20.4h,  v20.4h,  v25.4h
.endif
        uxtl2           v7.8h,   v7.16b
.if \ox && !\oy
        ins             v16.d[0], v20.d[0]
.endif
        ushl            v4.8h,   v4.8h,   v29.8h // scaling << (15 - scaling_shift)
        ushl            v5.8h,   v5.8h,   v29.8h
        ushl            v6.8h,   v6.8h,   v29.8h
        ushl            v7.8h,   v7.8h,   v29.8h

        sqrdmulh        v20.8h,  v16.8h,  v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
        sqrdmulh        v21.8h,  v17.8h,  v5.8h
        sqrdmulh        v22.8h,  v18.8h,  v6.8h
        sqrdmulh        v23.8h,  v19.8h,  v7.8h

        usqadd          v0.8h,   v20.8h // *src + noise
        usqadd          v1.8h,   v21.8h
        usqadd          v2.8h,   v22.8h
        usqadd          v3.8h,   v23.8h

        umax            v0.8h,   v0.8h,   v30.8h
        umax            v1.8h,   v1.8h,   v30.8h
        umax            v2.8h,   v2.8h,   v30.8h
        umax            v3.8h,   v3.8h,   v30.8h
        umin            v0.8h,   v0.8h,   v31.8h
        umin            v1.8h,   v1.8h,   v31.8h
        umin            v2.8h,   v2.8h,   v31.8h
        umin            v3.8h,   v3.8h,   v31.8h

        subs            w7,  w7,  #1
.if \oy
        dup             v8.8h,   v28.h[0]
        dup             v9.8h,   v28.h[1]
.endif
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        cmp             w10, #2
        sub             w7,  w10, #2 // restore actual remaining h
        b.gt            L(loop_\ox\()0)
.endif
        ldr             d14,      [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endm

fgy 0, 0
fgy 0, 1
fgy 1, 0
fgy 1, 1

L(fgy_loop_tbl):
        .hword L(fgy_loop_tbl) - L(loop_00)
        .hword L(fgy_loop_tbl) - L(loop_01)
        .hword L(fgy_loop_tbl) - L(loop_10)
        .hword L(fgy_loop_tbl) - L(loop_11)
endfunc

// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
//                                      const pixel *const src,
//                                      const ptrdiff_t stride,
//                                      const uint8_t
scaling[SCALING_SIZE], // const Dav1dFilmGrainData *const data, // const entry grain_lut[][GRAIN_WIDTH], // const pixel *const luma_row, // const ptrdiff_t luma_stride, // const int offsets[][2], // const ptrdiff_t h, const ptrdiff_t uv, // const ptrdiff_t is_id, // const ptrdiff_t type, // const int bitdepth_max); .macro fguv layout, sx, sy function fguv_32x32_\layout\()_16bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-80]! stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] ldp x8, x9, [sp, #80] // offsets, h ldp x10, x11, [sp, #96] // uv, is_id ldr w16, [sp, #120] // bitdepth_max ldr w13, [x4, #FGD_SCALING_SHIFT] ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] dup v23.8h, w16 // bitdepth_max clz w16, w16 eor w13, w13, #15 // 15 - scaling_shift sub w16, w16, #24 // -bitdepth_min_8 // !csfl add x10, x4, x10, lsl #2 // + 4*uv add x14, x10, #FGD_UV_LUMA_MULT add x15, x10, #FGD_UV_MULT add x10, x10, #FGD_UV_OFFSET neg w16, w16 // bitdepth_min_8 ld1r {v8.8h}, [x14] // uv_luma_mult ld1r {v24.8h}, [x10] // uv_offset ld1r {v9.8h}, [x15] // uv_mult dup v29.8h, w13 // 15 - scaling_shift dup v27.8h, w16 // bitdepth_min_8 cbz w12, 1f // clip movi v30.8h, #16 movi v31.8h, #240 sshl v30.8h, v30.8h, v27.8h sshl v31.8h, v31.8h, v27.8h cbz w11, 2f // is_id movi v31.8h, #235 sshl v31.8h, v31.8h, v27.8h b 2f 1: // no clip movi v30.8h, #0 mov v31.16b, v23.16b // bitdepth_max 2: ushr v15.8h, v23.8h, #1 // grain_max sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 not v14.16b, v15.16b // grain_min ldr w12, [x8, #8] // offsets[1][0] ldr w14, [x8, #4] // offsets[0][1] ldr w16, [x8, #12] // offsets[1][1] ldr w8, [x8] // offsets[0][0] mov x10, #GRAIN_WIDTH*2 // grain_lut stride add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 .if \sy add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride .else add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride add x5, 
x5, x10 // grain_lut += grain_stride .endif calc_offset w12, w13, w12, \sx, \sy calc_offset w14, w15, w14, \sx, \sy calc_offset w16, w17, w16, \sx, \sy calc_offset w8, w11, w8, \sx, \sy add_offset x13, w12, x13, x5, x10 add_offset x15, w14, x15, x5, x10 add_offset x17, w16, x17, x5, x10 add_offset x5, w8, x11, x5, x10 add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx ldr w13, [sp, #112] // type movrel x16, overlap_coeffs_\sx adr x14, L(fguv_loop_sx\sx\()_tbl) ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs tst w13, #1 ldrh w13, [x14, w13, uxtw #1] b.eq 1f // y overlap sub w12, w9, #(2 >> \sy) // backup remaining h mov w9, #(2 >> \sy) 1: sub x13, x14, w13, uxtw .if \sy movi v25.8h, #23 movi v26.8h, #22 .else movi v25.8h, #27 movi v26.8h, #17 .endif .if \sy add x7, x7, x7 // luma_stride *= 2 .endif br x13 endfunc .endm fguv 420, 1, 1 fguv 422, 1, 0 fguv 444, 0, 0 function fguv_loop_sx0_neon .macro fguv_loop_sx0 csfl, ox, oy L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: .if \ox ld1 {v4.4h}, [x4], x10 // grain_lut old .endif .if \oy ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top .endif .if \ox && \oy ld1 {v5.4h}, [x11], x10 // grain_lut top old .endif ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut .if \ox smull v4.4s, v4.4h, v27.4h smlal v4.4s, v16.4h, v28.4h .endif .if \oy .if \ox smull v5.4s, v5.4h, v27.4h smlal v5.4s, v0.4h, v28.4h sqrshrn v4.4h, v4.4s, #5 sqrshrn v5.4h, v5.4s, #5 smin v4.4h, v4.4h, v15.4h smin v5.4h, v5.4h, v15.4h smax v4.4h, v4.4h, v14.4h smax v5.4h, v5.4h, v14.4h ins v16.d[0], v4.d[0] ins v0.d[0], v5.d[0] .endif smull v6.4s, v16.4h, v26.4h smull2 v7.4s, v16.8h, v26.8h smull v10.4s, v17.4h, v26.4h smull2 v11.4s, v17.8h, v26.8h smull v16.4s, v18.4h, v26.4h smull2 v17.4s, 
v18.8h, v26.8h smull v18.4s, v19.4h, v26.4h smull2 v19.4s, v19.8h, v26.8h smlal v6.4s, v0.4h, v25.4h smlal2 v7.4s, v0.8h, v25.8h smlal v10.4s, v1.4h, v25.4h smlal2 v11.4s, v1.8h, v25.8h smlal v16.4s, v2.4h, v25.4h smlal2 v17.4s, v2.8h, v25.8h smlal v18.4s, v3.4h, v25.4h smlal2 v19.4s, v3.8h, v25.8h sqrshrn v6.4h, v6.4s, #5 sqrshrn2 v6.8h, v7.4s, #5 sqrshrn v7.4h, v10.4s, #5 sqrshrn2 v7.8h, v11.4s, #5 sqrshrn v10.4h, v16.4s, #5 sqrshrn2 v10.8h, v17.4s, #5 sqrshrn v11.4h, v18.4s, #5 sqrshrn2 v11.8h, v19.4s, #5 .endif .if \ox && !\oy sqrshrn v4.4h, v4.4s, #5 smin v4.4h, v4.4h, v15.4h .endif ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma .if \oy smin v16.8h, v6.8h, v15.8h smin v17.8h, v7.8h, v15.8h smin v18.8h, v10.8h, v15.8h smin v19.8h, v11.8h, v15.8h smax v16.8h, v16.8h, v14.8h smax v17.8h, v17.8h, v14.8h smax v18.8h, v18.8h, v14.8h smax v19.8h, v19.8h, v14.8h .endif .if \ox && !\oy smax v4.4h, v4.4h, v14.4h .endif ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src .if \ox && !\oy ins v16.d[0], v4.d[0] .endif .if !\csfl smull v4.4s, v0.4h, v8.4h smull2 v5.4s, v0.8h, v8.8h smull v6.4s, v1.4h, v8.4h smull2 v7.4s, v1.8h, v8.8h smull v0.4s, v2.4h, v8.4h smull2 v1.4s, v2.8h, v8.8h smull v2.4s, v3.4h, v8.4h smull2 v3.4s, v3.8h, v8.8h smlal v4.4s, v10.4h, v9.4h smlal2 v5.4s, v10.8h, v9.8h smlal v6.4s, v11.4h, v9.4h smlal2 v7.4s, v11.8h, v9.8h smlal v0.4s, v12.4h, v9.4h smlal2 v1.4s, v12.8h, v9.8h smlal v2.4s, v13.4h, v9.4h smlal2 v3.4s, v13.8h, v9.8h shrn v4.4h, v4.4s, #6 shrn2 v4.8h, v5.4s, #6 shrn v5.4h, v6.4s, #6 shrn2 v5.8h, v7.4s, #6 shrn v6.4h, v0.4s, #6 shrn2 v6.8h, v1.4s, #6 shrn v7.4h, v2.4s, #6 shrn2 v7.8h, v3.4s, #6 add v0.8h, v4.8h, v24.8h add v1.8h, v5.8h, v24.8h add v2.8h, v6.8h, v24.8h add v3.8h, v7.8h, v24.8h movi v20.8h, #0 smin v0.8h, v0.8h, v23.8h smin v1.8h, v1.8h, v23.8h smin v2.8h, v2.8h, v23.8h smin v3.8h, v3.8h, v23.8h smax v0.8h, v0.8h, v20.8h smax v1.8h, v1.8h, v20.8h smax v2.8h, v2.8h, v20.8h smax v3.8h, v3.8h, v20.8h .else // Make sure 
that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. and v0.16b, v0.16b, v23.16b and v1.16b, v1.16b, v23.16b and v2.16b, v2.16b, v23.16b and v3.16b, v3.16b, v23.16b .endif bl gather32_neon uxtl v4.8h, v6.8b // scaling uxtl2 v5.8h, v6.16b uxtl v6.8h, v7.8b uxtl2 v7.8h, v7.16b ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) ushl v5.8h, v5.8h, v29.8h ushl v6.8h, v6.8h, v29.8h ushl v7.8h, v7.8h, v29.8h sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) sqrdmulh v17.8h, v17.8h, v5.8h sqrdmulh v18.8h, v18.8h, v6.8h sqrdmulh v19.8h, v19.8h, v7.8h usqadd v10.8h, v16.8h // *src + noise usqadd v11.8h, v17.8h usqadd v12.8h, v18.8h usqadd v13.8h, v19.8h umax v0.8h, v10.8h, v30.8h umax v1.8h, v11.8h, v30.8h umax v2.8h, v12.8h, v30.8h umax v3.8h, v13.8h, v30.8h umin v0.8h, v0.8h, v31.8h umin v1.8h, v1.8h, v31.8h umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h subs w9, w9, #1 .if \oy dup v25.8h, v28.h[0] dup v26.8h, v28.h[1] .endif st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst b.gt 1b .if \oy cmp w12, #0 mov w9, w12 // restore actual remaining h b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx0 0, 0, 0 fguv_loop_sx0 0, 0, 1 fguv_loop_sx0 0, 1, 0 fguv_loop_sx0 0, 1, 1 fguv_loop_sx0 1, 0, 0 fguv_loop_sx0 1, 0, 1 fguv_loop_sx0 1, 1, 0 fguv_loop_sx0 1, 1, 1 9: ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret L(fguv_loop_sx0_tbl): .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) .hword L(fguv_loop_sx0_tbl) - 
L(fguv_loop_sx0_csfl1_11) endfunc function fguv_loop_sx1_neon .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: .if \ox ld1 {v18.4h}, [x4], x10 // grain_lut old .endif .if \oy ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top .endif .if \ox && \oy ld1 {v19.4h}, [x11], x10 // grain_lut top old .endif ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut .if \ox smull v18.4s, v18.4h, v27.4h smlal v18.4s, v16.4h, v28.4h .endif .if \oy .if \ox smull v19.4s, v19.4h, v27.4h smlal v19.4s, v20.4h, v28.4h sqrshrn v18.4h, v18.4s, #5 sqrshrn v19.4h, v19.4s, #5 smin v18.4h, v18.4h, v15.4h smin v19.4h, v19.4h, v15.4h smax v18.4h, v18.4h, v14.4h smax v19.4h, v19.4h, v14.4h ins v16.d[0], v18.d[0] ins v20.d[0], v19.d[0] .endif smull v0.4s, v16.4h, v26.4h smull2 v1.4s, v16.8h, v26.8h smull v2.4s, v17.4h, v26.4h smull2 v3.4s, v17.8h, v26.8h smlal v0.4s, v20.4h, v25.4h smlal2 v1.4s, v20.8h, v25.8h smlal v2.4s, v21.4h, v25.4h smlal2 v3.4s, v21.8h, v25.8h sqrshrn v16.4h, v0.4s, #5 sqrshrn2 v16.8h, v1.4s, #5 sqrshrn v17.4h, v2.4s, #5 sqrshrn2 v17.8h, v3.4s, #5 .endif .if \ox && !\oy sqrshrn v18.4h, v18.4s, #5 smin v18.4h, v18.4h, v15.4h .endif ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma .if \oy smin v16.8h, v16.8h, v15.8h smin v17.8h, v17.8h, v15.8h smax v16.8h, v16.8h, v14.8h smax v17.8h, v17.8h, v14.8h .endif .if \ox && !\oy smax v18.4h, v18.4h, v14.4h .endif ld1 {v10.8h, v11.8h}, [x1], x2 // src .if \ox && !\oy ins v16.d[0], v18.d[0] .endif addp v0.8h, v0.8h, v1.8h addp v1.8h, v2.8h, v3.8h urshr v0.8h, v0.8h, #1 urshr v1.8h, v1.8h, #1 .if !\csfl smull v2.4s, v0.4h, v8.4h smull2 v3.4s, v0.8h, v8.8h smull v0.4s, v1.4h, v8.4h smull2 v1.4s, v1.8h, v8.8h smlal v2.4s, v10.4h, v9.4h smlal2 v3.4s, v10.8h, v9.8h smlal v0.4s, v11.4h, v9.4h smlal2 v1.4s, v11.8h, v9.8h shrn v2.4h, v2.4s, #6 shrn2 v2.8h, v3.4s, #6 shrn v3.4h, v0.4s, #6 shrn2 v3.8h, v1.4s, #6 add v0.8h, v2.8h, v24.8h add v1.8h, v3.8h, v24.8h movi v2.8h, #0 smin v0.8h, v0.8h, v23.8h 
smin v1.8h, v1.8h, v23.8h smax v0.8h, v0.8h, v2.8h smax v1.8h, v1.8h, v2.8h .else // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. and v0.16b, v0.16b, v23.16b and v1.16b, v1.16b, v23.16b .endif bl gather16_neon uxtl v4.8h, v6.8b // scaling uxtl2 v5.8h, v6.16b ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) ushl v5.8h, v5.8h, v29.8h sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) sqrdmulh v17.8h, v17.8h, v5.8h usqadd v10.8h, v16.8h // *src + noise usqadd v11.8h, v17.8h umax v0.8h, v10.8h, v30.8h umax v1.8h, v11.8h, v30.8h umin v0.8h, v0.8h, v31.8h umin v1.8h, v1.8h, v31.8h .if \oy mov v16.16b, v25.16b .endif subs w9, w9, #1 .if \oy mov v25.16b, v26.16b mov v26.16b, v16.16b .endif st1 {v0.8h, v1.8h}, [x0], x2 // dst b.gt 1b .if \oy cmp w12, #0 mov w9, w12 // restore actual remaining h b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx1 0, 0, 0 fguv_loop_sx1 0, 0, 1 fguv_loop_sx1 0, 1, 0 fguv_loop_sx1 0, 1, 1 fguv_loop_sx1 1, 0, 0 fguv_loop_sx1 1, 0, 1 fguv_loop_sx1 1, 1, 0 fguv_loop_sx1 1, 1, 1 9: ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret L(fguv_loop_sx1_tbl): .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) endfunc