/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

.macro increment_seed steps, shift=1
        lsr             r11, r2,  #3
        lsr             r12, r2,  #12
        lsr             lr,  r2,  #1
        eor             r11, r2,  r11           // (r >> 0) ^ (r >> 3)
        eor             r12, r12, lr            // (r >> 12) ^ (r >> 1)
        eor             r11, r11, r12           // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             r2,  r2,  #\steps
.endif
        and             r11, r11, #((1 << \steps) - 1) // bit
.if \shift
        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
.else
        orr             r2,  r2,  r11, lsl #16  // *state
.endif
.endm

.macro read_rand dest, bits, age
        ubfx            \dest, r2, #16 - \bits - \age, #\bits
.endm

.macro read_shift_rand dest, bits
        ubfx            \dest, r2, #17 - \bits, #\bits
        lsr             r2,  r2,  #1
.endm

// special calling convention:
// r2 holds seed
// r3 holds dav1d_gaussian_sequence
// clobbers r11-r12
// returns in d0-d1
function get_gaussian_neon
        push            {r5-r6,lr}
        increment_seed  4
        read_rand       r5,  11,  3
        read_rand       r6,  11,  2
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[0]}, [r5]
        read_rand       r5,  11,  1
        vld1.16         {d0[1]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  0
        increment_seed  4
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d0[2]}, [r5]
        read_rand       r5,  11,  3
        vld1.16         {d0[3]}, [r6]
        add             r5,  r3,  r5,  lsl #1
        read_rand       r6,  11,  2
        vld1.16         {d1[0]}, [r5]
        add             r6,  r3,  r6,  lsl #1
        read_rand       r5,  11,  1
        vld1.16         {d1[1]}, [r6]
        read_rand       r6,  11,  0
        add             r5,  r3,  r5,  lsl #1
        add             r6,  r3,  r6,  lsl #1
        vld1.16         {d1[2]}, [r5]
        vld1.16         {d1[3]}, [r6]
        pop             {r5-r6,pc}
endfunc

function get_grain_2_neon
        push            {r11,lr}
        increment_seed  2
        read_rand       r11, 11,  1
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        vld1.16         {d0[1]}, [r12]
        vrshl.s16       d0,  d0,  d30
        pop             {r11,pc}
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, d0
        vmov            \dst, d0
.endif
.endm
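
// For reference, a rough C sketch of what increment_seed/read_rand compute
// per output entry. This mirrors the scalar AV1 grain PRNG; the function
// name here is illustrative, not one of dav1d's exact C helpers:
//
//   int get_random_number(const int bits, unsigned *const state) {
//       unsigned r = *state;
//       // taps 0, 1, 3 and 12 of a 16 bit LFSR
//       unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
//       r = (r >> 1) | (bit << 15);
//       *state = r;
//       return (r >> (16 - bits)) & ((1 << bits) - 1);
//   }
//
// increment_seed runs several LFSR steps at once; read_rand/read_shift_rand
// then extract 11 bit values of a given age from the state, used as indices
// into dav1d_gaussian_sequence (the grain value table).
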
function get_grain_4_neon
        push            {r11,lr}
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[0]}, [r11]
        read_rand       r11, 11,  1
        vld1.16         {d0[1]}, [r12]
        read_rand       r12, 11,  0
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d0[2]}, [r11]
        vld1.16         {d0[3]}, [r12]
        vrshl.s16       d0,  d0,  d30
        pop             {r11,pc}
endfunc

.macro get_grain_4 dst
        bl              get_grain_4_neon
.ifnc \dst, d0
        vmov            \dst, d0
.endif
.endm

// r1 holds the number of entries to produce
// r6, r8 and r10 hold the previous output entries
// q0 holds the vector of produced entries
// q1 holds the input vector of sums from above
.macro output_lag n
function output_lag\n\()_neon
        push            {r0, lr}
.if \n == 1
        mvn             lr,  r5                 // grain_min = ~grain_max
.else
        mov             r0,  #1
        mov             lr,  #1
        sub             r7,  r7,  #1
        sub             r9,  r9,  #1
        lsl             r0,  r0,  r7
        lsl             lr,  lr,  r9
        add             r7,  r7,  #1
        add             r9,  r9,  #1
.endif
1:
        read_shift_rand r12, 11
        vmov.32         r11, d2[0]
        lsl             r12, r12, #1
        vext.8          q0,  q0,  q0,  #2
        ldrsh           r12, [r3, r12]
.if \n == 1
        mla             r11, r6,  r4,  r11      // sum (above) + *coeff * prev output
        add             r6,  r11, r8            // 1 << (ar_coeff_shift - 1)
        add             r12, r12, r10
        asr             r6,  r6,  r7            // >> ar_coeff_shift
        asr             r12, r12, r9            // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             r6,  r6,  r12
        cmp             r6,  r5
.elseif \n == 2
        mla             r11, r8,  r4,  r11      // sum (above) + *coeff * prev output 1
        mla             r11, r6,  r10, r11      // += *coeff * prev output 2
        mov             r8,  r6
        add             r6,  r11, r0            // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr            // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             r6,  r6,  r7            // >> ar_coeff_shift
        asr             r12, r12, r9            // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mvn             lr,  r5                 // grain_min = ~grain_max
.else
        push            {r1-r3}
        sbfx            r1,  r4,  #0,  #8
        sbfx            r2,  r4,  #8,  #8
        sbfx            r3,  r4,  #16, #8
        mla             r11, r10, r1,  r11      // sum (above) + *coeff * prev output 1
        mla             r11, r8,  r2,  r11      // sum (above) + *coeff * prev output 2
        mla             r11, r6,  r3,  r11      // += *coeff * prev output 3
        pop             {r1-r3}
        mov             r10, r8
        mov             r8,  r6
        add             r6,  r11, r0            // 1 << (ar_coeff_shift - 1)
        add             r12, r12, lr            // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             r6,  r6,  r7            // >> ar_coeff_shift
        asr             r12, r12, r9            // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             r6,  r6,  r12
        push            {lr}
        cmp             r6,  r5
        mvn             lr,  r5                 // grain_min = ~grain_max
.endif
        it              gt
        movgt           r6,  r5
        cmp             r6,  lr
        it              lt
        movlt           r6,  lr
.if \n >= 2
        pop             {lr}
.endif
        subs            r1,  r1,  #1
        vext.8          q1,  q1,  q1,  #4
        vmov.16         d1[3], r6
        bgt             1b
        pop             {r0, pc}
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3

function sum_lag1_above_neon
        sub             r12, r0,  #1*GRAIN_WIDTH*2 - 16
        vld1.16         {q10}, [r12]            // load top right

        vext.8          q0,  q8,  q9,  #14      // top left, top mid
        vext.8          q1,  q9,  q10, #2       // top mid, top right

        vmull.s16       q2,  d18, d28
        vmlal.s16       q2,  d0,  d27
        vmlal.s16       q2,  d2,  d29
        vmull.s16       q3,  d19, d28
        vmlal.s16       q3,  d1,  d27
        vmlal.s16       q3,  d3,  d29

        vmov            q8,  q9
        vmov            q9,  q10

        bx              lr
endfunc
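
// sum_lag1_above_neon computes, for 8 consecutive x positions, the lag-1
// contribution from the row above; the within-row feedback term is added
// later by output_lag1_neon. A scalar sketch (illustrative indexing, with
// d27/d28/d29 holding the three widened AR coefficients):
//
//   sum[x] = ar_coeffs[0] * buf[y-1][x-1]
//          + ar_coeffs[1] * buf[y-1][x]
//          + ar_coeffs[2] * buf[y-1][x+1];
//
// q8/q9 carry the previous and current top-row vectors between calls, so
// each call only needs to load the next 8 top-row entries into q10.
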
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
.ifc \lag\()_\edge, lag3_left
        bl              sum_lag3_left_above_neon
.else
        bl              sum_\lag\()_above_neon
.endif
.ifc \type, uv_420
        vpush           {q6-q7}
        add             r12, r11, #GRAIN_WIDTH*2
        vld1.16         {q0, q1}, [r11]!
        vld1.16         {q6, q7}, [r12]!
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vpadd.i16       d12, d12, d13
        vpadd.i16       d13, d14, d15
        vadd.i16        q0,  q0,  q6
        vpop            {q6-q7}
        vrshr.s16       q0,  q0,  #2
.endif
.ifc \type, uv_422
        vld1.16         {q0, q1}, [r11]!
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vrshr.s16       q0,  q0,  #1
.endif
.ifc \type, uv_444
        vld1.16         {q0}, [r11]!
.endif
.if \uv_layout
.ifnb \uv_coeff
        vdup.8          d13, \uv_coeff
        vmovl.s8        q6,  d13
.endif
        vmlal.s16       q2,  d0,  d13
        vmlal.s16       q3,  d1,  d13
.endif
.if \uv_layout && \elems == 8
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 7
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 1
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
        push            {r11}
.if \elems > 4
.ifc \edge, left
        increment_seed  4
        read_rand       r11, 11,  3
        read_rand       r12, 11,  2
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d1[1]}, [r11]
        read_rand       r11, 11,  1
        vld1.16         {d1[2]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d1[3]}, [r11]
        lsl             r2,  r2,  #1            // shift back the state as if we'd done increment_seed with shift=0
        vrshl.s16       d1,  d1,  d30
        vext.8          q2,  q2,  q2,  #12
.ifc \lag, lag3
        vmov.s16        r10, d1[1]
.endif
.ifnc \lag, lag1
        vmov.s16        r8,  d1[2]
.endif
        vmov.s16        r6,  d1[3]
        vmov            q1,  q2
        mov             r1,  #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        vmov            q1,  q2
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        vmov            q1,  q3
.ifc \edge, right
        mov             r1,  #3
        bl              output_\lag\()_neon
        read_shift_rand r12, 11
        add             r12, r3,  r12, lsl #1
        vld1.16         {d2[0]}, [r12]
        vrshl.s16       d2,  d2,  d30
        vext.8          q0,  q0,  q1,  #2
.else
        mov             r1,  #4
        bl              output_\lag\()_neon
.endif
.else // elems == 1
        increment_seed  4, shift=0
        vmov            q1,  q2
        mov             r1,  #1
        bl              output_\lag\()_neon
        lsr             r2,  r2,  #3

        read_rand       r11, 11,  2
        read_rand       r12, 11,  1
        add             r11, r3,  r11, lsl #1
        add             r12, r3,  r12, lsl #1
        vld1.16         {d2[0]}, [r11]
        read_rand       r11, 11,  0
        vld1.16         {d2[1]}, [r12]
        add             r11, r3,  r11, lsl #1
        vld1.16         {d2[2]}, [r11]
        vrshl.s16       d2,  d2,  d30
        vext.8          q0,  q0,  q1,  #14
.endif
        vst1.16         {q0}, [r0]!
        pop             {r11}
        pop             {r1, pc}
.endif
.endm

.macro sum_lag1_func type, uv_layout, edge, elems=8
function sum_\type\()_lag1_\edge\()_neon
        push            {r1, lr}
.ifc \edge, left
        sub             r12, r0,  #1*GRAIN_WIDTH*2
        vld1.8          {q9}, [r12]             // load the previous block right above
.endif
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
endfunc
.endm

sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 7
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 7
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 1
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 1
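
// In sum_lag_n_body above, the uv_420/uv_422 paths fold the luma grain down
// to chroma resolution before multiplying with the luma coefficient.
// Roughly, in C terms (a sketch, not dav1d's exact reference code):
//
//   // 420: average a 2x2 luma block, with rounding
//   luma = (l[0][2*x] + l[0][2*x + 1] + l[1][2*x] + l[1][2*x + 1] + 2) >> 2;
//   // 422: average a horizontal 2x1 pair
//   luma = (l[0][2*x] + l[0][2*x + 1] + 1) >> 1;
//
// which is what the vpadd/vadd/vrshr sequences implement, 8 entries at a time.
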
function sum_lag2_above_neon
        push            {lr}
        sub             r12, r0,  #2*GRAIN_WIDTH*2 - 16
        sub             lr,  r0,  #1*GRAIN_WIDTH*2 - 16
        vld1.16         {q10}, [r12]            // load top right
        vld1.16         {q13}, [lr]

        vdup.8          d10, d28[0]
        vext.8          q0,  q8,  q9,  #12      // top left, top mid
        vdup.8          d12, d28[1]
        vext.8          q1,  q8,  q9,  #14
        vdup.8          d14, d28[3]
        vext.8          q4,  q9,  q10, #2       // top mid, top right
        vmovl.s8        q5,  d10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14

        vmull.s16       q2,  d0,  d10
        vmlal.s16       q2,  d2,  d12
        vmlal.s16       q2,  d8,  d14
        vmull.s16       q3,  d1,  d10
        vmlal.s16       q3,  d3,  d12
        vmlal.s16       q3,  d9,  d14

        vdup.8          d10, d28[4]
        vext.8          q0,  q9,  q10, #4       // top mid, top right
        vdup.8          d12, d28[5]
        vext.8          q1,  q11, q12, #12      // top left, top mid
        vdup.8          d14, d28[6]
        vext.8          q4,  q11, q12, #14
        vmovl.s8        q5,  d10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14

        vmlal.s16       q2,  d0,  d10
        vmlal.s16       q2,  d2,  d12
        vmlal.s16       q2,  d8,  d14
        vmlal.s16       q3,  d1,  d10
        vmlal.s16       q3,  d3,  d12
        vmlal.s16       q3,  d9,  d14

        vdup.8          d10, d29[0]
        vext.8          q0,  q12, q13, #2       // top mid, top right
        vdup.8          d12, d29[1]
        vext.8          q1,  q12, q13, #4
        vdup.8          d14, d28[2]
        vdup.8          d8,  d28[7]
        vmovl.s8        q5,  d10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q4,  d8

        vmlal.s16       q2,  d0,  d10
        vmlal.s16       q2,  d2,  d12
        vmlal.s16       q2,  d18, d14
        vmlal.s16       q2,  d24, d8
        vmlal.s16       q3,  d1,  d10
        vmlal.s16       q3,  d3,  d12
        vmlal.s16       q3,  d19, d14
        vmlal.s16       q3,  d25, d8

        vmov            q8,  q9
        vmov            q9,  q10
        vmov            q11, q12
        vmov            q12, q13

        pop             {pc}
endfunc

.macro sum_lag2_func type, uv_layout, edge, elems=8
function sum_\type\()_lag2_\edge\()_neon
        push            {r1, lr}
.ifc \edge, left
        sub             r12, r0,  #2*GRAIN_WIDTH*2
        sub             lr,  r0,  #1*GRAIN_WIDTH*2
        vld1.16         {q9},  [r12]            // load the previous block right above
        vld1.16         {q12}, [lr]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 7
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 7
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 1
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 1

function sum_lag3_left_above_neon
        // A separate codepath for the left edge, to avoid reading outside
        // of the edge of the buffer.
        sub             r12, r0,  #3*GRAIN_WIDTH*2
        vld1.8          {q11, q12}, [r12]
        vext.8          q12, q11, q12, #10
        vext.8          q11, q11, q11, #10
        b               sum_lag3_above_start
endfunc

function sum_lag3_above_neon
        movw            r12, #(3*GRAIN_WIDTH + 3)*2
        sub             r12, r0,  r12
        vld1.8          {q11, q12}, [r12]

sum_lag3_above_start:
        vdup.8          d12, d26[0]
        vext.8          q1,  q11, q12, #2
        vdup.8          d14, d26[1]
        vext.8          q4,  q11, q12, #4
        vdup.8          d16, d26[2]
        vext.8          q5,  q11, q12, #6
        vdup.8          d18, d26[3]
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        movw            r12, #(2*GRAIN_WIDTH + 3)*2
        sub             r12, r0,  r12

        vmull.s16       q2,  d22, d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d10, d18
        vmull.s16       q3,  d23, d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d26[4]
        vext.8          q0,  q11, q12, #8
        vdup.8          d14, d26[5]
        vext.8          q1,  q11, q12, #10
        vdup.8          d16, d26[6]
        vext.8          q4,  q11, q12, #12
        vld1.8          {q11, q12}, [r12]
        vdup.8          d18, d26[7]
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d22, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d23, d18

        vdup.8          d12, d27[0]
        vext.8          q0,  q11, q12, #2
        vdup.8          d14, d27[1]
        vext.8          q1,  q11, q12, #4
        vdup.8          d16, d27[2]
        vext.8          q4,  q11, q12, #6
        vdup.8          d18, d27[3]
        vext.8          q5,  q11, q12, #8
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        sub             r12, r0,  #(1*GRAIN_WIDTH + 3)*2

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d10, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d27[4]
        vext.8          q0,  q11, q12, #10
        vdup.8          d14, d27[5]
        vext.8          q1,  q11, q12, #12
        vld1.8          {q11, q12}, [r12]
        vdup.8          d16, d27[6]
        vdup.8          d18, d27[7]
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vext.8          q5,  q11, q12, #2
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d22, d16
        vmlal.s16       q2,  d10, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d23, d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d28[0]
        vext.8          q0,  q11, q12, #4
        vdup.8          d14, d28[1]
        vext.8          q1,  q11, q12, #6
        vdup.8          d16, d28[2]
        vext.8          q4,  q11, q12, #8
        vdup.8          d18, d28[3]
        vext.8          q5,  q11, q12, #10
        vmovl.s8        q6,  d12
        vmovl.s8        q7,  d14
        vmovl.s8        q8,  d16
        vmovl.s8        q9,  d18

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q2,  d2,  d14
        vmlal.s16       q2,  d8,  d16
        vmlal.s16       q2,  d10, d18
        vmlal.s16       q3,  d1,  d12
        vmlal.s16       q3,  d3,  d14
        vmlal.s16       q3,  d9,  d16
        vmlal.s16       q3,  d11, d18

        vdup.8          d12, d28[4]
        vext.8          q0,  q11, q12, #12
        vmovl.s8        q6,  d12

        vmlal.s16       q2,  d0,  d12
        vmlal.s16       q3,  d1,  d12

        bx              lr
endfunc
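
// Taken together, sum_lagN_above_neon and output_lagN_neon implement the
// auto-regressive recurrence for each new grain entry. In rough C terms
// (an illustrative scalar sketch, using dav1d's round2/iclip helpers):
//
//   int grain = round2(dav1d_gaussian_sequence[get_random_number(11)],
//                      4 - bitdepth_min_8 + grain_scale_shift);
//   int sum   = /* lag-N weighted sum over the rows above */;
//   for (int i = 1; i <= N; i++)
//       sum += ar_coeff[i] * buf[y][x - i];   // previous outputs, same row
//   buf[y][x] = iclip(grain + round2(sum, ar_coeff_shift),
//                     grain_min, grain_max);
//
// The within-row feedback forces the scalar loop in output_lagN_neon; only
// the above-row sums are vectorized.
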
.macro sum_lag3_func type, uv_layout, edge, elems=8
function sum_\type\()_lag3_\edge\()_neon
        push            {r1, lr}
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
endfunc
.endm

sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 7
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 7
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 1
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 1

function generate_grain_rows_neon
        push            {r10-r11,lr}
1:
        mov             r10, #80
2:
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        subs            r10, r10, #8
        vst1.16         {q0}, [r0]!
        bgt             2b
        get_grain_2     d0
        subs            r1,  r1,  #1
        vst1.32         {d0[0]}, [r0]!
        bgt             1b
        pop             {r10-r11,pc}
endfunc

function generate_grain_rows_44_neon
        push            {r10-r11,lr}
1:
        mov             r10, #40
2:
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
        subs            r10, r10, #8
        vst1.16         {q0}, [r0]!
        bgt             2b
        get_grain_4     d0
        subs            r1,  r1,  #1
        vst1.16         {d0}, [r0]
        add             r0,  r0,  #GRAIN_WIDTH*2-80
        bgt             1b
        pop             {r10-r11,pc}
endfunc

function gen_grain_uv_444_lag0_neon
        vld1.16         {q3}, [r11]!
gen_grain_uv_lag0_8_start:
        push            {r11,lr}
        bl              get_gaussian_neon
        vrshl.s16       q0,  q0,  q15
gen_grain_uv_lag0_8_add:
        vand            q3,  q3,  q1
        vmull.s16       q2,  d6,  d22
        vmull.s16       q3,  d7,  d22
        vrshl.s32       q2,  q2,  q12
        vrshl.s32       q3,  q3,  q12
        vqmovn.s32      d4,  q2
        vqmovn.s32      d5,  q3
        vqadd.s16       q2,  q2,  q0
        vmin.s16        q2,  q2,  q9
        vmax.s16        q2,  q2,  q10
        vst1.16         {q2}, [r0]!
        pop             {r11,pc}
endfunc

function gen_grain_uv_420_lag0_8_neon
        add             r12, r11, #GRAIN_WIDTH*2
        vld1.16         {q2, q3}, [r11]!
        vld1.16         {q4, q5}, [r12]
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d5,  d6,  d7
        vpadd.i16       d8,  d8,  d9
        vpadd.i16       d9,  d10, d11
        vadd.i16        q2,  q2,  q4
        vrshr.s16       q3,  q2,  #2
        b               gen_grain_uv_lag0_8_start
endfunc

function gen_grain_uv_422_lag0_8_neon
        vld1.16         {q2, q3}, [r11]!
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d5,  d6,  d7
        vrshr.s16       q3,  q2,  #1
        b               gen_grain_uv_lag0_8_start
endfunc

function gen_grain_uv_420_lag0_4_neon
        add             r12, r11, #GRAIN_WIDTH*2
        vld1.16         {q2}, [r11]
        vld1.16         {q0}, [r12]
        add             r11, r11, #32
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d0,  d0,  d1
        vadd.i16        d4,  d4,  d0
        vrshr.s16       d6,  d4,  #2
        push            {r11,lr}
        get_grain_4     d0
        b               gen_grain_uv_lag0_8_add
endfunc

function gen_grain_uv_422_lag0_4_neon
        vld1.16         {q2}, [r11]
        add             r11, r11, #32
        vpadd.i16       d4,  d4,  d5
        vrshr.s16       d6,  d4,  #1
        push            {r11,lr}
        get_grain_4     d0
        b               gen_grain_uv_lag0_8_add
endfunc
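
// The gen_grain_uv_*_lag0_* helpers above implement the chroma lag-0 case:
// fresh gaussian noise plus a scaled copy of the (possibly subsampled) luma
// grain. Approximately, per entry (a C sketch with illustrative names):
//
//   int luma  = /* luma grain, averaged to chroma resolution for 420/422 */;
//   int noise = round2(dav1d_gaussian_sequence[get_random_number(11)],
//                      4 - bitdepth_min_8 + grain_scale_shift);
//   int grain = noise + round2(ar_coeff_uv[0] * luma, ar_coeff_shift);
//   buf[y][x] = iclip(grain, grain_min, grain_max);
//
// q11 holds the widened ar_coeff_uv[0], q12 the negated ar_coeff_shift
// (vrshl with a negative shift is a rounding right shift), q9/q10 hold
// grain_max/grain_min, and q1 masks off lanes past the row edges.
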
.macro gen_grain_82 type
function generate_grain_\type\()_16bpc_neon, export=1
        push            {r4-r11,lr}

.ifc \type, uv_444
        ldr             r4,  [sp, #36]
        mov             r12, r3
        mov             lr,  #28
        add             r11, r1,  #3*GRAIN_WIDTH*2
        mov             r1,  r2
        mul             r12, r12, lr
        clz             lr,  r4
.else
        clz             lr,  r2
.endif
        movrel          r3,  X(gaussian_sequence)
        sub             lr,  lr,  #24           // -bitdepth_min_8
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             r4,  r1,  #FGD_AR_COEFFS_Y
.else
        add             r4,  r1,  #FGD_AR_COEFFS_UV
.endif
        add             r9,  r9,  lr            // grain_scale_shift - bitdepth_min_8
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9                 // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15

.ifc \type, uv_444
        push            {lr}
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12           // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10
        pop             {lr}
.endif

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        neg             lr,  lr                 // bitdepth_min_8
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7            // 1 << ar_coeff_shift
        lsl             r10, r10, r9            // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1            // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1            // 1 << (4 + data->grain_scale_shift - 1)
        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, y
        mov             r1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        mvn             r6,  r5                 // grain_min = ~grain_max

        mov             r1,  #3
        bl              generate_grain_rows_neon
        mov             r1,  #GRAIN_HEIGHT-3

        vdup.32         q12, r7
        vld1.8          {d22[]}, [r4]           // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vdup.16         q9,  r5
        vdup.16         q10, r6
        vext.8          q13, q0,  q1,  #10
        vext.8          q14, q1,  q0,  #2
        vneg.s32        q12, q12
        vmovl.s8        q11, d22

1:
        vmov            q1,  q13
        bl              gen_grain_uv_444_lag0_neon // 8
        vmov.i8         q1,  #255
        bl              gen_grain_uv_444_lag0_neon // 16
        bl              gen_grain_uv_444_lag0_neon // 24
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 40
        bl              gen_grain_uv_444_lag0_neon // 48
        bl              gen_grain_uv_444_lag0_neon // 56
        bl              gen_grain_uv_444_lag0_neon // 64
        bl              gen_grain_uv_444_lag0_neon // 72
        vmov            q1,  q14
        bl              gen_grain_uv_444_lag0_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
        add             r11, r11, #4
        vst1.32         {d16[0]}, [r0]!
        bgt             1b
.endif
        pop             {r4-r11,pc}
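
// Layout note for the 82-wide tables generated here: each row is
// GRAIN_WIDTH (82) entries, written as ten 8-entry vectors (80 values) plus
// one final get_grain_2 pair; the first 3 rows and the outermost 3 columns
// keep their plain gaussian values and only pad the AR neighborhood. For
// uv_444, the seed is first xored with 0xb524 (u) or 0x49d8 (v), matching
// the reference generate_grain_uv.
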
L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        vld1.8          {d27[]}, [r4]!          // ar_coeffs_y[0]
        vld1.8          {d28[]}, [r4]!          // ar_coeffs_y[1]
        vld1.8          {d29[]}, [r4]           // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           r4,  [r4, #1]           // ar_coeffs_y[3]
.else
        add             r4,  r4,  #2
.endif

        mov             r1,  #3
.ifc \type, uv_444
        vld1.8          {d13[]}, [r4]           // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1]          // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon
        vmovl.s8        q13, d27
        vmovl.s8        q12, d29
        vmovl.s8        q14, d28
        vmov            d29, d24
.ifc \type, uv_444
        vmovl.s8        q6,  d13
.endif

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_mid_neon   // 48
        bl              sum_\type\()_lag1_mid_neon   // 56
        bl              sum_\type\()_lag1_mid_neon   // 64
        bl              sum_\type\()_lag1_mid_neon   // 72
        bl              sum_\type\()_lag1_right_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #4
.endif
        vst1.32         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        vld1.8          {d28,d29}, [r4]         // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_neon

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_mid_neon   // 48
        bl              sum_\type\()_lag2_mid_neon   // 56
        bl              sum_\type\()_lag2_mid_neon   // 64
        bl              sum_\type\()_lag2_mid_neon   // 72
        bl              sum_\type\()_lag2_right_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #4
.endif
        vst1.32         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        vld1.8          {q13, q14}, [r4]        // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]
        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        vpush           {d26}
        bl              generate_grain_rows_neon
        vpop            {d26}

        mov             r1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_mid_neon   // 48
        bl              sum_\type\()_lag3_mid_neon   // 56
        bl              sum_\type\()_lag3_mid_neon   // 64
        bl              sum_\type\()_lag3_mid_neon   // 72
        bl              sum_\type\()_lag3_right_neon // 80
        get_grain_2     d16
        subs            r1,  r1,  #1
.ifc \type, uv_444
        add             r11, r11, #4
.endif
        vst1.32         {d16[0]}, [r0]!
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

gen_grain_82 y
gen_grain_82 uv_444

.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst,  #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst,  #GRAIN_HEIGHT-3
.endif
.endm

.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
.else
        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
.endif
.endm
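
// The 4:2:0/4:2:2 chroma grain tables below are only SUB_GRAIN_WIDTH (44)
// by SUB_GRAIN_HEIGHT (38) entries, but rows keep the full GRAIN_WIDTH*2
// byte pitch; each row is produced as 8+8+8+8+8+4 = 44 values. After a row,
// increment_y_ptr advances the luma read pointer past the 6*32 bytes
// consumed within the row, by two source rows for uv_420 (vertical
// subsampling) and one row for uv_422.
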
.macro gen_grain_44 type
function generate_grain_\type\()_16bpc_neon, export=1
        push            {r4-r11,lr}

        ldr             r4,  [sp, #36]
        mov             r12, r3
        movw            r11, #(3*GRAIN_WIDTH-3)*2
        mov             lr,  #28
        add             r11, r1,  r11
        mov             r1,  r2
        mul             r12, r12, lr
        clz             lr,  r4

        movrel          r3,  X(gaussian_sequence)
        sub             lr,  lr,  #24           // -bitdepth_min_8
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
        add             r4,  r1,  #FGD_AR_COEFFS_UV
        add             r9,  r9,  lr            // grain_scale_shift - bitdepth_min_8
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9                 // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15

        push            {lr}
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12           // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10
        pop             {lr}

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        neg             lr,  lr
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7            // 1 << ar_coeff_shift
        lsl             r10, r10, r9            // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1            // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1            // 1 << (4 + data->grain_scale_shift - 1)
        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

L(generate_grain_\type\()_lag0):
.ifc \type, uv_420
        vpush           {q4-q5}
.endif
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        mvn             r6,  r5                 // grain_min = ~grain_max

        mov             r1,  #3
        bl              generate_grain_rows_44_neon
        set_height      r1,  \type

        vdup.32         q12, r7
        vld1.8          {d22[]}, [r4]           // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vdup.16         q9,  r5
        vdup.16         q10, r6
        vext.8          q13, q0,  q1,  #10
        vext.8          q14, q1,  q0,  #14
        vneg.s32        q12, q12
        vmovl.s8        q11, d22

1:
        vmov            q1,  q13
        bl              gen_grain_\type\()_lag0_8_neon // 8
        vmov.i8         q1,  #255
        bl              gen_grain_\type\()_lag0_8_neon // 16
        bl              gen_grain_\type\()_lag0_8_neon // 24
        bl              gen_grain_\type\()_lag0_8_neon // 32
        bl              gen_grain_\type\()_lag0_8_neon // 40
        vmov            q1,  q14
        bl              gen_grain_\type\()_lag0_4_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

.ifc \type, uv_420
        vpop            {q4-q5}
.endif
        pop             {r4-r11,pc}
L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        vld1.8          {d27[]}, [r4]!          // ar_coeffs_uv[0]
        vld1.8          {d28[]}, [r4]!          // ar_coeffs_uv[1]
        vld1.8          {d29[]}, [r4]           // ar_coeffs_uv[2]
        add             r4,  r4,  #2

        mov             r1,  #3
        vld1.8          {d13[]}, [r4]           // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1]          // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon
        vmovl.s8        q13, d27
        vmovl.s8        q12, d29
        vmovl.s8        q14, d28
        vmov            d29, d24
        vmovl.s8        q6,  d13

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        vld1.8          {d28,d29}, [r4]         // ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr            // 128 << bitdepth_min_8
        sub             r5,  r5,  #1            // (128 << bitdepth_min_8) - 1
        vld1.8          {q13, q14}, [r4]        // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]
        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
        vmov.u16        r11, \src1[0+\off]
        vmov.u16        r12, \src3[0+\off]
        add             r11, r11, r3
        vmov.u16        lr,  \src1[2+\off]
        add             r12, r12, r3
        vld1.8          {\dst1[0+\off]}, [r11]
        vmov.u16        r11, \src3[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst2[0+\off]}, [r12]
        vmov.u16        r12, \src2[0+\off]
        add             r11, r11, r3
        vld1.8          {\dst1[2+\off]}, [lr]
        vmov.u16        lr,  \src4[0+\off]
        add             r12, r12, r3
        vld1.8          {\dst2[2+\off]}, [r11]
        vmov.u16        r11, \src2[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst1[4+\off]}, [r12]
        vmov.u16        r12, \src4[2+\off]
        add             r11, r11, r3
        vld1.8          {\dst2[4+\off]}, [lr]
        add             r12, r12, r3
        vld1.8          {\dst1[6+\off]}, [r11]
        vld1.8          {\dst2[6+\off]}, [r12]
.endm

.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
.endm

function gather32_neon
        push            {r11-r12,lr}
        gather          d8,  d9,  d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
        pop             {r11-r12,pc}
endfunc

function gather16_neon
        push            {r11-r12,lr}
        gather_interleaved d8, d9, d0, d1, d2, d3, 0
        gather_interleaved d8, d9, d0, d1, d2, d3, 1
        pop             {r11-r12,pc}
endfunc

const overlap_coeffs_0, align=4
        .short          27, 17, 0,  0
        .short          17, 27, 32, 32
endconst

const overlap_coeffs_1, align=4
        .short          23, 0,  0,  0
        .short          22, 32, 32, 32
endconst
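
// overlap_coeffs_0/_1 hold the AV1 grain overlap weights: {27, 17} and the
// reversed {17, 27} between rows/columns at full resolution, {23, 22} for
// the single overlapped column when horizontally subsampled. A blend of old
// and new grain is, in C terms (a sketch):
//
//   g = round2(old * w[0] + new * w[1], 5);
//   g = iclip(g, grain_min, grain_max);
//
// The trailing 32s in each table let the same vmull/vmlal pattern pass
// non-overlapped lanes through unchanged (0*old + 32*new == new after the
// >> 5).
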
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF      // randval & 0xF
        lsr             \offx, \src,  #4        // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy     // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx     // 2 * (randval >> 4)
.endif
.endm

.macro add_offset dst, offx, offy, src, stride
        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst,  \offx, lsl #1 // grain_lut += offx
.endm

// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]    // scaling_shift, grain_lut
        ldrd            r6,  r7,  [sp, #108]    // offsets, h
        ldr             r8,  [sp, #116]         // clip
        mov             r9,  #GRAIN_WIDTH*2     // grain_lut stride
        ldr             r10, [sp, #124]         // bitdepth_max

        eor             r4,  r4,  #15           // 15 - scaling_shift
        vdup.16         q6,  r10                // bitdepth_max
        clz             r10, r10
        vdup.16         q13, r4                 // 15 - scaling_shift
        rsb             r10, r10, #24           // bitdepth_min_8
        cmp             r8,  #0
        vdup.16         q12, r10                // bitdepth_min_8

        movrel_local    r12, overlap_coeffs_0

        beq             1f
        // clip
        vmov.i16        q14, #16
        vmov.i16        q15, #235
        vshl.s16        q14, q14, q12
        vshl.s16        q15, q15, q12
        b               2f
1:
        // no clip
        vmov.i16        q14, #0
        vmov            q15, q6
2:
        vshr.u16        q6,  q6,  #1            // grain_max

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        add             r5,  r5,  #18           // grain_lut += 9
        add             r5,  r5,  r9,  lsl #3   // grain_lut += 8 * grain_stride
        add             r5,  r5,  r9            // grain_lut += grain_stride

        ldr             r10, [r6, #8]           // offsets[1][0]
        calc_offset     r10, r4,  r10, 0,  0
        add_offset      r4,  r10, r4,  r5,  r9
        ldr             r10, [r6, #4]           // offsets[0][1]
        calc_offset     r10, r11, r10, 0,  0
        add_offset      r11, r10, r11, r5,  r9
        ldr             r10, [r6, #12]          // offsets[1][1]
        calc_offset     r10, r8,  r10, 0,  0
        add_offset      r8,  r10, r8,  r5,  r9
        ldr             r6,  [r6]               // offsets[0][0]
        calc_offset     r6,  lr,  r6,  0,  0
        add_offset      r5,  r6,  lr,  r5,  r9

        add             r4,  r4,  #32*2         // grain_lut += BLOCK_SIZE * bx
        add             r6,  r11, r9,  lsl #5   // grain_lut += grain_stride * BLOCK_SIZE * by

        ldr             r10, [sp, #120]         // type
        adr             r11, L(fgy_loop_tbl)

        tst             r10, #1
        ldr             r10, [r11, r10, lsl #2]

        add             r8,  r8,  r9,  lsl #5   // grain_lut += grain_stride * BLOCK_SIZE * by
        add             r8,  r8,  #32*2         // grain_lut += BLOCK_SIZE * bx

        add             r11, r11, r10

        beq             1f
        // y overlap
        vdup.16         d14, d24[0]
        vdup.16         d15, d24[1]
        mov             r10, r7                 // backup actual h
        mov             r7,  #2
1:
        sub             r2,  r2,  #32           // src_stride   -= 32
        sub             r9,  r9,  #32           // grain_stride -= 32
        bx              r11
endfunc
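
// calc_offset/add_offset turn one random 16 bit offset value into a pointer
// into grain_lut, matching the reference computation (a C sketch):
//
//   offx = 3 + (2 >> sx) * (3 + (randval >> 4));
//   offy = 3 + (2 >> sy) * (3 + (randval & 0xF));
//   grain = &grain_lut[offy][offx];
//
// The constant 3 + (2 >> s)*3 part (9 rows/columns at full resolution, 6
// when subsampled) is folded into r5 before the four calc_offset/add_offset
// pairs above; the macros only compute the randval-dependent term. Four
// pointers are set up, one per neighboring block needed for overlap.
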
function fgy_loop_neon
L(fgy_loop_tbl):
        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB

.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
        vld1.16         {d0},     [r4], r9      // grain_lut old
.endif
.if \oy
        vld1.16         {q2, q3}, [r6]!         // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},     [r8], r9      // grain_lut top old
.endif
.if \oy
        vld1.16         {q4, q5}, [r6], r9      // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0, q1}, [r1, :128]!   // src
.endif
        vld1.16         {q8,  q9},  [r5]!       // grain_lut
.if !\ox && !\oy
        vld1.16         {q2, q3}, [r1, :128], r2 // src
.endif
.if !\oy
        vmvn.i16        q5,  #0xf000            // 0x0fff
.endif
        vld1.16         {q10, q11}, [r5], r9    // grain_lut

.if \ox
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vmlal.s16       q0,  d16, d25
.endif

.if \oy
.if \ox
        add             r8,  r8,  #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vmvn            d0,  d12                // grain_min
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d16, d16, d12
        vmin.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d0
        vmax.s16        d4,  d4,  d0
.endif

        vmull.s16       q0,  d4,  d14
        vmull.s16       q1,  d5,  d14
        vmull.s16       q2,  d6,  d14
        vmull.s16       q3,  d7,  d14
        vmlal.s16       q0,  d16, d15
        vmlal.s16       q1,  d17, d15
        vmlal.s16       q2,  d18, d15
        vmlal.s16       q3,  d19, d15
        vmull.s16       q8,  d20, d15
        vmull.s16       q9,  d21, d15
        vmull.s16       q10, d22, d15
        vmull.s16       q11, d23, d15
        vmlal.s16       q8,  d8,  d14
        vmlal.s16       q9,  d9,  d14
        vmlal.s16       q10, d10, d14
        vmlal.s16       q11, d11, d14
        vmvn            q4,  q6                 // grain_min
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q6
        vmin.s16        q9,  q1,  q6
        vld1.16         {q0, q1}, [r1, :128]!   // src
        vmin.s16        q10, q2,  q6
        vmin.s16        q11, q3,  q6
        vmax.s16        q8,  q8,  q4
        vmax.s16        q9,  q9,  q4
        vld1.16         {q2, q3}, [r1, :128], r2 // src
        vmvn.i16        q5,  #0xf000            // 0x0fff
        vmax.s16        q10, q10, q4
        vmax.s16        q11, q11, q4
.elseif \ox
        vmvn            d4,  d12                // grain_min
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0, q1}, [r1, :128]!   // src
        vmin.s16        d16, d16, d12
        vmax.s16        d16, d16, d4
        vld1.16         {q2, q3}, [r1, :128], r2 // src
.endif

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q5
        vand            q1,  q1,  q5
        vand            q2,  q2,  q5
        vand            q3,  q3,  q5

        bl              gather32_neon

.if \ox || \oy
        vpush           {q6-q7}
.endif

        vmovl.u8        q6,  d8                 // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vshl.u16        q6,  q6,  q13           // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6            // round2(scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5

.if \ox || \oy
        vpop            {q6-q7}
.endif

        vqadd.s16       q0,  q0,  q8            // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

        vmax.s16        q0,  q0,  q14
        vmax.s16        q1,  q1,  q14
        vmax.s16        q2,  q2,  q14
        vmax.s16        q3,  q3,  q14
        vmin.s16        q0,  q0,  q15
        vmin.s16        q1,  q1,  q15
        vmin.s16        q2,  q2,  q15
        vmin.s16        q3,  q3,  q15

        vst1.16         {q0, q1}, [r0, :128]!   // dst
        subs            r7,  r7,  #1
.if \oy
        vdup.16         d14, d25[0]
        vdup.16         d15, d25[1]
.endif
        vst1.16         {q2, q3}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r10, #2
        sub             r7,  r10, #2            // restore actual remaining h
        bgt             L(loop_\ox\()0)
.endif
        vpop            {q4-q7}
        pop             {r4-r11,pc}
.endm

fgy 0, 0
fgy 0, 1
fgy 1, 0
fgy 1, 1
endfunc
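
// Per pixel, the loops above apply (a C sketch of the reference math):
//
//   int noise = round2(scaling[src[x]] * grain[x], scaling_shift);
//   dst[x] = iclip(src[x] + noise, clip_min, clip_max);
//
// Instead of a widening multiply plus shift, the 16 bpc path precomputes
// scaling[src] << (15 - scaling_shift) and multiplies with vqrdmulh, whose
// built-in doubling, rounding and >> 16 produce the same
// round2(scaling * grain, scaling_shift) for the value ranges involved.
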
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
//                                      const pixel *const src,
//                                      const ptrdiff_t stride,
//                                      const uint8_t scaling[SCALING_SIZE],
//                                      const Dav1dFilmGrainData *const data,
//                                      const entry grain_lut[][GRAIN_WIDTH],
//                                      const pixel *const luma_row,
//                                      const ptrdiff_t luma_stride,
//                                      const int offsets[][2],
//                                      const ptrdiff_t h, const ptrdiff_t uv,
//                                      const ptrdiff_t is_id,
//                                      const ptrdiff_t type,
//                                      const int bitdepth_max);
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]    // data, grain_lut
        ldrd            r10, r11, [sp, #124]    // uv, is_id
        ldr             r6,  [sp, #136]         // bitdepth_max

        clz             r7,  r6
        rsb             r7,  r7,  #24           // bitdepth_min_8

        // !csfl
        add             r10, r4,  r10, lsl #2   // + 4*uv
        add             r12, r10, #FGD_UV_LUMA_MULT
        add             lr,  r10, #FGD_UV_MULT
        ldrh            r10, [r10, #FGD_UV_OFFSET] // uv_offset
        vld1.16         {d30[]}, [r12]          // uv_luma_mult
        lsl             r10, r10, r7            // uv_offset << bitdepth_min_8
        vld1.16         {d30[1]}, [lr]          // uv_mult

        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        eor             lr,  lr,  #15           // 15 - scaling_shift

        vmov.16         d30[2], r10             // uv_offset << bitdepth_min_8

        cmp             r12, #0
        vdup.16         q13, lr                 // 15 - scaling_shift

        beq             1f
        // clip
        cmp             r11, #0
        mov             r8,  #16
        mov             r9,  #240
        lsl             r8,  r8,  r7
        lsl             r9,  r9,  r7
        beq             2f
        // is_id
        mov             r9,  #235
        lsl             r9,  r9,  r7
        b               2f
1:
        // no clip
        mov             r8,  #0
        mov             r9,  r6                 // bitdepth_max
2:
        vmov.16         d30[3], r6              // bitdepth_max
        vdup.16         d31, r8                 // clip_min

        mov             r10, #GRAIN_WIDTH*2     // grain_lut stride

.if \sy
        mov             r6,  #23
        mov             r7,  #22
.else
        mov             r6,  #27
        mov             r7,  #17
.endif

        vmov.16         d31[1], r9              // clip_max

        ldrd            r8,  r9,  [sp, #116]    // offsets, h

        add             r5,  r5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             r5,  r5,  r10, lsl #2   // grain_lut += 4 * grain_stride
        add             r5,  r5,  r10, lsl #1   // grain_lut += 2 * grain_stride
.else
        add             r5,  r5,  r10, lsl #3   // grain_lut += 8 * grain_stride
        add             r5,  r5,  r10           // grain_lut += grain_stride
.endif

        vmov.16         d31[2], r6              // overlap y [0]

        ldr             r12, [r8, #8]           // offsets[1][0]
        calc_offset     r12, r4,  r12, \sx, \sy
        add_offset      r4,  r12, r4,  r5,  r10
        ldr             r12, [r8, #4]           // offsets[0][1]
        calc_offset     r12, lr,  r12, \sx, \sy
        add_offset      lr,  r12, lr,  r5,  r10
        ldr             r12, [r8, #12]          // offsets[1][1]
        calc_offset     r12, r11, r12, \sx, \sy
        add_offset      r11, r12, r11, r5,  r10
        ldr             r8,  [r8]               // offsets[0][0]
        calc_offset     r8,  r12, r8,  \sx, \sy
        add_offset      r5,  r8,  r12, r5,  r10

        vmov.16         d31[3], r7              // overlap y [1]

        add             r4,  r4,  #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
        add             r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx

        movrel_local    r12, overlap_coeffs_\sx

        ldr             lr,  [sp, #132]         // type
        ldrd            r6,  r7,  [sp, #108]    // luma_row, luma_stride

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
        // This uses movrel_local instead of adr above, because the target
        // can be out of range for adr. But movrel_local leaves the thumb bit
        // set on COFF (but probably wouldn't if building for thumb on ELF),
        // thus try to clear the bit for robustness.
        bic             r12, r12, #1
#endif

        tst             lr,  #1
        ldr             lr,  [r12, lr, lsl #2]
        add             r12, r12, lr

        beq             1f
        // y overlap
        sub             lr,  r9,  #(2 >> \sy)   // backup remaining h
        mov             r9,  #(2 >> \sy)
1:
.if \sy
        add             r7,  r7,  r7            // luma_stride *= 2
.endif
        sub             r7,  r7,  #32           // luma_stride -= 32

        bx              r12
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
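
// When data->chroma_scaling_from_luma is not set (the csfl0 loop variants
// below), the scaling table is indexed with a blend of luma and chroma
// rather than the chroma value itself. In C terms (a sketch of the
// reference):
//
//   int val = ((luma[x] * uv_luma_mult + src[x] * uv_mult) >> 6)
//           + (uv_offset << bitdepth_min_8);
//   val = iclip(val, 0, bitdepth_max);
//   noise = round2(scaling[val] * grain[x], scaling_shift);
//
// d30 was loaded with {uv_luma_mult, uv_mult, uv_offset << bitdepth_min_8,
// bitdepth_max} in the fguv setup above for exactly this computation.
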
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB

.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        sub             r2,  r2,  #32           // src_stride   -= 32
        sub             r10, r10, #32           // grain_stride -= 32
.if \oy
        mov             r12, lr
.endif
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
1:
.if \ox
        vld1.16         {d0},     [r4],  r10    // grain_lut old
.endif
.if \oy
        vld1.16         {q2, q3}, [r8]!         // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},     [r11], r10    // grain_lut top old
.endif
.if !\ox && !\oy
        vld1.16         {q0, q1}, [r6, :128]!   // luma
.endif
        vld1.16         {q8,  q9},  [r5]!       // grain_lut
.if \oy
        vld1.16         {q4, q5}, [r8],  r10    // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q2, q3}, [r6, :128], r7 // luma
.endif
.if \oy
        vdup.16         d28, d31[2]             // overlap y coeff
        vdup.16         d29, d31[3]             // overlap y coeff
.endif
        vld1.16         {q10, q11}, [r5], r10   // grain_lut

.if \ox
        vdup.16         q7,  d30[3]             // bitdepth_max
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1            // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                 // grain_min
.endif

.if \oy
.if \ox
        add             r11, r11, #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]             // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1            // grain_max
.endif
        vmull.s16       q8,  d20, d29
        vmull.s16       q9,  d21, d29
        vmull.s16       q10, d22, d29
        vmull.s16       q11, d23, d29
.if !\ox
        vmvn            q6,  q7                 // grain_min
.endif
        vmlal.s16       q8,  d8,  d28
        vmlal.s16       q9,  d9,  d28
        vmlal.s16       q10, d10, d28
        vmlal.s16       q11, d11, d28
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q7
        vmin.s16        q9,  q1,  q7
        vld1.16         {q0, q1}, [r6, :128]!   // luma
        vmin.s16        q10, q2,  q7
        vmin.s16        q11, q3,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2, q3}, [r6, :128], r7 // luma
        vmax.s16        q10, q10, q6
        vmax.s16        q11, q11, q6
.elseif \ox
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0, q1}, [r6, :128]!   // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2, q3}, [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif

.if !\csfl
        vdup.16         d28, d30[0]             // uv_luma_mult
        vld1.16         {q4, q5}, [r1, :128]!   // src
        vdup.16         d29, d30[1]             // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d8,  d29
        vmlal.s16       q7,  d9,  d29
        vmlal.s16       q0,  d10, d29
        vmlal.s16       q1,  d11, d29
        vld1.16         {q4, q5}, [r1, :128]    // src
        sub             r1,  r1,  #32
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
        vmlal.s16       q0,  d8,  d29
        vmlal.s16       q1,  d9,  d29
        vmlal.s16       q2,  d10, d29
        vmlal.s16       q3,  d11, d29
        vdup.16         q14, d30[2]             // uv_offset
        vshrn.s32       d0,  q0,  #6
        vshrn.s32       d1,  q1,  #6
        vshrn.s32       d2,  q2,  #6
        vshrn.s32       d3,  q3,  #6
        vdup.16         q4,  d30[3]             // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vadd.i16        q2,  q0,  q14
        vadd.i16        q3,  q1,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmin.s16        q2,  q2,  q4
        vmin.s16        q3,  q3,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
        vmax.s16        q2,  q2,  q5
        vmax.s16        q3,  q3,  q5
.else
        vdup.16         q14, d30[3]             // bitdepth_max
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
        vand            q2,  q2,  q14
        vand            q3,  q3,  q14
.endif

        bl              gather32_neon

        vld1.16         {q0, q1}, [r1, :128]!   // src

        vmovl.u8        q6,  d8                 // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vld1.16         {q2, q3}, [r1, :128], r2 // src

        vshl.u16        q6,  q6,  q13           // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6            // round2(scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5

        vdup.16         q4,  d31[0]             // clip_min
        vdup.16         q5,  d31[1]             // clip_max

        vqadd.s16       q0,  q0,  q8            // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

.if \oy
        vmov.32         lr,  d25[0]             // first two 16 bit coeffs from overlap x
.endif

        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmax.s16        q2,  q2,  q4
        vmax.s16        q3,  q3,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5
        vmin.s16        q2,  q2,  q5
        vmin.s16        q3,  q3,  q5

        vst1.16         {q0, q1}, [r0, :128]!   // dst

        subs            r9,  r9,  #1
.if \oy
        vmov.32         d31[1], lr              // new coeffs for overlap y
.endif

        vst1.16         {q2, q3}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r12, #0
        mov             r9,  r12                // restore actual remaining h
        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
.endif
        b               9f
.endm

fguv_loop_sx0 0, 0, 0
fguv_loop_sx0 0, 0, 1
fguv_loop_sx0 0, 1, 0
fguv_loop_sx0 0, 1, 1
fguv_loop_sx0 1, 0, 0
fguv_loop_sx0 1, 0, 1
fguv_loop_sx0 1, 1, 0
fguv_loop_sx0 1, 1, 1

9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
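
// The sx1 (420/422) loops below consume 32 luma pixels per 16 chroma
// pixels; each luma row is first folded horizontally (a C sketch):
//
//   luma = (l[2*x] + l[2*x + 1] + 1) >> 1;
//
// implemented by the vpadd.i16/vrshr.u16 pair. For 420, only one luma row
// is read per chroma row here, since luma_stride was doubled in the setup.
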
function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB

.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr
.endif
1:
.if \ox
        vld1.16         {d0},     [r4],  r10    // grain_lut old
.endif
.if \ox && \oy
        vld1.16         {d2},     [r11], r10    // grain_lut top old
.endif
.if \oy
        vld1.16         {q2, q3}, [r8],  r10    // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0, q1}, [r6, :128]!   // luma
.endif
        vld1.16         {q8,  q9},  [r5], r10   // grain_lut
.if \oy
        vdup.16         d28, d31[2]             // overlap y coeff
        vdup.16         d29, d31[3]             // overlap y coeff
.endif
.if !\ox && !\oy
        vld1.16         {q2, q3}, [r6, :128], r7 // luma
.endif

.if \ox
        vdup.16         q7,  d30[3]             // bitdepth_max
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1            // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                 // grain_min
.endif

.if \oy
.if \ox
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]             // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1            // grain_max
.endif
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d17, q1,  #5
        vqrshrn.s32     d18, q2,  #5
        vqrshrn.s32     d19, q3,  #5
.if !\ox
        vmvn            q6,  q7                 // grain_min
.endif
        vld1.16         {q0, q1}, [r6, :128]!   // luma
        vmin.s16        q8,  q8,  q7
        vmin.s16        q9,  q9,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2, q3}, [r6, :128], r7 // luma
.elseif \ox
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0, q1}, [r6, :128]!   // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2, q3}, [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif

        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vpadd.i16       d2,  d4,  d5
        vpadd.i16       d3,  d6,  d7
        vrshr.u16       q0,  q0,  #1
        vrshr.u16       q1,  q1,  #1
.if !\csfl
        vdup.16         d28, d30[0]             // uv_luma_mult
        vld1.16         {q2, q3}, [r1, :128], r2 // src
        vdup.16         d29, d30[1]             // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d4,  d29
        vmlal.s16       q7,  d5,  d29
        vmlal.s16       q0,  d6,  d29
        vmlal.s16       q1,  d7,  d29
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vdup.16         q14, d30[2]             // uv_offset
        vdup.16         q4,  d30[3]             // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
.else
        vdup.16         q14, d30[3]             // bitdepth_max
        vld1.16         {q2, q3}, [r1, :128], r2 // src

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
.endif

        bl              gather16_neon

        vmovl.u8        q6,  d8                 // scaling
        vmovl.u8        q7,  d9

        vshl.u16        q6,  q6,  q13           // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13

        vqrdmulh.s16    q8,  q8,  q6            // round2(scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7

        vdup.16         q4,  d31[0]             // clip_min
        vdup.16         q5,  d31[1]             // clip_max

        vqadd.s16       q0,  q2,  q8            // *src + noise
        vqadd.s16       q1,  q3,  q9

.if \oy
        // Swap the two last coefficients of d31, place them first in d28
        vrev64.16       d28, d31
.endif

        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5

        subs            r9,  r9,  #1
.if \oy
        // Take the first two 16 bit coefficients of d28 and place them at the
        // end of d31
        vtrn.32         d31, d28
.endif

        vst1.16         {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        cmp             r12, #0
        mov             r9,  r12                // restore actual remaining h
        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm

fguv_loop_sx1 0, 0, 0
fguv_loop_sx1 0, 0, 1
fguv_loop_sx1 0, 1, 0
fguv_loop_sx1 0, 1, 1
fguv_loop_sx1 1, 0, 0
fguv_loop_sx1 1, 0, 1
fguv_loop_sx1 1, 1, 0
fguv_loop_sx1 1, 1, 1

9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc