diff options
Diffstat (limited to 'third_party/dav1d/src/arm/32/looprestoration_common.S')
-rw-r--r-- | third_party/dav1d/src/arm/32/looprestoration_common.S | 453 |
1 files changed, 453 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/looprestoration_common.S b/third_party/dav1d/src/arm/32/looprestoration_common.S new file mode 100644 index 0000000000..b080bb5115 --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration_common.S @@ -0,0 +1,453 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #2 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 1f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Sum all h+2 lines with the main loop + add lr, lr, #2 +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q8-q13 and q0-q2 taking top + // padding into consideration. + tst r4, #4 // LR_HAVE_TOP + vld1.32 {q8, q9}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q1}, [r6, :128], r8 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q10, q8 + vmov q11, q9 + vmov q1, q0 + vmov q12, q8 + vmov q13, q9 + vmov q2, q0 + +3: + subs r3, r3, #1 +.macro add3 + vadd.i32 q8, q8, q10 + vadd.i32 q9, q9, q11 + vadd.i16 q0, q0, q1 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i16 q0, q0, q2 + vst1.32 {q8, q9}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + vmov q10, q12 + vmov q11, q13 + vmov q1, q2 + ble 4f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3b + +4: + tst r4, #8 // LR_HAVE_BOTTOM + bne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + add3 + +5: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + pop {r4-r9,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + push {r4-r9,lr} + vpush {q5-q7} + ldr r4, [sp, #76] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #8 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 0f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Handle h+2 lines with the main loop + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub r3, r3, #1 // Handle h-1 lines with the main loop +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q6-q15 and q0-q3,q5 taking top + // padding into consideration. + tst r4, #4 // LR_HAVE_TOP + vld1.32 {q6, q7}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vmov q10, q6 + vmov q11, q7 + vmov q2, q0 + vmov q12, q6 + vmov q13, q7 + vmov q3, q0 + +3: + cmp r3, #0 + beq 4f + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + +3: + // Start of vertical loop + subs r3, r3, #2 +.macro add5 + vadd.i32 q6, q6, q8 + vadd.i32 q7, q7, q9 + vadd.i16 q0, q0, q1 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i16 q0, q0, q2 + vadd.i32 q6, q6, q12 + vadd.i32 q7, q7, q13 + vadd.i16 q0, q0, q3 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q15 + vadd.i16 q0, q0, q5 + vst1.32 {q6, q7}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add5 +.macro shift2 + vmov q6, q10 + vmov q7, q11 + vmov q0, q2 + vmov q8, q12 + vmov q9, q13 + vmov q1, q3 + vmov q10, q14 + vmov q11, q15 + vmov q2, q5 +.endm + shift2 + add r0, r0, r7 + add r1, r1, r8 + ble 5f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + add5 + b 6f + +5: + tst r4, #8 // LR_HAVE_BOTTOM + bne 6f + // !LR_HAVE_BOTTOM + cmp r3, #0 + bne 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + // Pad the past-edge row from the last content row. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // r3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + vmov q12, q10 + vmov q13, q11 + vmov q3, q2 + vmov q14, q10 + vmov q15, q11 + vmov q5, q2 + add5 + add r0, r0, r7 + add r1, r1, r8 + b 6f + +6: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + vpop {q5-q7} + pop {r4-r9,pc} +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +function sgr_calc_ab1_neon, export=1 + push {r4-r7,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #84] + add r3, r3, #2 // h += 2 + clz r6, r5 + vmov.i32 q15, #9 // n + movw r5, #455 + mov lr, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + push {r4-r7,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #84] + add r3, r3, #3 // h += 3 + clz r6, r5 + asr r3, r3, #1 // h /= 2 + vmov.i32 q15, #25 // n + mov r5, #164 + mov lr, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + movrel r12, X(sgr_x_by_x) + sub r6, r6, #24 // -bitdepth_min_8 + vld1.8 {q8, q9}, [r12, :128]! + add r7, r6, r6 // -2*bitdepth_min_8 + vmov.i8 q11, #5 + vmov.i8 d10, #55 // idx of last 5 + vld1.8 {q10}, [r12, :128] + vmov.i8 d11, #72 // idx of last 4 + vmov.i8 d12, #101 // idx of last 3 + vmov.i8 d13, #169 // idx of last 2 + vmov.i8 d14, #254 // idx of last 1 + vmov.i8 d15, #32 // elements consumed in first vtbl + add r2, r2, #2 // w += 2 + add r12, r2, #7 + bic r12, r12, #7 // aligned w + sub r12, lr, r12 // increment between rows + vdup.32 q12, r4 + sub r0, r0, #(4*(SUM_STRIDE)) + sub r1, r1, #(2*(SUM_STRIDE)) + mov r4, r2 // backup of w + vsub.i8 q8, q8, q11 + vsub.i8 q9, q9, q11 + vsub.i8 q10, q10, q11 +1: + vld1.32 {q0, q1}, [r0, :128] // a + vld1.16 {q2}, [r1, :128] // b + vdup.32 q13, r7 // -2*bitdepth_min_8 + vdup.16 q14, r6 // -bitdepth_min_8 + subs r2, r2, #8 + vrshl.s32 q0, q0, q13 + vrshl.s32 q1, q1, q13 + vrshl.s16 q4, q2, q14 + vmul.i32 q0, q0, q15 // a * n + vmul.i32 q1, q1, q15 // a * n + vmull.u16 q3, d8, d8 // b * b + vmull.u16 q4, d9, d9 // b * b + vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) + vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) + vmul.i32 q0, q0, q12 // p * s + vmul.i32 q1, q1, q12 // p * s + vqshrn.u32 d0, q0, #16 + vqshrn.u32 d1, q1, #16 + vqrshrn.u16 d0, q0, #4 // imin(z, 255) + + vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 + vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 + vtbl.8 d1, {q8, q9}, d0 + vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 + vsub.i8 d9, d0, d15 // indices for vtbx + vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 + vadd.i8 d2, d2, d3 + vtbx.8 d1, {q10}, d9 + vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 + vadd.i8 d6, d6, d7 + vadd.i8 d8, d8, d22 + vadd.i8 d2, d2, d6 + vadd.i8 d1, d1, d8 + vadd.i8 d1, d1, d2 + vmovl.u8 q0, d1 // x + + vmov.i16 q13, #256 + vdup.32 q14, r5 // one_by_x + + vmull.u16 q1, d0, d4 // x * BB[i] + vmull.u16 q2, d1, d5 // x * BB[i] + vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x + vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x + vrshr.s32 q1, q1, #12 // AA[i] + vrshr.s32 q2, q2, #12 // AA[i] + vsub.i16 q0, q13, q0 // 256 - x + + vst1.32 {q1, q2}, [r0, :128]! + vst1.16 {q0}, [r1, :128]! + bgt 1b + + subs r3, r3, #1 + ble 0f + add r0, r0, r12, lsl #2 + add r1, r1, r12, lsl #1 + mov r2, r4 + b 1b +0: + vpop {q4-q7} + pop {r4-r7,pc} +endfunc |