diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/dav1d/src/arm/64/looprestoration_common.S | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S new file mode 100644 index 0000000000..200eb63189 --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration_common.S @@ -0,0 +1,432 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #2 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 1f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Sum all h+2 lines with the main loop + add w11, w11, #2 +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v21 and v24-v26 taking top + // padding into consideration. + tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v24.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.4s, v19.4s}, [x5], x7 + ld1 {v25.8h}, [x6], x8 + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v25.16b, v24.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v26.16b, v24.16b + +3: + subs w3, w3, #1 +.macro add3 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v24.8h, v24.8h, v25.8h + add v16.4s, v16.4s, v20.4s + add v17.4s, v17.4s, v21.4s + add v24.8h, v24.8h, v26.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v24.8h}, [x1], x8 +.endm + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v25.16b, v26.16b + b.le 4f + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3b + +4: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + add3 + +5: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #8 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 0f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Handle h+2 lines with the main loop + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub w3, w3, #1 // Handle h-1 lines with the main loop +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v25 and v26-v30 taking top + // padding into consideration. + tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v28.8h}, [x6], x8 + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v28.16b, v26.16b + mov v22.16b, v16.16b + mov v23.16b, v17.16b + mov v29.16b, v26.16b + +3: + cbz w3, 4f + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + +3: + // Start of vertical loop + subs w3, w3, #2 +.macro add5 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v26.8h, v26.8h, v27.8h + add v0.4s, v20.4s, v22.4s + add v1.4s, v21.4s, v23.4s + add v2.8h, v28.8h, v29.8h + add v16.4s, v16.4s, v24.4s + add v17.4s, v17.4s, v25.4s + add v26.8h, v26.8h, v30.8h + add v16.4s, v16.4s, v0.4s + add v17.4s, v17.4s, v1.4s + add v26.8h, v26.8h, v2.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v26.8h}, [x1], x8 +.endm + add5 +.macro shift2 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v26.16b, v28.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b + mov v27.16b, v29.16b + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v28.16b, v30.16b +.endm + shift2 + add x0, x0, x7 + add x1, x1, x8 + b.le 5f + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + add5 + b 6f + +5: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 6f + // !LR_HAVE_BOTTOM + cbnz w3, 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + // Pad the past-edge row from the last content row. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // w3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v29.16b, v28.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + mov v30.16b, v28.16b + add5 + add x0, x0, x7 + add x1, x1, x8 + b 6f + +6: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +function sgr_calc_ab1_neon, export=1 + clz w9, w5 + add x3, x3, #2 // h += 2 + movi v31.4s, #9 // n + mov x5, #455 + mov x8, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + clz w9, w5 + add x3, x3, #3 // h += 3 + asr x3, x3, #1 // h /= 2 + movi v31.4s, #25 // n + mov x5, #164 + mov x8, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + sub w9, w9, #24 // -bitdepth_min_8 + movrel x12, X(sgr_x_by_x) + ld1 {v16.16b, v17.16b, v18.16b}, [x12] + dup v6.8h, w9 // -bitdepth_min_8 + movi v19.16b, #5 + movi v20.8b, #55 // idx of last 5 + movi v21.8b, #72 // idx of last 4 + movi v22.8b, #101 // idx of last 3 + movi v23.8b, #169 // idx of last 2 + movi v24.8b, #254 // idx of last 1 + saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 + add x2, x2, #2 // w += 2 + add x7, x2, #7 + bic x7, x7, #7 // aligned w + sub x7, x8, x7 // increment between rows + movi v29.8h, #1, lsl #8 + dup v28.4s, w4 + dup v30.4s, w5 // one_by_x + sub x0, x0, #(4*(SUM_STRIDE)) + sub x1, x1, #(2*(SUM_STRIDE)) + mov x6, x2 // backup of w + sub v16.16b, v16.16b, v19.16b + sub v17.16b, v17.16b, v19.16b + sub v18.16b, v18.16b, v19.16b +1: + subs x2, x2, #8 + ld1 {v0.4s, v1.4s}, [x0] // a + ld1 {v2.8h}, [x1] // b + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + add v1.8b, v1.8b, v5.8b + add v1.8b, v1.8b, v25.8b + uxtl v1.8h, v1.8b // x + + umull v3.4s, v1.4h, v2.4h // x * BB[i] + umull2 v4.4s, v1.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v2.8h, v29.8h, v1.8h // 256 - x + + st1 {v3.4s, v4.4s}, [x0], #32 + st1 {v2.8h}, [x1], #16 + b.gt 1b + + subs x3, x3, #1 + b.le 0f + add x0, x0, x7, lsl #2 + add x1, x1, x7, lsl #1 + mov x2, x6 + b 1b +0: + ret +endfunc |