diff options
Diffstat (limited to 'third_party/dav1d/src/arm/64/looprestoration16.S')
-rw-r--r-- | third_party/dav1d/src/arm/64/looprestoration16.S | 1419 |
1 files changed, 1419 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S new file mode 100644 index 0000000000..8954e604cf --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -0,0 +1,1419 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], const pixel *lpf, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter7_16bpc_neon, export=1 + ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-32]! + stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + dup v28.8h, w8 // bitdepth_max + clz w8, w8 + movi v30.4s, #1 + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w8 // -round_bits_h + dup v27.4s, w11 // -round_bits_v + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x1 // lpf += stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + mov x13, x14 // t2 + subs w5, w5, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_loop_7) + tst w7, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + bl wiener_filter7_hv_16bpc_neon + bl wiener_filter7_hv_16bpc_neon +L(v1_7): + bl wiener_filter7_v_16bpc_neon + + mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(no_top_7): + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x13, x14 // t2 + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_16bpc_neon +L(v2_7): + bl wiener_filter7_v_16bpc_neon + b L(v1_7) +endfunc + + +function wiener_filter7_h_16bpc_neon + stp x3, x4, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + b 2f + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w4, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v16.4s, v18.4h, v0.h[3] + smlal v16.4s, v19.4h, v0.h[2] + smlal v16.4s, v20.4h, v0.h[1] + smlal v16.4s, v21.4h, v0.h[0] + smull2 v17.4s, v18.8h, v0.h[3] + smlal2 v17.4s, v19.8h, v0.h[2] + smlal2 v17.4s, v20.8h, v0.h[1] + smlal2 v17.4s, v21.8h, v0.h[0] + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w4, w4, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldr x14, [sp, #16] + ldp x3, x4, [sp], #32 + ret +endfunc + +function wiener_filter7_v_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! + stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x4, [sp, #48] +1: + ld1 {v16.8h, v17.8h}, [x9], #32 + ld1 {v18.8h, v19.8h}, [x10], #32 + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v22.8h, v23.8h}, [x12], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + ld1 {v6.8h, v7.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[4] + smlal v2.4s, v18.4h, v0.h[5] + smlal v2.4s, v20.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[7] + smlal v2.4s, v24.4h, v0.h[6] + smlal v2.4s, v6.4h, v0.h[5] + smlal v2.4s, v6.4h, v0.h[4] + smull2 v3.4s, v16.8h, v0.h[4] + smlal2 v3.4s, v18.8h, v0.h[5] + smlal2 v3.4s, v20.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[7] + smlal2 v3.4s, v24.8h, v0.h[6] + smlal2 v3.4s, v6.8h, v0.h[5] + smlal2 v3.4s, v6.8h, v0.h[4] + smull v4.4s, v17.4h, v0.h[4] + smlal v4.4s, v19.4h, v0.h[5] + smlal v4.4s, v21.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[7] + smlal v4.4s, v25.4h, v0.h[6] + smlal v4.4s, v7.4h, v0.h[5] + smlal v4.4s, v7.4h, v0.h[4] + smull2 v5.4s, v17.8h, v0.h[4] + smlal2 v5.4s, v19.8h, v0.h[5] + smlal2 v5.4s, v21.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[7] + smlal2 v5.4s, v25.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + smlal2 v5.4s, v7.8h, v0.h[4] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + subs w4, w4, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x4, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x4, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w4, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v24.4s, v18.4h, v0.h[3] + smlal v24.4s, v19.4h, v0.h[2] + smlal v24.4s, v20.4h, v0.h[1] + smlal v24.4s, v21.4h, v0.h[0] + smull2 v25.4s, v18.8h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[2] + smlal2 v25.4s, v20.8h, v0.h[1] + smlal2 v25.4s, v21.8h, v0.h[0] + + ld1 {v16.8h, v17.8h}, [x9], #32 + + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x10], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x11], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x12], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + ld1 {v24.8h, v25.8h}, [x13], #32 + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + ld1 {v8.8h, v9.8h}, [x14], #32 + + smull v1.4s, v16.4h, v0.h[4] + smlal v1.4s, v18.4h, v0.h[5] + smlal v1.4s, v20.4h, v0.h[6] + smlal v1.4s, v22.4h, v0.h[7] + smlal v1.4s, v24.4h, v0.h[6] + smlal v1.4s, v8.4h, v0.h[5] + smlal v1.4s, v6.4h, v0.h[4] + smull2 v5.4s, v16.8h, v0.h[4] + smlal2 v5.4s, v18.8h, v0.h[5] + smlal2 v5.4s, v20.8h, v0.h[6] + smlal2 v5.4s, v22.8h, v0.h[7] + smlal2 v5.4s, v24.8h, v0.h[6] + smlal2 v5.4s, v8.8h, v0.h[5] + smlal2 v5.4s, v6.8h, v0.h[4] + smull v26.4s, v17.4h, v0.h[4] + smlal v26.4s, v19.4h, v0.h[5] + smlal v26.4s, v21.4h, v0.h[6] + smlal v26.4s, v23.4h, v0.h[7] + smlal v26.4s, v25.4h, v0.h[6] + smlal v26.4s, v9.4h, v0.h[5] + smlal v26.4s, v7.4h, v0.h[4] + smull2 v16.4s, v17.8h, v0.h[4] + smlal2 v16.4s, v19.8h, v0.h[5] + smlal2 v16.4s, v21.8h, v0.h[6] + smlal2 v16.4s, v23.8h, v0.h[7] + smlal2 v16.4s, v25.8h, v0.h[6] + smlal2 v16.4s, v9.8h, v0.h[5] + smlal2 v16.4s, v7.8h, v0.h[4] + srshl v1.4s, v1.4s, v27.4s // -round_bits_v + srshl v5.4s, v5.4s, v27.4s + srshl v26.4s, v26.4s, v27.4s + srshl v16.4s, v16.4s, v27.4s + sqxtun v18.4h, v1.4s + sqxtun2 v18.8h, v5.4s + sqxtun v19.4h, v26.4s + sqxtun2 v19.8h, v16.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v18.8h, v18.8h, v28.8h // bitdepth_max + umin v19.8h, v19.8h, v28.8h + subs w4, w4, #16 + + st1 {v18.8h, v19.8h}, [x0], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x4, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], const pixel *lpf, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter5_16bpc_neon, export=1 + ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-32]! + stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + dup v28.8h, w8 // bitdepth_max + clz w8, w8 + movi v30.4s, #1 + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w8 // -round_bits_h + dup v27.4s, w11 // -round_bits_v + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x1 // lpf += stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_loop_5) + tst w7, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + bl wiener_filter5_hv_16bpc_neon + bl wiener_filter5_hv_16bpc_neon +L(end_5): + + mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(no_top_5): + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_16bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_16bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_16bpc_neon + stp x3, x4, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w4, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v16.4s, v17.4h, v0.h[3] + smlal v16.4s, v18.4h, v0.h[2] + smlal v16.4s, v19.4h, v0.h[1] + smull2 v17.4s, v17.8h, v0.h[3] + smlal2 v17.4s, v18.8h, v0.h[2] + smlal2 v17.4s, v19.8h, v0.h[1] + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w4, w4, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldr x14, [sp, #16] + ldp x3, x4, [sp], #32 + ret +endfunc + +function wiener_filter5_v_16bpc_neon + stp x11, x12, [sp, #-48]! + stp x13, x14, [sp, #16] + stp x0, x4, [sp, #32] +1: + ld1 {v16.8h, v17.8h}, [x11], #32 + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v20.8h, v21.8h}, [x13], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[5] + smlal v2.4s, v18.4h, v0.h[6] + smlal v2.4s, v20.4h, v0.h[7] + smlal v2.4s, v22.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[5] + smull2 v3.4s, v16.8h, v0.h[5] + smlal2 v3.4s, v18.8h, v0.h[6] + smlal2 v3.4s, v20.8h, v0.h[7] + smlal2 v3.4s, v22.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[5] + smull v4.4s, v17.4h, v0.h[5] + smlal v4.4s, v19.4h, v0.h[6] + smlal v4.4s, v21.4h, v0.h[7] + smlal v4.4s, v23.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[5] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + + subs w4, w4, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x4, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + + ret +endfunc + +function wiener_filter5_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! + stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x4, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 2x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w4, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v24.4s, v17.4h, v0.h[3] + smlal v24.4s, v18.4h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[1] + smull2 v25.4s, v17.8h, v0.h[3] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal2 v25.4s, v19.8h, v0.h[1] + + ld1 {v16.8h, v17.8h}, [x11], #32 + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x12], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x13], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x14], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + smull v8.4s, v16.4h, v0.h[5] + smlal v8.4s, v18.4h, v0.h[6] + smlal v8.4s, v20.4h, v0.h[7] + smlal v8.4s, v22.4h, v0.h[6] + smlal v8.4s, v6.4h, v0.h[5] + smull2 v9.4s, v16.8h, v0.h[5] + smlal2 v9.4s, v18.8h, v0.h[6] + smlal2 v9.4s, v20.8h, v0.h[7] + smlal2 v9.4s, v22.8h, v0.h[6] + smlal2 v9.4s, v6.8h, v0.h[5] + smull v1.4s, v17.4h, v0.h[5] + smlal v1.4s, v19.4h, v0.h[6] + smlal v1.4s, v21.4h, v0.h[7] + smlal v1.4s, v23.4h, v0.h[6] + smlal v1.4s, v7.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + srshl v8.4s, v8.4s, v27.4s // -round_bits_v + srshl v9.4s, v9.4s, v27.4s + srshl v1.4s, v1.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v8.4h, v8.4s + sqxtun2 v8.8h, v9.4s + sqxtun v9.4h, v1.4s + sqxtun2 v9.8h, v5.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v8.8h, v8.8h, v28.8h // bitdepth_max + umin v9.8h, v9.8h, v28.8h + + subs w4, w4, #16 + + st1 {v8.8h, v9.8h}, [x0], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x4, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add w13, w5, #7 + bic w13, w13, #7 + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w13, #8 + sub x4, x4, w13, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + sub x12, x12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #4 + + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + sub x12, x12, #4 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 2x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + sub x12, x12, #4 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + +2: + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + + // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. + + // Insert padding in v0/1.h[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b + +4: // Loop horizontally + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6.8h, v0.8h, v26.8h + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v16.8h, v28.8h + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + subs w5, w5, #8 + + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add w13, w5, #7 + bic w13, w13, #7 + sub x9, x9, w13, uxtw #1 + add w13, w13, #8 + sub x4, x4, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + sub x12, x12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #6 + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + sub x12, x12, #6 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + sub x12, x12, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + +2: + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -2 + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b + +4: // Loop horizontally + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6.8h, v0.8h, v26.8h + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v16.8h, v28.8h + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + ext v26.16b, v0.16b, v1.16b, #6 + ext v28.16b, v16.16b, v17.16b, #6 + ext v27.16b, v0.16b, v1.16b, #8 + ext v29.16b, v16.16b, v17.16b, #8 + + add v6.8h, v6.8h, v26.8h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v7.8h, v28.8h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + subs w5, w5, #8 + + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +endfunc + +sgr_funcs 16 |