author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000
commit    | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree      | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/dav1d/src/arm/64/looprestoration16.S
parent    | Initial commit. (diff)
download  | firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz, firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1. (refs: upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/arm/64/looprestoration16.S')
-rw-r--r-- | third_party/dav1d/src/arm/64/looprestoration16.S | 1239
1 file changed, 1239 insertions, 0 deletions
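The new file adds AArch64 NEON implementations of the Wiener filter horizontal and vertical passes, a narrow-copy helper, and the SGR box3/box5 horizontal sums for 16 bpc. As a reading aid for the diff below, here is a rough scalar C sketch of what the two Wiener passes compute, inferred from the function prototypes and the constants visible in the assembly (round_bits_h/v, the 1 << (bitdepth + 6) offset, the 0x7fff clamp, the 8192 bias). The function and variable names are illustrative, this is not dav1d's actual C reference code, and all LR_HAVE_* edge padding is omitted.

    #include <stddef.h>
    #include <stdint.h>

    static inline int clamp_int(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    // Horizontal pass: 7-tap filter with an extra weight of 128 on the middle
    // tap, offset by 1 << (bitdepth + 6), rounded down by round_bits_h,
    // clamped to [0, 0x7fff] and biased by -8192 before being stored as int16.
    static void wiener_h_scalar(int16_t *mid, const uint16_t *src, int w,
                                const int16_t fh[7], int bitdepth_max) {
        const int bitdepth     = 32 - __builtin_clz(bitdepth_max); // GCC/Clang builtin; 10 or 12
        const int round_bits_h = bitdepth == 12 ? 5 : 3;
        const int rnd          = 1 << (round_bits_h - 1);
        for (int x = 0; x < w; x++) {
            // src is assumed to be padded by 3 pixels on each side already.
            int sum = (1 << (bitdepth + 6)) + 128 * src[x + 3];
            for (int k = 0; k < 7; k++)
                sum += fh[k] * src[x + k];
            sum    = clamp_int((sum + rnd) >> round_bits_h, 0, 0x7fff);
            mid[x] = (int16_t)(sum - 8192);
        }
    }

    // Vertical pass over the biased "mid" buffer: 7 rows, +128 on the middle
    // tap (mirroring the v1.h[3] = fv[3] + 128 setup in the assembly), rounded
    // by round_bits_v and clamped to [0, bitdepth_max]. mid_stride is in
    // int16_t elements.
    static void wiener_v_scalar(uint16_t *dst, const int16_t *mid,
                                ptrdiff_t mid_stride, int w,
                                const int16_t fv[7], int bitdepth_max) {
        const int bitdepth     = 32 - __builtin_clz(bitdepth_max);
        const int round_bits_v = bitdepth == 12 ? 9 : 11;
        const int rnd          = 1 << (round_bits_v - 1);
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int k = 0; k < 7; k++)
                sum += (fv[k] + (k == 3 ? 128 : 0)) * mid[x + k * mid_stride];
            dst[x] = (uint16_t)clamp_int((sum + rnd) >> round_bits_v, 0, bitdepth_max);
        }
    }

The -8192 bias written by the horizontal pass appears to stand in for the vertical rounding offset of a plain C formulation, since 128 * 8192 = 1 << 20 for both 10 and 12 bpc.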
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S new file mode 100644 index 0000000000..437988cfac --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -0,0 +1,1239 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter_h_16bpc_neon, export=1 + ldr w8, [sp] // bitdepth_max + ld1 {v0.8h}, [x4] + clz w8, w8 + movi v30.4s, #1 + sub w9, w8, #38 // -(bitdepth + 6) + sub w8, w8, #25 // -round_bits_h + neg w9, w9 // bitdepth + 6 + dup v1.4s, w9 + dup v29.4s, w8 // -round_bits_h + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v1.4s // 1 << (bitdepth + 6) + mov w8, w5 + // Calculate mid_stride + add w10, w5, #7 + bic w10, w10, #7 + lsl w10, w10, #1 + + // Clear the last unused element of v0, to allow filtering a single + // pixel with one plain mul+addv. + ins v0.h[7], wzr + + // Set up pointers for reading/writing alternate rows + add x12, x0, x10 + lsl w10, w10, #1 + add x13, x2, x3 + lsl x3, x3, #1 + + // Subtract the width from mid_stride + sub x10, x10, w5, uxtw #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. + cmp w5, #8 + add w11, w5, #13 + bic w11, w11, #7 + b.ge 1f + mov w11, #16 +1: + sub x3, x3, w11, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x1, 0f + // left == NULL + sub x2, x2, #6 + sub x13, x13, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. 
+ add x3, x3, #6 + + +1: // Loop vertically + ld1 {v2.8h, v3.8h}, [x2], #32 + ld1 {v4.8h, v5.8h}, [x13], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x1, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v1.d}[1], [x1], #8 + // Move x2/x13 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x2, x2, #6 + sub x13, x13, #6 + ld1 {v6.d}[1], [x1], #8 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v1.16b, v2.16b, #10 + ext v5.16b, v4.16b, v5.16b, #10 + ext v4.16b, v6.16b, v4.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v1 with the leftmost pixel + // and shift v2/v3 to have 3x the first pixel at the front. + dup v1.8h, v2.h[0] + dup v6.8h, v4.h[0] + // Move x2 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x2, x2, #6 + sub x13, x13, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v1.16b, v2.16b, #10 + ext v5.16b, v4.16b, v5.16b, #10 + ext v4.16b, v6.16b, v4.16b, #10 + +2: + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w9, w5, #14 + ldr h27, [x2, w9, sxtw #1] + ldr h28, [x13, w9, sxtw #1] + // Fill v27/v28 with the right padding pixel + dup v27.8h, v27.h[0] + dup v28.8h, v28.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro ushll_sz d0, d1, src, shift, wd + ushll \d0\().4s, \src\().4h, \shift +.ifc \wd, .8h + ushll2 \d1\().4s, \src\().8h, \shift +.endif +.endm +.macro add_sz d0, d1, s0, s1, c, wd + add \d0\().4s, \s0\().4s, \c\().4s +.ifc \wd, .8h + add \d1\().4s, \s1\().4s, \c\().4s +.endif +.endm +.macro srshl_sz d0, d1, s0, s1, c, wd + srshl \d0\().4s, \s0\().4s, \c\().4s +.ifc \wd, .8h + srshl \d1\().4s, \s1\().4s, \c\().4s +.endif +.endm +.macro sqxtun_sz dst, s0, s1, wd + sqxtun \dst\().4h, \s0\().4s +.ifc \wd, .8h + sqxtun2 \dst\().8h, \s1\().4s +.endif +.endm + +.macro filter wd + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ ext v18.16b, v2.16b, v3.16b, #6 + ext v16.16b, v2.16b, v3.16b, #2 + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v20.16b, v2.16b, v3.16b, #10 + ushll_sz v6, v7, v18, #7, \wd + ext v21.16b, v2.16b, v3.16b, #12 + smlal v6.4s, v2.4h, v0.h[0] + smlal v6.4s, v16.4h, v0.h[1] + smlal v6.4s, v17.4h, v0.h[2] + smlal v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[4] + smlal v6.4s, v20.4h, v0.h[5] + smlal v6.4s, v21.4h, v0.h[6] +.ifc \wd, .8h + smlal2 v7.4s, v2.8h, v0.h[0] + smlal2 v7.4s, v16.8h, v0.h[1] + smlal2 v7.4s, v17.8h, v0.h[2] + smlal2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[4] + smlal2 v7.4s, v20.8h, v0.h[5] + smlal2 v7.4s, v21.8h, v0.h[6] +.endif + ext v21.16b, v4.16b, v5.16b, #6 + ext v19.16b, v4.16b, v5.16b, #2 + ext v20.16b, v4.16b, v5.16b, #4 + ext v22.16b, v4.16b, v5.16b, #8 + ext v23.16b, v4.16b, v5.16b, #10 + ushll_sz v16, v17, v21, #7, \wd + ext v24.16b, v4.16b, v5.16b, #12 + smlal v16.4s, v4.4h, v0.h[0] + smlal v16.4s, v19.4h, v0.h[1] + smlal v16.4s, v20.4h, v0.h[2] + smlal v16.4s, v21.4h, v0.h[3] + smlal v16.4s, v22.4h, v0.h[4] + smlal v16.4s, v23.4h, v0.h[5] + smlal v16.4s, v24.4h, v0.h[6] +.ifc \wd, .8h + smlal2 v17.4s, v4.8h, v0.h[0] + smlal2 v17.4s, v19.8h, v0.h[1] + smlal2 v17.4s, v20.8h, v0.h[2] + smlal2 v17.4s, v21.8h, v0.h[3] + smlal2 v17.4s, v22.8h, v0.h[4] + smlal2 v17.4s, v23.8h, v0.h[5] + smlal2 v17.4s, v24.8h, v0.h[6] +.endif + mvni v24\wd, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add_sz v6, v7, v6, v7, v30, \wd + add_sz v16, v17, v16, v17, v30, \wd + srshl_sz v6, v7, v6, v7, v29, \wd + srshl_sz v16, v17, v16, v17, v29, \wd + sqxtun_sz v6, v6, v7, \wd + sqxtun_sz v7, v16, v17, \wd + umin v6\wd, v6\wd, v24\wd + umin v7\wd, v7\wd, v24\wd + sub v6\wd, v6\wd, v31\wd + sub v7\wd, v7\wd, v31\wd +.endm + filter .8h + st1 {v6.8h}, [x0], #16 + st1 {v7.8h}, [x12], #16 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v2.16b, v3.16b + mov v4.16b, v5.16b + ld1 {v3.8h}, [x2], #16 + ld1 {v5.8h}, [x13], #16 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Filter 4 pixels, 7 <= w < 11 + filter .4h + st1 {v6.4h}, [x0], #8 + st1 {v7.4h}, [x12], #8 + + subs w5, w5, #4 // 3 <= w < 7 + ext v2.16b, v2.16b, v3.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 + ext v4.16b, v4.16b, v5.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + +6: // Pad the right edge and filter the last few pixels. + // w < 7, w+3 pixels valid in v2-v3 + cmp w5, #5 + b.lt 7f + b.gt 8f + // w == 5, 8 pixels valid in v2, v3 invalid + mov v3.16b, v27.16b + mov v5.16b, v28.16b + b 88f + +7: // 1 <= w < 5, 4-7 pixels valid in v2 + sub w9, w5, #1 + // w9 = (pixels valid - 4) + adr x11, L(variable_shift_tbl) + ldrh w9, [x11, w9, uxtw #1] + sub x11, x11, w9, uxth + mov v3.16b, v27.16b + mov v5.16b, v28.16b + br x11 +44: // 4 pixels valid in v2/v4, fill the high half with padding. + ins v2.d[1], v3.d[0] + ins v4.d[1], v5.d[0] + b 88f + // Shift v2 right, shifting out invalid pixels, + // shift v2 left to the original offset, shifting in padding pixels. +55: // 5 pixels valid + ext v2.16b, v2.16b, v2.16b, #10 + ext v2.16b, v2.16b, v3.16b, #6 + ext v4.16b, v4.16b, v4.16b, #10 + ext v4.16b, v4.16b, v5.16b, #6 + b 88f +66: // 6 pixels valid, fill the upper 2 pixels with padding. + ins v2.s[3], v3.s[0] + ins v4.s[3], v5.s[0] + b 88f +77: // 7 pixels valid, fill the last pixel with padding. 
+ ins v2.h[7], v3.h[0] + ins v4.h[7], v5.h[0] + b 88f + +L(variable_shift_tbl): + .hword L(variable_shift_tbl) - 44b + .hword L(variable_shift_tbl) - 55b + .hword L(variable_shift_tbl) - 66b + .hword L(variable_shift_tbl) - 77b + +8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 + ins v27.h[0], v3.h[0] + ins v28.h[0], v5.h[0] + mov v3.16b, v27.16b + mov v5.16b, v28.16b + +88: + // w < 7, v2-v3 padded properly + cmp w5, #4 + b.lt 888f + + // w >= 4, filter 4 pixels + filter .4h + st1 {v6.4h}, [x0], #8 + st1 {v7.4h}, [x12], #8 + subs w5, w5, #4 // 0 <= w < 4 + ext v2.16b, v2.16b, v3.16b, #8 + ext v4.16b, v4.16b, v5.16b, #8 + b.eq 9f +888: // 1 <= w < 4, filter 1 pixel at a time + smull v6.4s, v2.4h, v0.4h + smull2 v7.4s, v2.8h, v0.8h + smull v16.4s, v4.4h, v0.4h + smull2 v17.4s, v4.8h, v0.8h + add v6.4s, v6.4s, v7.4s + add v16.4s, v16.4s, v17.4s + addv s6, v6.4s + addv s7, v16.4s + dup v16.4h, v2.h[3] + ins v16.h[1], v4.h[3] + ins v6.s[1], v7.s[0] + mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + ushll v16.4s, v16.4h, #7 + add v6.2s, v6.2s, v30.2s + add v6.2s, v6.2s, v16.2s + srshl v6.2s, v6.2s, v29.2s + sqxtun v6.4h, v6.4s + umin v6.4h, v6.4h, v24.4h + sub v6.4h, v6.4h, v31.4h + st1 {v6.h}[0], [x0], #2 + st1 {v6.h}[1], [x12], #2 + subs w5, w5, #1 + ext v2.16b, v2.16b, v3.16b, #2 + ext v4.16b, v4.16b, v5.16b, #2 + b.gt 888b + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x10 + add x12, x12, x10 + add x2, x2, x3 + add x13, x13, x3 + mov w5, w8 + b 1b +0: + ret +.purgem filter +endfunc + +// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride, const int bitdepth_max); +function wiener_filter_v_16bpc_neon, export=1 + ldr w8, [sp] // bitdepth_max + ld1 {v0.8h}, [x5] + dup v31.8h, w8 + clz w8, w8 + movi v1.8h, #128 + sub w8, w8, #11 // round_bits_v + add v1.8h, v1.8h, v0.8h + dup v30.4s, w8 + mov w8, w4 + neg v30.4s, v30.4s // -round_bits_v + + // Calculate the number of rows to move back when looping vertically + mov w11, w4 + tst w6, #4 // LR_HAVE_TOP + b.eq 0f + sub x2, x2, x7, lsl #1 + add w11, w11, #2 +0: + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 1f + add w11, w11, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into v16-v19 and pad properly. + tst w6, #4 // LR_HAVE_TOP + ld1 {v16.8h}, [x2], x7 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.8h}, [x2], x7 + mov v17.16b, v16.16b + ld1 {v19.8h}, [x2], x7 + b 3f +2: // !LR_HAVE_TOP + mov v17.16b, v16.16b + mov v18.16b, v16.16b + mov v19.16b, v16.16b + +3: + cmp w4, #4 + b.lt 5f + // Start filtering normally; fill in v20-v22 with unique rows. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + ld1 {v22.8h}, [x2], x7 + +4: +.macro filter compare + subs w4, w4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ smull v2.4s, v16.4h, v0.h[0] + smlal v2.4s, v17.4h, v0.h[1] + smlal v2.4s, v18.4h, v0.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v0.h[4] + smlal v2.4s, v21.4h, v0.h[5] + smlal v2.4s, v22.4h, v0.h[6] + smull2 v3.4s, v16.8h, v0.h[0] + smlal2 v3.4s, v17.8h, v0.h[1] + smlal2 v3.4s, v18.8h, v0.h[2] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal2 v3.4s, v20.8h, v0.h[4] + smlal2 v3.4s, v21.8h, v0.h[5] + smlal2 v3.4s, v22.8h, v0.h[6] + srshl v2.4s, v2.4s, v30.4s // round_bits_v + srshl v3.4s, v3.4s, v30.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + umin v2.8h, v2.8h, v31.8h // bitdepth_max + st1 {v2.8h}, [x0], x1 +.if \compare + cmp w4, #4 +.else + b.le 9f +.endif + mov v16.16b, v17.16b + mov v17.16b, v18.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.endm + filter 1 + b.lt 7f + ld1 {v22.8h}, [x2], x7 + b 4b + +5: // Less than 4 rows in total; not all of v20-v21 are filled yet. + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 6f + // LR_HAVE_BOTTOM + cmp w4, #2 + // We load at least 2 rows in all cases. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + b.gt 53f // 3 rows in total + b.eq 52f // 2 rows in total +51: // 1 row in total, v19 already loaded, load edge into v20-v22. + mov v22.16b, v21.16b + b 8f +52: // 2 rows in total, v19 already loaded, load v20 with content data + // and 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + mov v23.16b, v22.16b + b 8f +53: + // 3 rows in total, v19 already loaded, load v20 and v21 with content + // and 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + ld1 {v23.8h}, [x2], x7 + mov v24.16b, v23.16b + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp w4, #2 + b.gt 63f // 3 rows in total + b.eq 62f // 2 rows in total +61: // 1 row in total, v19 already loaded, pad that into v20-v22. + mov v20.16b, v19.16b + mov v21.16b, v19.16b + mov v22.16b, v19.16b + b 8f +62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. + ld1 {v20.8h}, [x2], x7 + mov v21.16b, v20.16b + mov v22.16b, v20.16b + mov v23.16b, v20.16b + b 8f +63: + // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. + ld1 {v20.8h}, [x2], x7 + ld1 {v21.8h}, [x2], x7 + mov v22.16b, v21.16b + mov v23.16b, v21.16b + mov v24.16b, v21.16b + b 8f + +7: + // All registers up to v21 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst w6, #8 // LR_HAVE_BOTTOM + b.eq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + ld1 {v22.8h}, [x2], x7 + ld1 {v23.8h}, [x2], x7 + mov v24.16b, v23.16b + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + mov v22.16b, v21.16b + mov v23.16b, v21.16b + mov v24.16b, v21.16b + +8: // At this point, all registers up to v22-v24 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + mov v22.16b, v23.16b + mov v23.16b, v24.16b + b 8b + +9: // End of one vertical slice. + subs w3, w3, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. 
+ msub x0, x1, x8, x0 + msub x2, x7, x11, x2 + add x0, x0, #16 + add x2, x2, #16 + mov w4, w8 + b 1b + +0: + ret +.purgem filter +endfunc + +// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const pixel *src, int w, int h); +function copy_narrow_16bpc_neon, export=1 + adr x5, L(copy_narrow_tbl) + ldrh w6, [x5, w3, uxtw #1] + sub x5, x5, w6, uxth + br x5 +10: + add x7, x0, x1 + lsl x1, x1, #1 +18: + subs w4, w4, #8 + b.lt 110f + ld1 {v0.8h}, [x2], #16 + st1 {v0.h}[0], [x0], x1 + st1 {v0.h}[1], [x7], x1 + st1 {v0.h}[2], [x0], x1 + st1 {v0.h}[3], [x7], x1 + st1 {v0.h}[4], [x0], x1 + st1 {v0.h}[5], [x7], x1 + st1 {v0.h}[6], [x0], x1 + st1 {v0.h}[7], [x7], x1 + b.le 0f + b 18b +110: + add w4, w4, #8 + asr x1, x1, #1 +11: + subs w4, w4, #1 + ld1 {v0.h}[0], [x2], #2 + st1 {v0.h}[0], [x0], x1 + b.gt 11b +0: + ret + +20: + add x7, x0, x1 + lsl x1, x1, #1 +24: + subs w4, w4, #4 + b.lt 210f + ld1 {v0.4s}, [x2], #16 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[1], [x7], x1 + st1 {v0.s}[2], [x0], x1 + st1 {v0.s}[3], [x7], x1 + b.le 0f + b 24b +210: + add w4, w4, #4 + asr x1, x1, #1 +22: + subs w4, w4, #1 + ld1 {v0.s}[0], [x2], #4 + st1 {v0.s}[0], [x0], x1 + b.gt 22b +0: + ret + +30: + ldr w5, [x2] + ldrh w6, [x2, #4] + add x2, x2, #6 + subs w4, w4, #1 + str w5, [x0] + strh w6, [x0, #4] + add x0, x0, x1 + b.gt 30b + ret + +40: + add x7, x0, x1 + lsl x1, x1, #1 +42: + subs w4, w4, #2 + b.lt 41f + ld1 {v0.2d}, [x2], #16 + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x7], x1 + b.le 0f + b 42b +41: + ld1 {v0.4h}, [x2] + st1 {v0.4h}, [x0] +0: + ret + +50: + ldr x5, [x2] + ldrh w6, [x2, #8] + add x2, x2, #10 + subs w4, w4, #1 + str x5, [x0] + strh w6, [x0, #8] + add x0, x0, x1 + b.gt 50b + ret + +60: + ldr x5, [x2] + ldr w6, [x2, #8] + add x2, x2, #12 + subs w4, w4, #1 + str x5, [x0] + str w6, [x0, #8] + add x0, x0, x1 + b.gt 60b + ret + +70: + ldr x5, [x2] + ldr w6, [x2, #8] + ldrh w7, [x2, #12] + add x2, x2, #14 + subs w4, w4, #1 + str x5, [x0] + str w6, [x0, #8] + strh w7, [x0, #12] + add x0, x0, x1 + b.gt 70b + ret + +L(copy_narrow_tbl): + .hword 0 + .hword L(copy_narrow_tbl) - 10b + .hword L(copy_narrow_tbl) - 20b + .hword L(copy_narrow_tbl) - 30b + .hword L(copy_narrow_tbl) - 40b + .hword L(copy_narrow_tbl) - 50b + .hword L(copy_narrow_tbl) - 60b + .hword L(copy_narrow_tbl) - 70b +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. 
+ tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 +1: + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w5, #14 + bic w13, w13, #7 + sub x4, x4, w13, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + sub x12, x12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #4 + + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + sub x12, x12, #4 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 2x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + sub x12, x12, #4 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + +2: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + cmp w5, #6 + b.ge 5f // If w >= 6, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro ext_n dst1, dst2, src1, src2, src3, n, w + ext \dst1, \src1, \src2, \n +.if \w > 4 + ext \dst2, \src2, \src3, \n +.endif +.endm +.macro add_n dst1, dst2, src1, src2, src3, src4, w + add \dst1, \src1, \src3 +.if \w > 4 + add \dst2, \src2, \src4 +.endif +.endm + +.macro add3 w, wd + ext v24.16b, v0.16b, v1.16b, #2 + ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v16.16b, v17.16b, #2 + ext v27.16b, v16.16b, v17.16b, #4 + add v6\wd, v0\wd, v24\wd + add v7\wd, v16\wd, v26\wd + add v6\wd, v6\wd, v25\wd + add v7\wd, v7\wd, v27\wd + + ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w + ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w + + add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w + + ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w + ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w + + add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.endm + add3 8, .8h + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + mov v2.16b, v4.16b + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + mov v18.16b, v20.16b + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 6 <= w < 10 + add3 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + + subs w5, w5, #4 // 2 <= w < 6 + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + +6: // Pad the right edge and produce the last few pixels. + // 2 <= w < 6, 2-5 pixels valid in v0 + sub w13, w5, #2 + // w13 = (pixels valid - 2) + adr x14, L(box3_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. 
+22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v16.16b, v16.16b, v31.16b, #12 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v16.16b, v16.16b, v16.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v16.16b, v16.16b, v31.16b, #10 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + ext v0.16b, v0.16b, v30.16b, #8 + ext v16.16b, v16.16b, v31.16b, #8 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #10 + ext v16.16b, v16.16b, v16.16b, #10 + ext v0.16b, v0.16b, v30.16b, #6 + ext v16.16b, v16.16b, v31.16b, #6 + b 88f + +L(box3_variable_shift_tbl): + .hword L(box3_variable_shift_tbl) - 22b + .hword L(box3_variable_shift_tbl) - 33b + .hword L(box3_variable_shift_tbl) - 44b + .hword L(box3_variable_shift_tbl) - 55b + +88: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + + add3 4, .4h + subs w5, w5, #4 + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + mov v2.16b, v3.16b + mov v3.16b, v4.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + // Only one needed pixel left, but do a normal 4 pixel + // addition anyway + add3 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + // Subtract the number of pixels read from the input from the stride. + tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + add w14, w5, #13 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 + add w14, w5, #15 +1: + sub x9, x9, w13, uxtw #1 + bic w14, w14, #7 + sub x4, x4, w14, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + sub x12, x12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #6 + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. 
+ sub x3, x3, #6 + sub x12, x12, #6 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 6 bytes we loaded before, + // which we shifted out. + sub x3, x3, #6 + sub x12, x12, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + +2: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can produce 4 pixels + b 6f + +4: // Loop horizontally +.macro add5 w, wd + ext v24.16b, v0.16b, v1.16b, #2 + ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v0.16b, v1.16b, #6 + ext v27.16b, v0.16b, v1.16b, #8 + + add v6\wd, v0\wd, v24\wd + add v25\wd, v25\wd, v26\wd + add v6\wd, v6\wd, v27\wd + + ext v26.16b, v16.16b, v17.16b, #2 + ext v27.16b, v16.16b, v17.16b, #4 + ext v28.16b, v16.16b, v17.16b, #6 + ext v29.16b, v16.16b, v17.16b, #8 + + add v7\wd, v16\wd, v26\wd + add v27\wd, v27\wd, v28\wd + add v7\wd, v7\wd, v29\wd + add v6\wd, v6\wd, v25\wd + add v7\wd, v7\wd, v27\wd + + ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w + ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w + ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w + + add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w + add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w + + ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w + ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w + ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w + + add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w + add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.endm + add5 8, .8h + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + mov v2.16b, v4.16b + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + mov v18.16b, v20.16b + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +5: // Produce 4 pixels, 7 <= w < 11 + add5 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + + subs w5, w5, #4 // 3 <= w < 7 + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + +6: // Pad the right edge and produce the last few pixels. + // w < 7, w+1 pixels valid in v0/v4 + sub w13, w5, #1 + // w13 = pixels valid - 2 + adr x14, L(box5_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + mov v1.16b, v30.16b + mov v17.16b, v31.16b + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v16.16b, v16.16b, v31.16b, #12 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v16.16b, v16.16b, v16.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v16.16b, v16.16b, v31.16b, #10 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + ext v0.16b, v0.16b, v30.16b, #8 + ext v16.16b, v16.16b, v31.16b, #8 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #10 + ext v16.16b, v16.16b, v16.16b, #10 + ext v0.16b, v0.16b, v30.16b, #6 + ext v16.16b, v16.16b, v31.16b, #6 + b 88f +66: // 6 pixels valid + ext v0.16b, v0.16b, v0.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v0.16b, v0.16b, v30.16b, #4 + ext v16.16b, v16.16b, v31.16b, #4 + b 88f +77: // 7 pixels valid + ext v0.16b, v0.16b, v0.16b, #14 + ext v16.16b, v16.16b, v16.16b, #14 + ext v0.16b, v0.16b, v30.16b, #2 + ext v16.16b, v16.16b, v31.16b, #2 + b 88f + +L(box5_variable_shift_tbl): + .hword L(box5_variable_shift_tbl) - 22b + .hword L(box5_variable_shift_tbl) - 33b + .hword L(box5_variable_shift_tbl) - 44b + .hword L(box5_variable_shift_tbl) - 55b + .hword L(box5_variable_shift_tbl) - 66b + .hword L(box5_variable_shift_tbl) - 77b + +88: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + add5 4, .4h + subs w5, w5, #4 + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + mov v2.16b, v3.16b + mov v3.16b, v4.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + add5 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add5 +endfunc + +sgr_funcs 16 |
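
For the last two functions above, sgr_box3_h_16bpc_neon and sgr_box5_h_16bpc_neon, a similarly hedged scalar sketch of the per-row work follows. The assembly additionally interleaves two rows, handles the LR_HAVE_LEFT/RIGHT padding and widens w by 2, all of which is omitted here; the names are illustrative.

    #include <stdint.h>

    // For every output position, accumulate the sum and the sum of squares of
    // an n-wide horizontal window of source pixels (n = 3 for box3, 5 for box5).
    static void sgr_boxN_h_scalar(int32_t *sumsq, int16_t *sum,
                                  const uint16_t *src, int w, int n) {
        for (int x = 0; x < w; x++) {
            int32_t s = 0, ss = 0;
            for (int k = 0; k < n; k++) {
                const int32_t p = src[x + k];
                s  += p;
                ss += p * p;
            }
            sum[x]   = (int16_t)s;  // at most 5 * 4095 for 12 bpc, fits in int16
            sumsq[x] = ss;
        }
    }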