/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"

// pad_top_bottom s1, s2, w, stride, rn, rw, ret
//
// Widen two rows of 8-bit pixels to 16 bits and write them, together with
// two extra columns on each side, to the row at x0 and to the row after it.
//
//   s1, s2   X registers pointing at the first and second source row
//   w        number of pixels per row inside the block
//   stride   length of one destination row, in 16-bit units
//   rn       register prefix that loads w bytes (s for w=4, d for w=8 as
//            instantiated in this file)
//   rw       register prefix that stores w halfwords (d for w=4, q for w=8)
//   ret      1: return from the enclosing function when done
//            0: step x0 past the second row and continue at the local
//               label 3 that this macro emits as its last statement
//
// Fixed-register inputs:
//   x0   destination pointer
//   w7   edge flags; bit 0 (CDEF_HAVE_LEFT) and bit 1 (CDEF_HAVE_RIGHT)
//        are tested here
//   s31  two halfwords written into each side column pair whose edge is
//        missing
//
// Modifies v0-v3, x0 and the condition flags. When CDEF_HAVE_LEFT is set,
// s1 and s2 are moved back by two bytes.
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
        tst     w7, #1                          // CDEF_HAVE_LEFT
        b.eq    2f

        // CDEF_HAVE_LEFT: begin reading two pixels left of the block.
        sub     \s1, \s1, #2
        sub     \s2, \s2, #2
        tst     w7, #2                          // CDEF_HAVE_RIGHT
        b.eq    1f

        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Each row: w bytes, then 4 more bytes, all taken from the source.
        ldr     \rn\()0, [\s1]
        ldr     s1, [\s1, #\w]
        ldr     \rn\()2, [\s2]
        ldr     s3, [\s2, #\w]
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        uxtl    v2.8h, v2.8b
        uxtl    v3.8h, v3.8b
        str     \rw\()0, [x0]
        str     d1, [x0, #2*\w]
        add     x0, x0, #2*\stride
        str     \rw\()2, [x0]
        str     d3, [x0, #2*\w]
.if \ret
        ret
.else
        add     x0, x0, #2*\stride
        b       3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Each row: w bytes, then 2 more bytes from the source; the last
        // two halfwords of the row come from s31.
        ldr     \rn\()0, [\s1]
        ldr     h1, [\s1, #\w]
        ldr     \rn\()2, [\s2]
        ldr     h3, [\s2, #\w]
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        uxtl    v2.8h, v2.8b
        uxtl    v3.8h, v3.8b
        str     \rw\()0, [x0]
        str     s1, [x0, #2*\w]
        str     s31, [x0, #2*\w+4]
        add     x0, x0, #2*\stride
        str     \rw\()2, [x0]
        str     s3, [x0, #2*\w]
        str     s31, [x0, #2*\w+4]
.if \ret
        ret
.else
        add     x0, x0, #2*\stride
        b       3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst     w7, #2                          // CDEF_HAVE_RIGHT
        b.eq    1f

        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Each row: s31 for the first two halfwords, then w pixels and
        // two more pixels from the source.
        ldr     \rn\()0, [\s1]
        ldr     h1, [\s1, #\w]
        ldr     \rn\()2, [\s2]
        ldr     h3, [\s2, #\w]
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        uxtl    v2.8h, v2.8b
        uxtl    v3.8h, v3.8b
        str     s31, [x0]
        stur    \rw\()0, [x0, #4]
        str     s1, [x0, #4+2*\w]
        add     x0, x0, #2*\stride
        str     s31, [x0]
        stur    \rw\()2, [x0, #4]
        str     s3, [x0, #4+2*\w]
.if \ret
        ret
.else
        add     x0, x0, #2*\stride
        b       3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only the w block pixels are read; both side column pairs of each
        // row come from s31.
        ldr     \rn\()0, [\s1]
        ldr     \rn\()1, [\s2]
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        str     s31, [x0]
        stur    \rw\()0, [x0, #4]
        str     s31, [x0, #4+2*\w]
        add     x0, x0, #2*\stride
        str     s31, [x0]
        stur    \rw\()1, [x0, #4]
        str     s31, [x0, #4+2*\w]
.if \ret
        ret
.else
        add     x0, x0, #2*\stride
.endif
3:
.endm

// load_n_incr dst, src, incr, w
//
// Load one row of w pixels from [src] into dst and post-increment src by
// the register incr. For w == 4 the four bytes go into 32-bit lane 0 of
// dst; for any other w eight bytes are loaded into the low half of dst.
.macro load_n_incr dst, src, incr, w
.if \w == 4
        ld1     {\dst\().s}[0], [\src], \incr
.else
        ld1     {\dst\().8b}, [\src], \incr
.endif
.endm

// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const
//                                    top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);
//
// padding_func w, stride, rn, rw
//
// Emits cdef_padding<w>_8bpc_neon, which copies a block of 8-bit pixels plus
// a two-pixel border into a 16-bit buffer.
//
//   w        block width in pixels
//   stride   length of one tmp row, in 16-bit units
//   rn, rw   register prefixes for loading w bytes / storing w halfwords;
//            rn and rw are forwarded to pad_top_bottom, rw is also used for
//            the rows of the middle section
//
// Arguments in prototype order: x0 = tmp, x1 = src, x2 = src_stride,
// x3 = left, x4 = top, x5 = bottom, w6 = h, w7 = edges.
//
// Edge flag bits tested: 1 = CDEF_HAVE_LEFT, 2 = CDEF_HAVE_RIGHT,
// 4 = CDEF_HAVE_TOP, 8 = CDEF_HAVE_BOTTOM. With all four set (0xf) the
// function branches to cdef_padding<w>_edged_8bpc_neon instead.
//
// Border positions whose edge is missing are filled with 0x8000, held in
// every halfword of v30 and v31.
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_8bpc_neon, export=1
        cmp     w7, #0xf                        // fully edged
        b.eq    cdef_padding\w\()_edged_8bpc_neon

        movi    v30.8h, #0x80, lsl #8           // 0x8000 in each halfword
        mov     v31.16b, v30.16b
        sub     x0, x0, #2*(2*\stride+2)        // back up two rows and two columns
        tst     w7, #4                          // CDEF_HAVE_TOP
        b.ne    1f

        // !CDEF_HAVE_TOP: fill two rows of stride halfwords each.
        st1     {v30.8h, v31.8h}, [x0], #32
.if \w == 8
        st1     {v30.8h, v31.8h}, [x0], #32
.endif
        b       3f

1:
        // CDEF_HAVE_TOP: rows at top and top + src_stride.
        add     x9, x4, x2
        pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0

        // Middle section: one iteration per row, w6 counts the rows left.
3:
        tst     w7, #1                          // CDEF_HAVE_LEFT
        b.eq    2f

        // CDEF_HAVE_LEFT
        tst     w7, #2                          // CDEF_HAVE_RIGHT
        b.eq    1f

        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ld1     {v0.h}[0], [x3], #2             // two pixels from left
        ldr     h2, [x1, #\w]                   // two pixels right of the row
        load_n_incr v1, x1, x2, \w
        subs    w6, w6, #1
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        uxtl    v2.8h, v2.8b
        str     s0, [x0]
        stur    \rw\()1, [x0, #4]
        str     s2, [x0, #4+2*\w]
        add     x0, x0, #2*\stride
        b.gt    0b
        b       3f

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ld1     {v0.h}[0], [x3], #2             // two pixels from left
        load_n_incr v1, x1, x2, \w
        subs    w6, w6, #1
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        str     s0, [x0]
        stur    \rw\()1, [x0, #4]
        str     s31, [x0, #4+2*\w]              // fill on the right
        add     x0, x0, #2*\stride
        b.gt    1b
        b       3f

2:
        tst     w7, #2                          // CDEF_HAVE_RIGHT
        b.eq    1f

        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ldr     h1, [x1, #\w]                   // two pixels right of the row
        load_n_incr v0, x1, x2, \w
        subs    w6, w6, #1
        uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        str     s31, [x0]                       // fill on the left
        stur    \rw\()0, [x0, #4]
        str     s1, [x0, #4+2*\w]
        add     x0, x0, #2*\stride
        b.gt    0b
        b       3f

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        load_n_incr v0, x1, x2, \w
        subs    w6, w6, #1
        uxtl    v0.8h, v0.8b
        str     s31, [x0]                       // fill on the left
        stur    \rw\()0, [x0, #4]
        str     s31, [x0, #4+2*\w]              // fill on the right
        add     x0, x0, #2*\stride
        b.gt    1b

3:
        tst     w7, #8                          // CDEF_HAVE_BOTTOM
        b.ne    1f

        // !CDEF_HAVE_BOTTOM: fill two rows of stride halfwords each.
        st1     {v30.8h, v31.8h}, [x0], #32
.if \w == 8
        st1     {v30.8h, v31.8h}, [x0], #32
.endif
        ret

1:
        // CDEF_HAVE_BOTTOM: rows at bottom and bottom + src_stride.
        // pad_top_bottom is invoked with ret=1 and returns on every path.
        add     x9, x5, x2
        pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1
endfunc
.endm

padding_func 8, 16, d, q
padding_func 4, 8, s, d

// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
//                                    ptrdiff_t
//                                    src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);
//
// padding_func_edged w, stride, reg
//
// Emits cdef_padding<w>_edged_8bpc_neon. No edge flag is tested here: every
// border pixel is read from top, bottom, left or next to src, and tmp keeps
// 8-bit pixels.
//
//   w        block width in pixels
//   stride   length of one tmp row, in bytes
//   reg      register prefix that stores w bytes (s for w=4, d for w=8 as
//            instantiated below)
//
// Arguments in prototype order: x0 = tmp, x1 = src, x2 = src_stride,
// x3 = left, x4 = top, x5 = bottom, w6 = h; w7 (edges) is not read.
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
        sub     x4, x4, #2                      // top    -= 2 pixels
        sub     x5, x5, #2                      // bottom -= 2 pixels
        sub     x0, x0, #(2*\stride+2)          // back up two rows and two columns

        // Two rows above the block.
.if \w == 4
        ldr     d0, [x4]
        ldr     d1, [x4, x2]
        st1     {v0.8b, v1.8b}, [x0], #16
.else
        add     x9, x4, x2
        ldr     d0, [x4]
        ldr     s1, [x4, #8]
        ldr     d2, [x9]
        ldr     s3, [x9, #8]
        str     d0, [x0]
        str     s1, [x0, #8]
        str     d2, [x0, #\stride]
        str     s3, [x0, #\stride+8]
        add     x0, x0, #2*\stride
.endif

        // Block rows: two pixels from left, w pixels from src, two pixels
        // right of the row. w6 counts the rows left.
0:
        ld1     {v0.h}[0], [x3], #2
        ldr     h2, [x1, #\w]
        load_n_incr v1, x1, x2, \w
        subs    w6, w6, #1
        str     h0, [x0]
        stur    \reg\()1, [x0, #2]
        str     h2, [x0, #2+\w]
        add     x0, x0, #\stride
        b.gt    0b

        // Two rows below the block.
.if \w == 4
        ldr     d0, [x5]
        ldr     d1, [x5, x2]
        st1     {v0.8b, v1.8b}, [x0], #16
.else
        add     x9, x5, x2
        ldr     d0, [x5]
        ldr     s1, [x5, #8]
        ldr     d2, [x9]
        ldr     s3, [x9, #8]
        str     d0, [x0]
        str     s1, [x0, #8]
        str     d2, [x0, #\stride]
        str     s3, [x0, #\stride+8]
.endif
        ret
endfunc
.endm

padding_func_edged 8, 16, d
padding_func_edged 4, 8, s

// tables, filter and find_dir are not defined in the visible part of this
// file; presumably they are macros from cdef_tmpl.S (to be confirmed).
tables

filter 8, 8
filter 4, 8

find_dir 8

// load_px_8 d1, d2, w
//
// Gather the pixels at (x + off) into d1 and at (x - off) into d2, where
// x2 is the current position in the 8-bit tmp buffer and the low byte of w9
// is the sign-extended offset.
//   w == 8: two rows of 8 bytes, 16 bytes apart
//   else:   four rows of 4 bytes, 8 bytes apart
// Modifies x6 and x9.
.macro load_px_8 d1, d2, w
.if \w == 8
        add     x6, x2, w9, sxtb                // x + off
        sub     x9, x2, w9, sxtb                // x - off
        ld1     {\d1\().d}[0], [x6]             // p0
        add     x6, x6, #16                     // += stride
        ld1     {\d2\().d}[0], [x9]             // p1
        add     x9, x9, #16                     // += stride
        ld1     {\d1\().d}[1], [x6]             // p0
        ld1     {\d2\().d}[1], [x9]             // p1
.else
        add     x6, x2, w9, sxtb                // x + off
        sub     x9, x2, w9, sxtb                // x - off
        ld1     {\d1\().s}[0], [x6]             // p0
        add     x6, x6, #8                      // += stride
        ld1     {\d2\().s}[0], [x9]             // p1
        add     x9, x9, #8                      // += stride
        ld1     {\d1\().s}[1], [x6]             // p0
        add     x6, x6, #8                      // += stride
        ld1     {\d2\().s}[1], [x9]             // p1
        add     x9, x9, #8                      // += stride
        ld1     {\d1\().s}[2], [x6]             // p0
        add     x6, x6, #8                      // += stride
        ld1     {\d2\().s}[2], [x9]             // p1
        add     x9, x9, #8                      // += stride
        ld1     {\d1\().s}[3], [x6]             // p0
        ld1     {\d2\().s}[3], [x9]             // p1
.endif
.endm

// handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
//
// Accumulate one tap for the pixel pair s1 (p0) / s2 (p1) against px in v0.
//
//   s1, s2       vectors holding p0 and p1
//   thresh_vec   vector with the strength (threshold) in every byte
//   shift        vector with -shift in every byte (ushl by a negative
//                amount shifts right)
//   tap          W register holding the tap weight
//   min          nonzero: also fold p0/p1 into the running min (v3) and
//                max (v4)
//
// The p0 contribution is added to v1 and the p1 contribution to v2.
// Modifies v16-v22.
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
        umin    v3.16b, v3.16b, \s1\().16b
        umax    v4.16b, v4.16b, \s1\().16b
        umin    v3.16b, v3.16b, \s2\().16b
        umax    v4.16b, v4.16b, \s2\().16b
.endif
        uabd    v16.16b, v0.16b, \s1\().16b     // abs(diff)
        uabd    v20.16b, v0.16b, \s2\().16b     // abs(diff)
        ushl    v17.16b, v16.16b, \shift        // abs(diff) >> shift
        ushl    v21.16b, v20.16b, \shift        // abs(diff) >> shift
        uqsub   v17.16b, \thresh_vec, v17.16b   // clip = imax(0, threshold - (abs(diff) >> shift))
        uqsub   v21.16b, \thresh_vec, v21.16b   // clip = imax(0, threshold - (abs(diff) >> shift))
        cmhi    v18.16b, v0.16b, \s1\().16b     // px > p0
        cmhi    v22.16b, v0.16b, \s2\().16b     // px > p1
        umin    v17.16b, v17.16b, v16.16b       // imin(abs(diff), clip)
        umin    v21.16b, v21.16b, v20.16b       // imin(abs(diff), clip)
        dup     v19.16b, \tap                   // taps[k]
        neg     v16.16b, v17.16b                // -imin()
        neg     v20.16b, v21.16b                // -imin()
        bsl     v18.16b, v16.16b, v17.16b       // constrain() = apply_sign()
        bsl     v22.16b, v20.16b, v21.16b       // constrain() = apply_sign()
        mla     v1.16b, v18.16b, v19.16b        // sum += taps[k] * constrain()
        mla     v2.16b, v22.16b, v19.16b        // sum += taps[k] * constrain()
.endm

// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                   const uint8_t *tmp, int pri_strength,
//                                   int sec_strength, int dir, int damping,
//                                   int h);
//
// filter_func_8 w, pri, sec, min, suffix
//
// Emits cdef_filter<w><suffix>_edged_8bpc_neon (not exported), working on
// the 8-bit tmp buffer.
//
//   w        block width in pixels (8: two rows per iteration, tmp rows 16
//            bytes apart; otherwise four rows per iteration, 8 bytes apart)
//   pri      nonzero: apply the primary taps
//   sec      nonzero: apply the secondary taps
//   min      nonzero: clamp the result to the min/max of px and all taps
//   suffix   text inserted into the function name
//
// Arguments in prototype order: x0 = dst, x1 = dst_stride, x2 = tmp,
// w3 = pri_strength, w4 = sec_strength, w5 = dir, w6 = damping, w7 = h.
//
// Register roles inside the function:
//   x5        pointer into directions<w>, starting at 2*dir
//   x8        pointer into pri_taps, starting at 2*(pri_strength & 1)
//   w11       secondary tap weight and inner loop counter (2, then 1)
//   v0        px
//   v1, v2    the two 8-bit partial sums
//   v3, v4    running min / max (only when min is set)
//   v24, v25  primary -shift / threshold
//   v26, v27  secondary -shift / threshold
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
        movrel  x8, pri_taps
        and     w9, w3, #1
        add     x8, x8, w9, uxtw #1
.endif
        movrel  x9, directions\w
        add     x5, x9, w5, uxtw #1
        movi    v30.8b, #7
        dup     v28.8b, w6                      // damping

.if \pri
        dup     v25.16b, w3                     // threshold
.endif
.if \sec
        dup     v27.16b, w4                     // threshold
.endif
        // Byte 0 takes the primary threshold, byte 1 the secondary one, so
        // that both shifts are computed in one pass.
        trn1    v24.8b, v25.8b, v27.8b
        clz     v24.8b, v24.8b                  // clz(threshold)
        sub     v24.8b, v30.8b, v24.8b          // ulog2(threshold)
        uqsub   v24.8b, v28.8b, v24.8b          // shift = imax(0, damping - ulog2(threshold))
        neg     v24.8b, v24.8b                  // -shift
.if \sec
        dup     v26.16b, v24.b[1]
.endif
.if \pri
        dup     v24.16b, v24.b[0]
.endif

1:
.if \w == 8
        add     x12, x2, #16
        ld1     {v0.d}[0], [x2]                 // px
        ld1     {v0.d}[1], [x12]                // px
.else
        add     x12, x2, #1*8
        add     x13, x2, #2*8
        add     x14, x2, #3*8
        ld1     {v0.s}[0], [x2]                 // px
        ld1     {v0.s}[1], [x12]                // px
        ld1     {v0.s}[2], [x13]                // px
        ld1     {v0.s}[3], [x14]                // px
.endif

        // We need 9 bits or two 8-bit accumulators to fit the sum.
        // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
        // Start sum at -1 instead of 0 to help handle rounding later.
        movi    v1.16b, #255                    // sum
        movi    v2.16b, #0                      // sum
.if \min
        mov     v3.16b, v0.16b                  // min
        mov     v4.16b, v0.16b                  // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov     w11, #2                         // sec_taps[0]

2:
.if \pri
        ldrb    w9, [x5]                        // off1
        load_px_8 v5, v6, \w
.endif

.if \sec
        add     x5, x5, #4                      // +2*2
        ldrb    w9, [x5]                        // off2
        load_px_8 v28, v29, \w
.endif

.if \pri
        ldrb    w10, [x8]                       // *pri_taps
        handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
.endif

.if \sec
        add     x5, x5, #8                      // +2*4
        ldrb    w9, [x5]                        // off3
        load_px_8 v5, v6, \w

        handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
        handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min

        sub     x5, x5, #11                     // x5 -= 2*(2+4); x5 += 1;
.else
        add     x5, x5, #1                      // x5 += 1
.endif
        subs    w11, w11, #1                    // sec_tap-- (value)
.if \pri
        add     x8, x8, #1                      // pri_taps++ (pointer)
.endif
        b.ne    2b

        // Perform halving adds since the value will not fit otherwise.
        // To handle the offset for negative values, use both halving
        // with and without rounding.
        srhadd  v5.16b, v1.16b, v2.16b          // sum >> 1
        shadd   v6.16b, v1.16b, v2.16b          // (sum - 1) >> 1
        cmlt    v1.16b, v5.16b, #0              // sum < 0
        bsl     v1.16b, v6.16b, v5.16b          // (sum - (sum < 0)) >> 1

        srshr   v1.16b, v1.16b, #3              // (8 + sum - (sum < 0)) >> 4

        usqadd  v0.16b, v1.16b                  // px + (8 + sum ...) >> 4
.if \min
        umin    v0.16b, v0.16b, v4.16b
        umax    v0.16b, v0.16b, v3.16b          // iclip(px + .., min, max)
.endif
.if \w == 8
        st1     {v0.d}[0], [x0], x1
        add     x2, x2, #2*16                   // tmp += 2*tmp_stride
        subs    w7, w7, #2                      // h -= 2
        st1     {v0.d}[1], [x0], x1
.else
        st1     {v0.s}[0], [x0], x1
        add     x2, x2, #4*8                    // tmp += 4*tmp_stride
        st1     {v0.s}[1], [x0], x1
        subs    w7, w7, #4                      // h -= 4
        st1     {v0.s}[2], [x0], x1
        st1     {v0.s}[3], [x0], x1
.endif

        // Reset pri_taps and directions back to the original point.
        // These plain sub instructions leave the flags from "subs w7"
        // intact for the branch below.
        sub     x5, x5, #2
.if \pri
        sub     x8, x8, #2
.endif

        b.gt    1b
        ret
endfunc
.endm

// filter_8 w: emit the three variants (_pri, _sec, _pri_sec) for width w.
// Only the combined variant keeps the min/max clamp.
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm

filter_8 8
filter_8 4