Diffstat (limited to 'third_party/dav1d/src/arm/32/cdef.S')
-rw-r--r-- | third_party/dav1d/src/arm/32/cdef.S | 540
1 file changed, 540 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/cdef.S b/third_party/dav1d/src/arm/32/cdef.S
new file mode 100644
index 0000000000..4a0df6eac8
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef.S
@@ -0,0 +1,540 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
+        tst             r7,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldrh            r12, [\s1, #-2]
+        vldr            \n1, [\s1]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s1, #\w]
+        vmov.16         d4[1], r12
+        ldrh            r12, [\s2, #-2]
+        vldr            \n2, [\s2]
+        vmov.16         d4[2], r12
+        ldrh            r12, [\s2, #\w]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[3], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s10, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s11, [r0, #2*\w]
+.if \ret
+        pop             {r4-r8,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldrh            r12, [\s1, #-2]
+        vldr            \n1, [\s1]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s2, #-2]
+        vldr            \n2, [\s2]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[1], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s9,  [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r8,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        vldr            \n1, [\s1]
+        ldrh            r12, [\s1, #\w]
+        vldr            \n2, [\s2]
+        vdup.16         d4,  r12
+        ldrh            r12, [\s2, #\w]
+        vmovl.u8        q0,  d0
+        vmov.16         d4[1], r12
+        vmovl.u8        q1,  d2
+        vmovl.u8        q2,  d4
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s8,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+.if \ret
+        pop             {r4-r8,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vldr            \n1, [\s1]
+        vldr            \n2, [\s2]
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r8,pc}
+.else
+        add             r0,  r0,  #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+        vld1.32         {\dst\()[0]}, [\src, :32], \incr
+.else
+        vld1.8          {\dst\()},    [\src, :64], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+//                                    ptrdiff_t src_stride, const pixel (*left)[2],
+//                                    const pixel *const top,
+//                                    const pixel *const bottom, int h,
+//                                    enum CdefEdgeFlags edges);
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro padding_func w, stride, n1, w1, n2, w2, align
+function cdef_padding\w\()_8bpc_neon, export=1
+        push            {r4-r8,lr}
+        ldrd            r4,  r5,  [sp, #24]
+        ldrd            r6,  r7,  [sp, #32]
+        cmp             r7,  #0xf // fully edged
+        beq             cdef_padding\w\()_edged_8bpc_neon
+        vmov.i16        q3,  #0x8000
+        tst             r7,  #4   // CDEF_HAVE_TOP
+        bne             1f
+        // !CDEF_HAVE_TOP
+        sub             r12, r0,  #2*(2*\stride+2)
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3},  [r12]!
+.if \w == 8
+        vst1.16         {q2,q3},  [r12]!
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        add             r8,  r4,  r2
+        sub             r0,  r0,  #2*(2*\stride)
+        pad_top_bottom  r4,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+
+        // Middle section
+3:
+        tst             r7,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        vld1.16         {d2[]},   [r3, :16]!
+        ldrh            r12, [r1, #\w]
+        load_n_incr     d0,  r1,  r2,  \w
+        subs            r6,  r6,  #1
+        vmov.16         d2[1], r12
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s5,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vld1.16         {d2[]},   [r3, :16]!
+        load_n_incr     d0,  r1,  r2,  \w
+        subs            r6,  r6,  #1
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+        b               3f
+2:
+        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldrh            r12, [r1, #\w]
+        load_n_incr     d0,  r1,  r2,  \w
+        vdup.16         d2,  r12
+        subs            r6,  r6,  #1
+        vmovl.u8        q0,  d0
+        vmovl.u8        q1,  d2
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s4,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        load_n_incr     d0,  r1,  r2,  \w
+        subs            r6,  r6,  #1
+        vmovl.u8        q0,  d0
+        vstr            s12, [r0, #-4]
+        vst1.16         {\w1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+
+3:
+        tst             r7,  #8 // CDEF_HAVE_BOTTOM
+        bne             1f
+        // !CDEF_HAVE_BOTTOM
+        sub             r12, r0,  #4
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3},  [r12]!
+.if \w == 8
+        vst1.16         {q2,q3},  [r12]!
+.endif
+        pop             {r4-r8,pc}
+1:
+        // CDEF_HAVE_BOTTOM
+        add             r8,  r5,  r2
+        pad_top_bottom  r5,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+endfunc
+.endm
+
+padding_func 8, 16, d0, q0, d2, q1, 128
+padding_func 4, 8,  s0, d0, s4, d2, 64
+
+// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
+//                                    ptrdiff_t src_stride, const pixel (*left)[2],
+//                                    const pixel *const top,
+//                                    const pixel *const bottom, int h,
+//                                    enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg, align
+function cdef_padding\w\()_edged_8bpc_neon
+        sub             r0,  r0,  #(2*\stride)
+
+        ldrh            r12, [r4, #-2]
+        vldr            \reg, [r4]
+        add             r8,  r4,  r2
+        strh            r12, [r0, #-2]
+        ldrh            r12, [r4, #\w]
+        vstr            \reg, [r0]
+        strh            r12, [r0, #\w]
+
+        ldrh            r12, [r8, #-2]
+        vldr            \reg, [r8]
+        strh            r12, [r0, #\stride-2]
+        ldrh            r12, [r8, #\w]
+        vstr            \reg, [r0, #\stride]
+        strh            r12, [r0, #\stride+\w]
+        add             r0,  r0,  #2*\stride
+
+0:
+        ldrh            r12, [r3], #2
+        vldr            \reg, [r1]
+        str             r12, [r0, #-2]
+        ldrh            r12, [r1, #\w]
+        add             r1,  r1,  r2
+        subs            r6,  r6,  #1
+        vstr            \reg, [r0]
+        str             r12, [r0, #\w]
+        add             r0,  r0,  #\stride
+        bgt             0b
+
+        ldrh            r12, [r5, #-2]
+        vldr            \reg, [r5]
+        add             r8,  r5,  r2
+        strh            r12, [r0, #-2]
+        ldrh            r12, [r5, #\w]
+        vstr            \reg, [r0]
+        strh            r12, [r0, #\w]
+
+        ldrh            r12, [r8, #-2]
+        vldr            \reg, [r8]
+        strh            r12, [r0, #\stride-2]
+        ldrh            r12, [r8, #\w]
+        vstr            \reg, [r0, #\stride]
+        strh            r12, [r0, #\stride+\w]
+
+        pop             {r4-r8,pc}
+endfunc
+.endm
+
+padding_func_edged 8, 16, d0, 64
+padding_func_edged 4, 8,  s0, 32
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d11, d12, d21, d22, w
+.if \w == 8
+        add             r6,  r2,  r9   // x + off
+        sub             r9,  r2,  r9   // x - off
+        vld1.8          {\d11}, [r6]   // p0
+        add             r6,  r6,  #16  // += stride
+        vld1.8          {\d21}, [r9]   // p1
+        add             r9,  r9,  #16  // += stride
+        vld1.8          {\d12}, [r6]   // p0
+        vld1.8          {\d22}, [r9]   // p1
+.else
+        add             r6,  r2,  r9      // x + off
+        sub             r9,  r2,  r9      // x - off
+        vld1.32         {\d11[0]}, [r6]   // p0
+        add             r6,  r6,  #8      // += stride
+        vld1.32         {\d21[0]}, [r9]   // p1
+        add             r9,  r9,  #8      // += stride
+        vld1.32         {\d11[1]}, [r6]   // p0
+        add             r6,  r6,  #8      // += stride
+        vld1.32         {\d21[1]}, [r9]   // p1
+        add             r9,  r9,  #8      // += stride
+        vld1.32         {\d12[0]}, [r6]   // p0
+        add             r6,  r6,  #8      // += stride
+        vld1.32         {\d22[0]}, [r9]   // p1
+        add             r9,  r9,  #8      // += stride
+        vld1.32         {\d12[1]}, [r6]   // p0
+        vld1.32         {\d22[1]}, [r9]   // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+        vmin.u8         q3,  q3,  \s1
+        vmax.u8         q4,  q4,  \s1
+        vmin.u8         q3,  q3,  \s2
+        vmax.u8         q4,  q4,  \s2
+.endif
+        vabd.u8         q8,  q0,  \s1          // abs(diff)
+        vabd.u8         q11, q0,  \s2          // abs(diff)
+        vshl.u8         q9,  q8,  \shift       // abs(diff) >> shift
+        vshl.u8         q12, q11, \shift       // abs(diff) >> shift
+        vqsub.u8        q9,  \thresh_vec, q9   // clip = imax(0, threshold - (abs(diff) >> shift))
+        vqsub.u8        q12, \thresh_vec, q12  // clip = imax(0, threshold - (abs(diff) >> shift))
+        vcgt.u8         q10, q0,  \s1          // px > p0
+        vcgt.u8         q13, q0,  \s2          // px > p1
+        vmin.u8         q9,  q9,  q8           // imin(abs(diff), clip)
+        vmin.u8         q12, q12, q11          // imin(abs(diff), clip)
+        vneg.s8         q8,  q9                // -imin()
+        vneg.s8         q11, q12               // -imin()
+        vbsl            q10, q8,  q9           // constrain() = imax(imin(diff, clip), -clip)
+        vdup.8          d18, \tap              // taps[k]
+        vbsl            q13, q11, q12          // constrain() = imax(imin(diff, clip), -clip)
+        vmlal.s8        q1,  d20, d18          // sum += taps[k] * constrain()
+        vmlal.s8        q1,  d26, d18          // sum += taps[k] * constrain()
+        vmlal.s8        q2,  d21, d18          // sum += taps[k] * constrain()
+        vmlal.s8        q2,  d27, d18          // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
+//                              const uint16_t *tmp, int pri_strength,
+//                              int sec_strength, int dir, int damping,
+//                              int h, size_t edges);
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_neon
+.if \pri
+        movrel_local    r8,  pri_taps
+        and             r9,  r3,  #1
+        add             r8,  r8,  r9,  lsl #1
+.endif
+        movrel_local    r9,  directions\w
+        add             r5,  r9,  r5,  lsl #1
+        vmov.u8         d17, #7
+        vdup.8          d16, r6                // damping
+
+        vmov.8          d8[0], r3
+        vmov.8          d8[1], r4
+        vclz.i8         d8,  d8                // clz(threshold)
+        vsub.i8         d8,  d17, d8           // ulog2(threshold)
+        vqsub.u8        d8,  d16, d8           // shift = imax(0, damping - ulog2(threshold))
+        vneg.s8         d8,  d8                // -shift
+.if \sec
+        vdup.8          q6,  d8[1]
+.endif
+.if \pri
+        vdup.8          q5,  d8[0]
+.endif
+
+1:
+.if \w == 8
+        add             r12, r2,  #16
+        vld1.8          {d0}, [r2, :64]        // px
+        vld1.8          {d1}, [r12, :64]       // px
+.else
+        add             r12, r2,  #8
+        vld1.32         {d0[0]}, [r2, :32]     // px
+        add             r9,  r2,  #2*8
+        vld1.32         {d0[1]}, [r12, :32]    // px
+        add             r12, r12, #2*8
+        vld1.32         {d1[0]}, [r9, :32]     // px
+        vld1.32         {d1[1]}, [r12, :32]    // px
+.endif
+
+        vmov.u8         q1,  #0                // sum
+        vmov.u8         q2,  #0                // sum
+.if \min
+        vmov.u16        q3,  q0                // min
+        vmov.u16        q4,  q0                // max
+.endif
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrease for the second round.
+        // This is also used as loop counter.
+        mov             lr,  #2                // sec_taps[0]
+
+2:
+.if \pri
+        ldrsb           r9,  [r5]              // off1
+
+        load_px_8       d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+        add             r5,  r5,  #4           // +2*2
+        ldrsb           r9,  [r5]              // off2
+.endif
+
+.if \pri
+        ldrb            r12, [r8]              // *pri_taps
+        vdup.8          q7,  r3                // threshold
+
+        handle_pixel_8  q14, q15, q7,  q5,  r12, \min
+.endif
+
+.if \sec
+        load_px_8       d28, d29, d30, d31, \w
+
+        add             r5,  r5,  #8           // +2*4
+        ldrsb           r9,  [r5]              // off3
+
+        vdup.8          q7,  r4                // threshold
+
+        handle_pixel_8  q14, q15, q7,  q6,  lr,  \min
+
+        load_px_8       d28, d29, d30, d31, \w
+
+        handle_pixel_8  q14, q15, q7,  q6,  lr,  \min
+
+        sub             r5,  r5,  #11          // r5 -= 2*(2+4); r5 += 1;
+.else
+        add             r5,  r5,  #1           // r5 += 1
+.endif
+        subs            lr,  lr,  #1           // sec_tap-- (value)
+.if \pri
+        add             r8,  r8,  #1           // pri_taps++ (pointer)
+.endif
+        bne             2b
+
+        vshr.s16        q14, q1,  #15          // -(sum < 0)
+        vshr.s16        q15, q2,  #15          // -(sum < 0)
+        vadd.i16        q1,  q1,  q14          // sum - (sum < 0)
+        vadd.i16        q2,  q2,  q15          // sum - (sum < 0)
+        vrshr.s16       q1,  q1,  #4           // (8 + sum - (sum < 0)) >> 4
+        vrshr.s16       q2,  q2,  #4           // (8 + sum - (sum < 0)) >> 4
+        vaddw.u8        q1,  q1,  d0           // px + (8 + sum ...) >> 4
+        vaddw.u8        q2,  q2,  d1           // px + (8 + sum ...) >> 4
+        vqmovun.s16     d0,  q1
+        vqmovun.s16     d1,  q2
+.if \min
+        vmin.u8         q0,  q0,  q4
+        vmax.u8         q0,  q0,  q3           // iclip(px + .., min, max)
+.endif
+.if \w == 8
+        vst1.8          {d0}, [r0, :64], r1
+        add             r2,  r2,  #2*16        // tmp += 2*tmp_stride
+        subs            r7,  r7,  #2           // h -= 2
+        vst1.8          {d1}, [r0, :64], r1
+.else
+        vst1.32         {d0[0]}, [r0, :32], r1
+        add             r2,  r2,  #4*8         // tmp += 4*tmp_stride
+        vst1.32         {d0[1]}, [r0, :32], r1
+        subs            r7,  r7,  #4           // h -= 4
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+.endif
+
+        // Reset pri_taps and directions back to the original point
+        sub             r5,  r5,  #2
+.if \pri
+        sub             r8,  r8,  #2
+.endif
+
+        bgt             1b
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
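
For reference, a minimal scalar C sketch of the per-tap arithmetic that the handle_pixel_8 macro above carries out with NEON, and of the shift setup done with vclz/vsub/vqsub in the filter prologue. This is illustrative only, not the dav1d implementation; the helper names cdef_shift and cdef_constrain are made up for this sketch.

    #include <stdlib.h>

    /* shift = imax(0, damping - ulog2(threshold)); threshold == 0 handled
     * separately, matching the "// -shift" setup in the assembly above. */
    static int cdef_shift(int threshold, int damping) {
        if (!threshold) return 0;
        const int ulog2 = 31 - __builtin_clz((unsigned)threshold);
        const int shift = damping - ulog2;
        return shift < 0 ? 0 : shift;
    }

    /* constrain() = imax(imin(diff, clip), -clip), with
     * clip = imax(0, threshold - (abs(diff) >> shift)), as in the
     * vqsub.u8/vmin.u8/vneg.s8/vbsl sequence of handle_pixel_8. */
    static int cdef_constrain(int diff, int threshold, int shift) {
        if (!threshold) return 0;
        const int adiff = abs(diff);
        int clip = threshold - (adiff >> shift);
        if (clip < 0) clip = 0;                      /* vqsub.u8 saturates at 0 */
        const int lim = adiff < clip ? adiff : clip; /* imin(abs(diff), clip) */
        return diff < 0 ? -lim : lim;                /* reapply the sign of diff */
    }

The assembly then accumulates taps[k] * constrain() into a signed 16-bit sum and rounds it with (8 + sum - (sum < 0)) >> 4 before adding it back to px, as the comments in filter_func_8 note.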