author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit    36d22d82aa202bb199967e9512281e9a53db42c9
tree      105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/dav1d/src/arm
parent    Initial commit.
Adding upstream version 115.7.0esr. (upstream/115.7.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/arm')
53 files changed, 66422 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/arm/32/cdef.S b/third_party/dav1d/src/arm/32/cdef.S new file mode 100644 index 0000000000..4a0df6eac8 --- /dev/null +++ b/third_party/dav1d/src/arm/32/cdef.S @@ -0,0 +1,540 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +// n1 = s0/d0 +// w1 = d0/q0 +// n2 = s4/d2 +// w2 = d2/q1 +.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret + tst r7, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldrh r12, [\s1, #-2] + vldr \n1, [\s1] + vdup.16 d4, r12 + ldrh r12, [\s1, #\w] + vmov.16 d4[1], r12 + ldrh r12, [\s2, #-2] + vldr \n2, [\s2] + vmov.16 d4[2], r12 + ldrh r12, [\s2, #\w] + vmovl.u8 q0, d0 + vmov.16 d4[3], r12 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vstr s8, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s9, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s10, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s11, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldrh r12, [\s1, #-2] + vldr \n1, [\s1] + vdup.16 d4, r12 + ldrh r12, [\s2, #-2] + vldr \n2, [\s2] + vmovl.u8 q0, d0 + vmov.16 d4[1], r12 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vstr s8, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s9, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vldr \n1, [\s1] + ldrh r12, [\s1, #\w] + vldr \n2, [\s2] + vdup.16 d4, r12 + ldrh r12, [\s2, #\w] + vmovl.u8 q0, d0 + vmov.16 d4[1], r12 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s8, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s9, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vldr 
\n1, [\s1] + vldr \n2, [\s2] + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\w2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride +.endif +3: +.endm + +.macro load_n_incr dst, src, incr, w +.if \w == 4 + vld1.32 {\dst\()[0]}, [\src, :32], \incr +.else + vld1.8 {\dst\()}, [\src, :64], \incr +.endif +.endm + +// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, +// const pixel *const bottom, int h, +// enum CdefEdgeFlags edges); + +// n1 = s0/d0 +// w1 = d0/q0 +// n2 = s4/d2 +// w2 = d2/q1 +.macro padding_func w, stride, n1, w1, n2, w2, align +function cdef_padding\w\()_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + cmp r7, #0xf // fully edged + beq cdef_padding\w\()_edged_8bpc_neon + vmov.i16 q3, #0x8000 + tst r7, #4 // CDEF_HAVE_TOP + bne 1f + // !CDEF_HAVE_TOP + sub r12, r0, #2*(2*\stride+2) + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + b 3f +1: + // CDEF_HAVE_TOP + add r8, r4, r2 + sub r0, r0, #2*(2*\stride) + pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0 + + // Middle section +3: + tst r7, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vld1.16 {d2[]}, [r3, :16]! + ldrh r12, [r1, #\w] + load_n_incr d0, r1, r2, \w + subs r6, r6, #1 + vmov.16 d2[1], r12 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s4, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s5, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {d2[]}, [r3, :16]! + load_n_incr d0, r1, r2, \w + subs r6, r6, #1 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s4, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + b 3f +2: + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ldrh r12, [r1, #\w] + load_n_incr d0, r1, r2, \w + vdup.16 d2, r12 + subs r6, r6, #1 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s4, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + load_n_incr d0, r1, r2, \w + subs r6, r6, #1 + vmovl.u8 q0, d0 + vstr s12, [r0, #-4] + vst1.16 {\w1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + +3: + tst r7, #8 // CDEF_HAVE_BOTTOM + bne 1f + // !CDEF_HAVE_BOTTOM + sub r12, r0, #4 + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! 
+.endif + pop {r4-r8,pc} +1: + // CDEF_HAVE_BOTTOM + add r8, r5, r2 + pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1 +endfunc +.endm + +padding_func 8, 16, d0, q0, d2, q1, 128 +padding_func 4, 8, s0, d0, s4, d2, 64 + +// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, +// const pixel *const bottom, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func_edged w, stride, reg, align +function cdef_padding\w\()_edged_8bpc_neon + sub r0, r0, #(2*\stride) + + ldrh r12, [r4, #-2] + vldr \reg, [r4] + add r8, r4, r2 + strh r12, [r0, #-2] + ldrh r12, [r4, #\w] + vstr \reg, [r0] + strh r12, [r0, #\w] + + ldrh r12, [r8, #-2] + vldr \reg, [r8] + strh r12, [r0, #\stride-2] + ldrh r12, [r8, #\w] + vstr \reg, [r0, #\stride] + strh r12, [r0, #\stride+\w] + add r0, r0, #2*\stride + +0: + ldrh r12, [r3], #2 + vldr \reg, [r1] + str r12, [r0, #-2] + ldrh r12, [r1, #\w] + add r1, r1, r2 + subs r6, r6, #1 + vstr \reg, [r0] + str r12, [r0, #\w] + add r0, r0, #\stride + bgt 0b + + ldrh r12, [r5, #-2] + vldr \reg, [r5] + add r8, r5, r2 + strh r12, [r0, #-2] + ldrh r12, [r5, #\w] + vstr \reg, [r0] + strh r12, [r0, #\w] + + ldrh r12, [r8, #-2] + vldr \reg, [r8] + strh r12, [r0, #\stride-2] + ldrh r12, [r8, #\w] + vstr \reg, [r0, #\stride] + strh r12, [r0, #\stride+\w] + + pop {r4-r8,pc} +endfunc +.endm + +padding_func_edged 8, 16, d0, 64 +padding_func_edged 4, 8, s0, 32 + +tables + +filter 8, 8 +filter 4, 8 + +find_dir 8 + +.macro load_px_8 d11, d12, d21, d22, w +.if \w == 8 + add r6, r2, r9 // x + off + sub r9, r2, r9 // x - off + vld1.8 {\d11}, [r6] // p0 + add r6, r6, #16 // += stride + vld1.8 {\d21}, [r9] // p1 + add r9, r9, #16 // += stride + vld1.8 {\d12}, [r6] // p0 + vld1.8 {\d22}, [r9] // p1 +.else + add r6, r2, r9 // x + off + sub r9, r2, r9 // x - off + vld1.32 {\d11[0]}, [r6] // p0 + add r6, r6, #8 // += stride + vld1.32 {\d21[0]}, [r9] // p1 + add r9, r9, #8 // += stride + vld1.32 {\d11[1]}, [r6] // p0 + add r6, r6, #8 // += stride + vld1.32 {\d21[1]}, [r9] // p1 + add r9, r9, #8 // += stride + vld1.32 {\d12[0]}, [r6] // p0 + add r6, r6, #8 // += stride + vld1.32 {\d22[0]}, [r9] // p1 + add r9, r9, #8 // += stride + vld1.32 {\d12[1]}, [r6] // p0 + vld1.32 {\d22[1]}, [r9] // p1 +.endif +.endm +.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min +.if \min + vmin.u8 q3, q3, \s1 + vmax.u8 q4, q4, \s1 + vmin.u8 q3, q3, \s2 + vmax.u8 q4, q4, \s2 +.endif + vabd.u8 q8, q0, \s1 // abs(diff) + vabd.u8 q11, q0, \s2 // abs(diff) + vshl.u8 q9, q8, \shift // abs(diff) >> shift + vshl.u8 q12, q11, \shift // abs(diff) >> shift + vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) + vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) + vcgt.u8 q10, q0, \s1 // px > p0 + vcgt.u8 q13, q0, \s2 // px > p1 + vmin.u8 q9, q9, q8 // imin(abs(diff), clip) + vmin.u8 q12, q12, q11 // imin(abs(diff), clip) + vneg.s8 q8, q9 // -imin() + vneg.s8 q11, q12 // -imin() + vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip) + vdup.8 d18, \tap // taps[k] + vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip) + vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain() + vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain() + vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain() + vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain() +.endm + +// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int 
pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func_8 w, pri, sec, min, suffix +function cdef_filter\w\suffix\()_edged_neon +.if \pri + movrel_local r8, pri_taps + and r9, r3, #1 + add r8, r8, r9, lsl #1 +.endif + movrel_local r9, directions\w + add r5, r9, r5, lsl #1 + vmov.u8 d17, #7 + vdup.8 d16, r6 // damping + + vmov.8 d8[0], r3 + vmov.8 d8[1], r4 + vclz.i8 d8, d8 // clz(threshold) + vsub.i8 d8, d17, d8 // ulog2(threshold) + vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) + vneg.s8 d8, d8 // -shift +.if \sec + vdup.8 q6, d8[1] +.endif +.if \pri + vdup.8 q5, d8[0] +.endif + +1: +.if \w == 8 + add r12, r2, #16 + vld1.8 {d0}, [r2, :64] // px + vld1.8 {d1}, [r12, :64] // px +.else + add r12, r2, #8 + vld1.32 {d0[0]}, [r2, :32] // px + add r9, r2, #2*8 + vld1.32 {d0[1]}, [r12, :32] // px + add r12, r12, #2*8 + vld1.32 {d1[0]}, [r9, :32] // px + vld1.32 {d1[1]}, [r12, :32] // px +.endif + + vmov.u8 q1, #0 // sum + vmov.u8 q2, #0 // sum +.if \min + vmov.u16 q3, q0 // min + vmov.u16 q4, q0 // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov lr, #2 // sec_taps[0] + +2: +.if \pri + ldrsb r9, [r5] // off1 + + load_px_8 d28, d29, d30, d31, \w +.endif + +.if \sec + add r5, r5, #4 // +2*2 + ldrsb r9, [r5] // off2 +.endif + +.if \pri + ldrb r12, [r8] // *pri_taps + vdup.8 q7, r3 // threshold + + handle_pixel_8 q14, q15, q7, q5, r12, \min +.endif + +.if \sec + load_px_8 d28, d29, d30, d31, \w + + add r5, r5, #8 // +2*4 + ldrsb r9, [r5] // off3 + + vdup.8 q7, r4 // threshold + + handle_pixel_8 q14, q15, q7, q6, lr, \min + + load_px_8 d28, d29, d30, d31, \w + + handle_pixel_8 q14, q15, q7, q6, lr, \min + + sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; +.else + add r5, r5, #1 // r5 += 1 +.endif + subs lr, lr, #1 // sec_tap-- (value) +.if \pri + add r8, r8, #1 // pri_taps++ (pointer) +.endif + bne 2b + + vshr.s16 q14, q1, #15 // -(sum < 0) + vshr.s16 q15, q2, #15 // -(sum < 0) + vadd.i16 q1, q1, q14 // sum - (sum < 0) + vadd.i16 q2, q2, q15 // sum - (sum < 0) + vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 + vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4 + vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4 + vaddw.u8 q2, q2, d1 // px + (8 + sum ...) 
>> 4 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 +.if \min + vmin.u8 q0, q0, q4 + vmax.u8 q0, q0, q3 // iclip(px + .., min, max) +.endif +.if \w == 8 + vst1.8 {d0}, [r0, :64], r1 + add r2, r2, #2*16 // tmp += 2*tmp_stride + subs r7, r7, #2 // h -= 2 + vst1.8 {d1}, [r0, :64], r1 +.else + vst1.32 {d0[0]}, [r0, :32], r1 + add r2, r2, #4*8 // tmp += 4*tmp_stride + vst1.32 {d0[1]}, [r0, :32], r1 + subs r7, r7, #4 // h -= 4 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 +.endif + + // Reset pri_taps and directions back to the original point + sub r5, r5, #2 +.if \pri + sub r8, r8, #2 +.endif + + bgt 1b + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro filter_8 w +filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri +filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec +filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec +.endm + +filter_8 8 +filter_8 4 diff --git a/third_party/dav1d/src/arm/32/cdef16.S b/third_party/dav1d/src/arm/32/cdef16.S new file mode 100644 index 0000000000..d14525d720 --- /dev/null +++ b/third_party/dav1d/src/arm/32/cdef16.S @@ -0,0 +1,233 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +// r1 = d0/q0 +// r2 = d2/q1 +.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret + tst r7, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vldr s8, [\s1, #-4] + vld1.16 {\r1}, [\s1, :\align] + vldr s9, [\s1, #2*\w] + vldr s10, [\s2, #-4] + vld1.16 {\r2}, [\s2, :\align] + vldr s11, [\s2, #2*\w] + vstr s8, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s9, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s10, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s11, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vldr s8, [\s1, #-4] + vld1.16 {\r1}, [\s1, :\align] + vldr s9, [\s2, #-4] + vld1.16 {\r2}, [\s2, :\align] + vstr s8, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s9, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vld1.16 {\r1}, [\s1, :\align] + vldr s8, [\s1, #2*\w] + vld1.16 {\r2}, [\s2, :\align] + vldr s9, [\s2, #2*\w] + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s8, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s9, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {\r1}, [\s1, :\align] + vld1.16 {\r2}, [\s2, :\align] + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r8,pc} +.else + add r0, r0, #2*\stride +.endif +3: +.endm + +// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, +// const pixel *const bottom, int h, +// enum CdefEdgeFlags edges); + +// r1 = d0/q0 +// r2 = d2/q1 +.macro padding_func_16 w, stride, r1, r2, align +function cdef_padding\w\()_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + vmov.i16 q3, #0x8000 + tst r7, #4 // CDEF_HAVE_TOP + bne 1f + // !CDEF_HAVE_TOP + sub r12, r0, #2*(2*\stride+2) + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + b 3f +1: + // CDEF_HAVE_TOP + add r8, r4, r2 + sub r0, r0, #2*(2*\stride) + pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0 + + // Middle section +3: + tst r7, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vld1.32 {d2[]}, [r3, :32]! + vldr s5, [r1, #2*\w] + vld1.16 {\r1}, [r1, :\align], r2 + subs r6, r6, #1 + vstr s4, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s5, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.32 {d2[]}, [r3, :32]! 
+ vld1.16 {\r1}, [r1, :\align], r2 + subs r6, r6, #1 + vstr s4, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + b 3f +2: + tst r7, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vldr s4, [r1, #2*\w] + vld1.16 {\r1}, [r1, :\align], r2 + subs r6, r6, #1 + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s4, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {\r1}, [r1, :\align], r2 + subs r6, r6, #1 + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + +3: + tst r7, #8 // CDEF_HAVE_BOTTOM + bne 1f + // !CDEF_HAVE_BOTTOM + sub r12, r0, #4 + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + pop {r4-r8,pc} +1: + // CDEF_HAVE_BOTTOM + add r8, r5, r2 + pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1 +endfunc +.endm + +padding_func_16 8, 16, q0, q1, 128 +padding_func_16 4, 8, d0, d2, 64 + +tables + +filter 8, 16 +filter 4, 16 + +find_dir 16 diff --git a/third_party/dav1d/src/arm/32/cdef_tmpl.S b/third_party/dav1d/src/arm/32/cdef_tmpl.S new file mode 100644 index 0000000000..33ff9e5816 --- /dev/null +++ b/third_party/dav1d/src/arm/32/cdef_tmpl.S @@ -0,0 +1,515 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro dir_table w, stride +const directions\w + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 + .byte 1 * \stride + 0, 2 * \stride + 0 + .byte 1 * \stride + 0, 2 * \stride - 1 +// Repeated, to avoid & 7 + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 +endconst +.endm + +.macro tables +dir_table 8, 16 +dir_table 4, 8 + +const pri_taps + .byte 4, 2, 3, 3 +endconst +.endm + +.macro load_px d11, d12, d21, d22, w +.if \w == 8 + add r6, r2, r9, lsl #1 // x + off + sub r9, r2, r9, lsl #1 // x - off + vld1.16 {\d11,\d12}, [r6] // p0 + vld1.16 {\d21,\d22}, [r9] // p1 +.else + add r6, r2, r9, lsl #1 // x + off + sub r9, r2, r9, lsl #1 // x - off + vld1.16 {\d11}, [r6] // p0 + add r6, r6, #2*8 // += stride + vld1.16 {\d21}, [r9] // p1 + add r9, r9, #2*8 // += stride + vld1.16 {\d12}, [r6] // p0 + vld1.16 {\d22}, [r9] // p1 +.endif +.endm +.macro handle_pixel s1, s2, thresh_vec, shift, tap, min +.if \min + vmin.u16 q2, q2, \s1 + vmax.s16 q3, q3, \s1 + vmin.u16 q2, q2, \s2 + vmax.s16 q3, q3, \s2 +.endif + vabd.u16 q8, q0, \s1 // abs(diff) + vabd.u16 q11, q0, \s2 // abs(diff) + vshl.u16 q9, q8, \shift // abs(diff) >> shift + vshl.u16 q12, q11, \shift // abs(diff) >> shift + vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) + vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) + vsub.i16 q10, \s1, q0 // diff = p0 - px + vsub.i16 q13, \s2, q0 // diff = p1 - px + vneg.s16 q8, q9 // -clip + vneg.s16 q11, q12 // -clip + vmin.s16 q10, q10, q9 // imin(diff, clip) + vmin.s16 q13, q13, q12 // imin(diff, clip) + vdup.16 q9, \tap // taps[k] + vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip) + vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip) + vmla.i16 q1, q10, q9 // sum += taps[k] * constrain() + vmla.i16 q1, q13, q9 // sum += taps[k] * constrain() +.endm + +// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func w, bpc, pri, sec, min, suffix +function cdef_filter\w\suffix\()_\bpc\()bpc_neon +.if \bpc == 8 + cmp r8, #0xf + beq cdef_filter\w\suffix\()_edged_neon +.endif +.if \pri +.if \bpc == 16 + clz r9, r9 + sub r9, r9, #24 // -bitdepth_min_8 + neg r9, r9 // bitdepth_min_8 +.endif + movrel_local r8, pri_taps +.if \bpc == 16 + lsr r9, r3, r9 // pri_strength >> bitdepth_min_8 + and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1 +.else + and r9, r3, #1 +.endif + add r8, r8, r9, lsl #1 +.endif + movrel_local r9, directions\w + add r5, r9, r5, lsl #1 + vmov.u16 d17, #15 + vdup.16 d16, r6 // damping + +.if \pri + vdup.16 q5, r3 // threshold +.endif +.if \sec + vdup.16 q7, r4 // threshold +.endif + vmov.16 d8[0], r3 + vmov.16 d8[1], r4 + vclz.i16 d8, d8 // clz(threshold) + vsub.i16 d8, d17, d8 // ulog2(threshold) + vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) + vneg.s16 d8, d8 // -shift +.if \sec + vdup.16 q6, d8[1] +.endif +.if \pri + vdup.16 q4, d8[0] +.endif + +1: +.if \w == 8 + vld1.16 {q0}, 
[r2, :128] // px +.else + add r12, r2, #2*8 + vld1.16 {d0}, [r2, :64] // px + vld1.16 {d1}, [r12, :64] // px +.endif + + vmov.u16 q1, #0 // sum +.if \min + vmov.u16 q2, q0 // min + vmov.u16 q3, q0 // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov lr, #2 // sec_taps[0] + +2: +.if \pri + ldrsb r9, [r5] // off1 + + load_px d28, d29, d30, d31, \w +.endif + +.if \sec + add r5, r5, #4 // +2*2 + ldrsb r9, [r5] // off2 +.endif + +.if \pri + ldrb r12, [r8] // *pri_taps + + handle_pixel q14, q15, q5, q4, r12, \min +.endif + +.if \sec + load_px d28, d29, d30, d31, \w + + add r5, r5, #8 // +2*4 + ldrsb r9, [r5] // off3 + + handle_pixel q14, q15, q7, q6, lr, \min + + load_px d28, d29, d30, d31, \w + + handle_pixel q14, q15, q7, q6, lr, \min + + sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; +.else + add r5, r5, #1 // r5 += 1 +.endif + subs lr, lr, #1 // sec_tap-- (value) +.if \pri + add r8, r8, #1 // pri_taps++ (pointer) +.endif + bne 2b + + vshr.s16 q14, q1, #15 // -(sum < 0) + vadd.i16 q1, q1, q14 // sum - (sum < 0) + vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 + vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4 +.if \min + vmin.s16 q0, q0, q3 + vmax.s16 q0, q0, q2 // iclip(px + .., min, max) +.endif +.if \bpc == 8 + vmovn.u16 d0, q0 +.endif +.if \w == 8 + add r2, r2, #2*16 // tmp += tmp_stride + subs r7, r7, #1 // h-- +.if \bpc == 8 + vst1.8 {d0}, [r0, :64], r1 +.else + vst1.16 {q0}, [r0, :128], r1 +.endif +.else +.if \bpc == 8 + vst1.32 {d0[0]}, [r0, :32], r1 +.else + vst1.16 {d0}, [r0, :64], r1 +.endif + add r2, r2, #2*16 // tmp += 2*tmp_stride + subs r7, r7, #2 // h -= 2 +.if \bpc == 8 + vst1.32 {d0[1]}, [r0, :32], r1 +.else + vst1.16 {d1}, [r0, :64], r1 +.endif +.endif + + // Reset pri_taps and directions back to the original point + sub r5, r5, #2 +.if \pri + sub r8, r8, #2 +.endif + + bgt 1b + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro filter w, bpc +filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri +filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec +filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec + +function cdef_filter\w\()_\bpc\()bpc_neon, export=1 + push {r4-r9,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #92] + ldrd r6, r7, [sp, #100] +.if \bpc == 16 + ldrd r8, r9, [sp, #108] +.else + ldr r8, [sp, #108] +.endif + cmp r3, #0 // pri_strength + bne 1f + b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec +1: + cmp r4, #0 // sec_strength + bne 1f + b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri +1: + b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec +endfunc +.endm + +const div_table, align=4 + .short 840, 420, 280, 210, 168, 140, 120, 105 +endconst + +const alt_fact, align=4 + .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 +endconst + +.macro cost_alt dest, s1, s2, s3, s4, s5, s6 + vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n] + vmull.s16 q2, \s2, \s2 + vmull.s16 q3, \s3, \s3 + vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n] + vmull.s16 q12, \s5, \s5 + vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here + vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact + vmla.i32 q1, q2, q14 + vmla.i32 q1, q3, q15 + vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact + vmla.i32 q5, q12, q14 + vmla.i32 q5, q6, q15 + vadd.i32 d2, d2, d3 + vadd.i32 d3, d10, d11 + vpadd.i32 \dest, d2, d3 // *cost_ptr +.endm + +.macro find_best s1, s2, s3 +.ifnb \s2 + vmov.32 lr, \s2 +.endif + cmp r12, r1 // cost[n] > best_cost + itt gt + movgt 
r0, r3 // best_dir = n + movgt r1, r12 // best_cost = cost[n] +.ifnb \s2 + add r3, r3, #1 // n++ + cmp lr, r1 // cost[n] > best_cost + vmov.32 r12, \s3 + itt gt + movgt r0, r3 // best_dir = n + movgt r1, lr // best_cost = cost[n] + add r3, r3, #1 // n++ +.endif +.endm + +// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, +// unsigned *const var) +.macro find_dir bpc +function cdef_find_dir_\bpc\()bpc_neon, export=1 + push {lr} + vpush {q4-q7} +.if \bpc == 16 + clz r3, r3 // clz(bitdepth_max) + sub lr, r3, #24 // -bitdepth_min_8 +.endif + sub sp, sp, #32 // cost + mov r3, #8 + vmov.u16 q1, #0 // q0-q1 sum_diag[0] + vmov.u16 q3, #0 // q2-q3 sum_diag[1] + vmov.u16 q5, #0 // q4-q5 sum_hv[0-1] + vmov.u16 q8, #0 // q6,d16 sum_alt[0] + // q7,d17 sum_alt[1] + vmov.u16 q9, #0 // q9,d22 sum_alt[2] + vmov.u16 q11, #0 + vmov.u16 q10, #0 // q10,d23 sum_alt[3] + + +.irpc i, 01234567 +.if \bpc == 8 + vld1.8 {d30}, [r0, :64], r1 + vmov.u8 d31, #128 + vsubl.u8 q15, d30, d31 // img[x] - 128 +.else + vld1.16 {q15}, [r0, :128], r1 + vdup.16 q14, lr // -bitdepth_min_8 + vshl.u16 q15, q15, q14 + vmov.u16 q14, #128 + vsub.i16 q15, q15, q14 // img[x] - 128 +.endif + vmov.u16 q14, #0 + +.if \i == 0 + vmov q0, q15 // sum_diag[0] +.else + vext.8 q12, q14, q15, #(16-2*\i) + vext.8 q13, q15, q14, #(16-2*\i) + vadd.i16 q0, q0, q12 // sum_diag[0] + vadd.i16 q1, q1, q13 // sum_diag[0] +.endif + vrev64.16 q13, q15 + vswp d26, d27 // [-x] +.if \i == 0 + vmov q2, q13 // sum_diag[1] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q13, q13, q14, #(16-2*\i) + vadd.i16 q2, q2, q12 // sum_diag[1] + vadd.i16 q3, q3, q13 // sum_diag[1] +.endif + + vpadd.u16 d26, d30, d31 // [(x >> 1)] + vmov.u16 d27, #0 + vpadd.u16 d24, d26, d28 + vpadd.u16 d24, d24, d28 // [y] + vmov.u16 r12, d24[0] + vadd.i16 q5, q5, q15 // sum_hv[1] +.if \i < 4 + vmov.16 d8[\i], r12 // sum_hv[0] +.else + vmov.16 d9[\i-4], r12 // sum_hv[0] +.endif + +.if \i == 0 + vmov.u16 q6, q13 // sum_alt[0] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q14, q13, q14, #(16-2*\i) + vadd.i16 q6, q6, q12 // sum_alt[0] + vadd.i16 d16, d16, d28 // sum_alt[0] +.endif + vrev64.16 d26, d26 // [-(x >> 1)] + vmov.u16 q14, #0 +.if \i == 0 + vmov q7, q13 // sum_alt[1] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q13, q13, q14, #(16-2*\i) + vadd.i16 q7, q7, q12 // sum_alt[1] + vadd.i16 d17, d17, d26 // sum_alt[1] +.endif + +.if \i < 6 + vext.8 q12, q14, q15, #(16-2*(3-(\i/2))) + vext.8 q13, q15, q14, #(16-2*(3-(\i/2))) + vadd.i16 q9, q9, q12 // sum_alt[2] + vadd.i16 d22, d22, d26 // sum_alt[2] +.else + vadd.i16 q9, q9, q15 // sum_alt[2] +.endif +.if \i == 0 + vmov q10, q15 // sum_alt[3] +.elseif \i == 1 + vadd.i16 q10, q10, q15 // sum_alt[3] +.else + vext.8 q12, q14, q15, #(16-2*(\i/2)) + vext.8 q13, q15, q14, #(16-2*(\i/2)) + vadd.i16 q10, q10, q12 // sum_alt[3] + vadd.i16 d23, d23, d26 // sum_alt[3] +.endif +.endr + + vmov.u32 q15, #105 + + vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0] + vmlal.s16 q12, d9, d9 + vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1] + vmlal.s16 q13, d11, d11 + vadd.s32 d8, d24, d25 + vadd.s32 d9, d26, d27 + vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17) + vmul.i32 d8, d8, d30 // cost[2,6] *= 105 + + vrev64.16 q1, q1 + vrev64.16 q3, q3 + vext.8 q1, q1, q1, #10 // sum_diag[0][14-n] + vext.8 q3, q3, q3, #10 // sum_diag[1][14-n] + + vstr s16, [sp, #2*4] // cost[2] + vstr s17, [sp, #6*4] // cost[6] + + movrel_local r12, div_table + vld1.16 {q14}, [r12, :128] + + vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0] + vmull.s16 q12, 
d1, d1 + vmlal.s16 q5, d2, d2 + vmlal.s16 q12, d3, d3 + vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1] + vmull.s16 q1, d5, d5 + vmlal.s16 q0, d6, d6 + vmlal.s16 q1, d7, d7 + vmovl.u16 q13, d28 // div_table + vmovl.u16 q14, d29 + vmul.i32 q5, q5, q13 // cost[0] + vmla.i32 q5, q12, q14 + vmul.i32 q0, q0, q13 // cost[4] + vmla.i32 q0, q1, q14 + vadd.i32 d10, d10, d11 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1 + + movrel_local r12, alt_fact + vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105 + + vstr s0, [sp, #0*4] // cost[0] + vstr s1, [sp, #4*4] // cost[4] + + vmovl.u16 q13, d29 // div_table[2*m+1] + 105 + vmovl.u16 q14, d30 + vmovl.u16 q15, d31 + + cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3] + cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7] + vstr s28, [sp, #1*4] // cost[1] + vstr s29, [sp, #3*4] // cost[3] + + mov r0, #0 // best_dir + vmov.32 r1, d0[0] // best_cost + mov r3, #1 // n + + vstr s30, [sp, #5*4] // cost[5] + vstr s31, [sp, #7*4] // cost[7] + + vmov.32 r12, d14[0] + + find_best d14[0], d8[0], d14[1] + find_best d14[1], d0[1], d15[0] + find_best d15[0], d8[1], d15[1] + find_best d15[1] + + eor r3, r0, #4 // best_dir ^4 + ldr r12, [sp, r3, lsl #2] + sub r1, r1, r12 // best_cost - cost[best_dir ^ 4] + lsr r1, r1, #10 + str r1, [r2] // *var + + add sp, sp, #32 + vpop {q4-q7} + pop {pc} +endfunc +.endm diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S new file mode 100644 index 0000000000..d1f83efb98 --- /dev/null +++ b/third_party/dav1d/src/arm/32/filmgrain.S @@ -0,0 +1,2039 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r0, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r1, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r2, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r3, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r4, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r5, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r6, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r7, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r8, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r9, q0 + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 \r10, q0 +.endm + +.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 + vst1.16 {\r0, \r1, \r2, \r3}, [r0]! + vst1.16 {\r4, \r5, \r6, \r7}, [r0]! + vst1.16 {\r8, \r9}, [r0]! + vst1.16 {\r10[0]}, [r0]! 
+.endm + +.macro get_grain_row_44 r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r0, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r1, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r2, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r3, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r4, q0 + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + add r11, r3, r11, lsl #1 + read_rand r12, 11, 0 + vld1.16 {d0[2]}, [r11] + add r12, r3, r12, lsl #1 + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 \r5, q0 +.endm + +.macro store_grain_row_44 r0, r1, r2, r3, r4, r5 + vst1.16 {\r0, \r1, \r2, \r3}, [r0]! + vst1.16 {\r4, \r5}, [r0] + add r0, r0, #GRAIN_WIDTH-32 +.endm + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 d0, q0 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon + push {r0, lr} +.if \n == 1 + mov lr, #-128 +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + lsl r12, r12, #1 + vext.8 q0, q0, q0, #1 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mov lr, #-128 +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 + sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mov lr, #-128 +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.8 d1[7], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + vmull.s8 q2, d6, d28 + vmull.s8 q3, d7, d28 + vmull.s8 q4, d0, d27 + vmull.s8 q5, d1, d27 + + vaddl.s16 q0, d4, d8 + vaddl.s16 q2, d5, d9 + vaddl.s16 q4, d6, d10 + vaddl.s16 q5, d7, d11 + + vmull.s8 q3, d3, d29 + vmull.s8 
q1, d2, d29 + + vaddw.s16 q4, q4, d6 + vaddw.s16 q5, q5, d7 + vaddw.s16 q3, q2, d3 + vaddw.s16 q2, q0, d2 + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff +.ifc \lag\()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag\()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpaddl.s8 q0, q0 + vpaddl.s8 q1, q1 + vpaddl.s8 q6, q6 + vpaddl.s8 q7, q7 + vadd.i16 q0, q0, q6 + vadd.i16 q1, q1, q7 + vpop {q6-q7} + vrshrn.s16 d0, q0, #2 + vrshrn.s16 d1, q1, #2 +.endif +.ifc \type, uv_422 + vld1.8 {q0, q1}, [r11]! + vpaddl.s8 q0, q0 + vpaddl.s8 q1, q1 + vrshrn.s16 d0, q0, #1 + vrshrn.s16 d1, q1, #1 +.endif +.ifc \type, uv_444 + vld1.8 {q0}, [r11]! +.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff +.endif + vmull.s8 q1, d0, d13 + vmull.s8 q0, d1, d13 + vaddw.s16 q2, q2, d2 + vaddw.s16 q3, q3, d3 + vaddw.s16 q4, q4, d0 + vaddw.s16 q5, q5, d1 +.endif +.if \uv_layout && \elems == 16 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: + push {r11} +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vmovn.i16 d1, q0 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s8 r10, d1[5] +.endif +.ifnc \lag, lag1 + vmov.s8 r8, d1[6] +.endif + vmov.s8 r6, d1[7] + + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 + mov r1, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + vmov q1, q4 +.if \elems == 9 + mov r1, #1 + bl output_\lag\()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vmovn.i16 d2, q1 + vext.8 q0, q0, q1, #7 +.else + mov r1, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + vmov q1, q5 + +.ifc \edge, right + mov r1, #3 + bl output_\lag\()_neon + read_shift_rand r11, 11 + add r11, r3, r11, lsl #1 + vld1.16 {d2[0]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #1 +.else + mov r1, #4 + bl output_\lag\()_neon +.endif +.endif +.if \store + vst1.8 {q0}, [r0]! 
+.endif + pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag1_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + vmov q3, \mid + vext.8 q0, \left, \mid, #15 + vext.8 q1, \mid, \right, #1 + bl sum_\type\()_lag1_\edge\()_neon + vmov \dst, q0 +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH - 16 + sub lr, r0, #1*GRAIN_WIDTH - 16 + vld1.8 {q10}, [r12] // load top right + vld1.8 {q13}, [lr] + + vext.8 q6, q8, q9, #14 // top left, top mid + vdup.8 d14, d28[0] + vext.8 q8, q8, q9, #15 + vdup.8 d15, d28[1] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q2, d0, d12 + vaddl.s16 q3, d1, d13 + vaddl.s16 q4, d2, d16 + vaddl.s16 q5, d3, d17 + + vext.8 q6, q9, q10, #1 // top mid, top right + vdup.8 d14, d28[3] + vext.8 q8, q9, q10, #2 + vdup.8 d15, d28[4] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vext.8 q6, q11, q12, #14 // top left, top mid + vdup.8 d14, d28[5] + vext.8 q8, q11, q12, #15 + vdup.8 d15, d28[6] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vext.8 q6, q12, q13, #1 // top mid, top right + vdup.8 d14, d29[0] + vext.8 q8, q12, q13, #2 + vdup.8 d15, d29[1] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vdup.8 d14, d28[2] + vdup.8 d15, d28[7] + + vmull.s8 q0, d18, d14 + vmull.s8 q1, d19, d14 + vmull.s8 q6, d24, d15 + vmull.s8 q8, d25, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vmov q8, q9 + vmov q9, q10 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag2_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, 
#2*GRAIN_WIDTH + sub lr, r0, #1*GRAIN_WIDTH + vld1.8 {q9}, [r12] // load the previous block right above + vld1.8 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. + sub r12, r0, #3*GRAIN_WIDTH + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #13 + vext.8 q11, q11, q11, #13 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + sub r12, r0, #3*GRAIN_WIDTH + 3 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d20, d26[0] + vext.8 q9, q11, q12, #1 + vdup.8 d21, d26[1] + + vmull.s8 q0, d22, d20 + vmull.s8 q1, d23, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vext.8 q8, q11, q12, #2 + vdup.8 d20, d26[2] + vext.8 q9, q11, q12, #3 + vdup.8 d21, d26[3] + + vaddl.s16 q2, d0, d12 + vaddl.s16 q3, d1, d13 + vaddl.s16 q4, d2, d14 + vaddl.s16 q5, d3, d15 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #4 + vdup.8 d20, d26[4] + vext.8 q7, q11, q12, #5 + vdup.8 d21, d26[5] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + sub r12, r0, #2*GRAIN_WIDTH + 3 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #6 + vld1.8 {q11, q12}, [r12] + vdup.8 d20, d26[6] + vdup.8 d21, d26[7] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d22, d21 + vmull.s8 q7, d23, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #1 + vdup.8 d20, d27[0] + vext.8 q7, q11, q12, #2 + vdup.8 d21, d27[1] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #3 + vdup.8 d20, d27[2] + vext.8 q9, q11, q12, #4 + vdup.8 d21, d27[3] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + sub r12, r0, #1*GRAIN_WIDTH + 3 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #5 + vdup.8 d20, d27[4] + vext.8 q7, q11, q12, #6 + vdup.8 d21, d27[5] + + vld1.8 {q11, q12}, [r12] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + 
vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vdup.8 d20, d27[6] + vext.8 q9, q11, q12, #1 + vdup.8 d21, d27[7] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d22, d20 + vmull.s8 q1, d23, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #2 + vdup.8 d20, d28[0] + vext.8 q7, q11, q12, #3 + vdup.8 d21, d28[1] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #4 + vdup.8 d20, d28[2] + vext.8 q9, q11, q12, #5 + vdup.8 d21, d28[3] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #6 + vdup.8 d20, d28[4] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + + vaddw.s16 q2, q2, d0 + vaddw.s16 q3, q3, d1 + vaddw.s16 q4, q4, d2 + vaddw.s16 q5, q5, d3 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag3_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + push {r11,lr} +1: + get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 + subs r1, r1, #1 + store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 + bgt 1b + pop {r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r11,lr} +1: + get_grain_row_44 d16, d17, d18, d19, d20, d21 + subs r1, r1, #1 + store_grain_row_44 d16, d17, d18, d19, d20, d21 + bgt 1b + pop {r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.8 {q3}, [r11]! + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q8, q0, q15 + bl get_gaussian_neon + vrshl.s16 q9, q0, q15 + vqmovn.s16 d0, q8 + vqmovn.s16 d1, q9 + + vand q3, q3, q1 + vmull.s8 q2, d6, d22 + vmull.s8 q3, d7, d22 + vrshl.s16 q2, q2, q12 + vrshl.s16 q3, q3, q12 + vaddw.s8 q2, q2, d0 + vaddw.s8 q3, q3, d1 + vqmovn.s16 d4, q2 + vqmovn.s16 d5, q3 + vst1.8 {q2}, [r0]! + pop {r11,pc} +endfunc + +function get_grain_row_44_neon + push {r11,lr} + get_grain_row_44 d16, d17, d18, d19, d20, d21 + pop {r11,pc} +endfunc + +function add_uv_420_coeff_lag0_neon + vld1.16 {q2, q3}, [r11]! + vld1.16 {q4, q5}, [r12]! + vpaddl.s8 q2, q2 + vpaddl.s8 q3, q3 + vpaddl.s8 q4, q4 + vpaddl.s8 q5, q5 + vadd.i16 q2, q2, q4 + vadd.i16 q3, q3, q5 + vrshrn.s16 d4, q2, #2 + vrshrn.s16 d5, q3, #2 + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + vld1.16 {q2, q3}, [r11]! 
+ vpaddl.s8 q2, q2 + vpaddl.s8 q3, q3 + vrshrn.s16 d4, q2, #1 + vrshrn.s16 d5, q3, #1 + +add_coeff_lag0_start: + vand q3, q2, q1 + vmull.s8 q2, d6, d22 + vmull.s8 q3, d7, d22 + vrshl.s16 q2, q2, q12 + vrshl.s16 q3, q3, q12 + vaddw.s8 q2, q2, d0 + vaddw.s8 q3, q3, d1 + vqmovn.s16 d4, q2 + vqmovn.s16 d5, q3 + bx lr +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH + mov r1, r2 + mul r12, r12, lr +.endif + movrel r3, X(gaussian_sequence) + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, #FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. + it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.16 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vext.8 q13, q0, q1, #13 + vext.8 q14, q1, q0, #1 + vneg.s16 q12, q12 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 16 + vmov.i8 q1, #255 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 64 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #2 + vst1.16 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + sum_\type\()_lag1 q7, q8, q8, q9, left + sum_\type\()_lag1 q8, q8, q9, q10 + sum_\type\()_lag1 q9, q9, q10, q11 + sum_\type\()_lag1 q10, q10, q11, q12 + sum_\type\()_lag1 q12, q11, q12, q13, right + get_grain_2 d26 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26 + vmov q11, q10 + vmov q10, q9 + vmov q9, q8 + vmov q8, q7 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + vst1.16 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #127 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + vst1.16 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH-3 + mov r1, r2 + mul r12, r12, lr + + movrel r3, X(gaussian_sequence) + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
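+        // r10 ends up holding the per-plane seed xor constant:
+        // uv == 0 (r12 == 0) gives 0xb524, uv != 0 gives 0x49d8.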
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.16 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vext.8 q13, q0, q1, #13 + vext.8 q14, q1, q0, #7 + vneg.s16 q12, q12 + +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add r12, r11, #GRAIN_WIDTH +.endif + vmov q1, q13 + vmov q0, q8 + bl add_\type\()_coeff_lag0_neon + vmov.i8 q1, #255 + vmov q0, q9 + vmov q8, q2 + bl add_\type\()_coeff_lag0_neon + vmov.i8 q1, q14 + vmov q0, q10 + vmov q9, q2 + bl add_\type\()_coeff_lag0_neon + vmov q10, q2 + subs r1, r1, #1 + increment_y_ptr r11, \type + store_grain_row_44 d16, d17, d18, d19, d20, d21 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + sum_\type\()_lag1 q7, q8, q8, q9, left + sum_\type\()_lag1 q8, q8, q9, q10 + sum_\type\()_lag1 q10, q9, q10, q11, right + subs r1, r1, #1 + increment_y_ptr r11, \type + store_grain_row_44 d14, d15, d16, d17, d20, d21 + vmov q9, q8 + vmov q8, q7 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH-48 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #127 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH-48 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + vmov.u8 r11, \src1[0+\off] + vmov.u8 r12, \src2[0+\off] + add r11, r11, r3 + vmov.u8 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u8 r11, \src2[2+\off] + add lr, lr, r3 + 
vld1.8 {\dst2[0+\off]}, [r12] + vmov.u8 r12, \src1[4+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u8 lr, \src2[4+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u8 r11, \src1[6+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u8 r12, \src2[6+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 + gather_interleaved \dst1, \dst3, \src1, \src3, 0 + gather_interleaved \dst1, \dst3, \src1, \src3, 1 + gather_interleaved \dst2, \dst4, \src2, \src4, 0 + gather_interleaved \dst2, \dst4, \src2, \src4, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, 0 + gather_interleaved d8, d9, d0, d1, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH // grain_lut stride + + neg r4, r4 + vdup.16 q13, r4 // -scaling_shift + cmp r8, #0 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i8 q14, #16 + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #9 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.8 d14, d24[0] + vdup.8 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + bx r11 +endfunc 
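+
+// The four loop_<ox><oy> variants below differ only in whether grain from the
+// neighbouring block to the left (ox) and/or above (oy) is blended in using
+// the overlap coefficients (a weighted sum narrowed with a rounding shift by
+// 5). Per pixel they all compute, roughly (a sketch, not the exact C source):
+//   noise = round2(scaling[src[x]] * grain[y][x], scaling_shift);
+//   dst[x] = clip(src[x] + noise, min_value, max_value);
+// where the scaling[] lookups are done 32 pixels at a time by gather32_neon.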
+ +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.8 {d8}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.8 {q2, q3}, [r6], r9 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r8], r9 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r1, :128], r2 // src + vld1.8 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d4, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d4, q5, #5 +.endif + + vmull.s8 q4, d20, d15 + vmull.s8 q5, d21, d15 + vmull.s8 q8, d22, d15 + vmull.s8 q9, d23, d15 + vmlal.s8 q4, d4, d14 + vmlal.s8 q5, d5, d14 + vmlal.s8 q8, d6, d14 + vmlal.s8 q9, d7, d14 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q8, #5 + vqrshrn.s16 d23, q9, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif + + bl gather32_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q2, d8 // scaling + vmovl.u8 q3, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q2 // scaling * grain + vmul.i16 q9, q9, q3 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r7, r7, #1 +.if \oy + vdup.8 d14, d25[0] + vdup.8 d15, d25[1] +.endif + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + ldrd r8, r9, [sp, #116] // offsets, h + ldrd r10, r11, [sp, #124] // uv, is_id + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + add r10, r10, #FGD_UV_OFFSET + vld1.16 {d4[]}, [r12] // uv_luma_mult + vld1.16 {d4[2]}, [r10] // uv_offset + vld1.16 {d4[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg lr, lr // -scaling_shift + + cmp r12, #0 + vdup.16 q13, lr // -scaling_shift + + beq 1f + // clip + cmp r11, #0 + vmov.i8 q14, #16 + vmov.i8 q15, #240 + beq 2f + // is_id + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + mov r10, #GRAIN_WIDTH // grain_lut stride + + add r5, 
r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: + +.if \sy + vmov.i8 d6, #23 + vmov.i8 d7, #22 +.else + vmov.i8 d6, #27 + vmov.i8 d7, #17 +.endif + +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8, q9}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmull.s8 q6, d22, d7 + vmull.s8 q7, d23, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vmlal.s8 q6, d18, d6 + vmlal.s8 q7, d19, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q6, #5 + vqrshrn.s16 d23, q7, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if !\csfl + vld1.8 {q8, q9}, [r1, :128] // src + vmovl.u8 q4, d0 + vmovl.u8 q5, d1 + vmovl.u8 q6, d2 + 
vmovl.u8 q7, d3 + vmovl.u8 q0, d16 + vmovl.u8 q1, d17 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q6, q6, d4[0] + vmul.i16 q7, q7, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vmul.i16 q8, q8, d4[1] + vmul.i16 q9, q9, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vqadd.s16 q6, q6, q8 + vqadd.s16 q7, q7, q9 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vshr.s16 q6, q6, #6 + vshr.s16 q7, q7, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vadd.i16 q6, q6, q0 + vadd.i16 q7, q7, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 + vqmovun.s16 d2, q6 + vqmovun.s16 d3, q7 +.endif + + bl gather32_neon + + vld1.8 {q0, q1}, [r1, :128], r2 // src + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r9, r9, #1 +.if \oy + vdup.8 d6, d25[0] + vdup.8 d7, d25[1] +.endif + + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10}, [r5], r10 // grain_lut + vld1.8 {q11}, [r1, :128], r2 // src + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if \csfl + vrshrn.u16 d0, q0, #1 + vrshrn.u16 d1, q1, #1 +.else + vrshr.u16 q4, q0, #1 + vrshr.u16 q5, q1, #1 + 
vmovl.u8 q0, d22 + vmovl.u8 q1, d23 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 +.endif + + bl gather16_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + + vaddw.u8 q8, q8, d22 // *src + noise + vaddw.u8 q9, q9, d23 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + + vmax.u8 q0, q0, q14 + vmin.u8 q0, q0, q15 + + subs r9, r9, #1 +.if \oy + vswp d6, d7 +.endif + vst1.8 {q0}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S new file mode 100644 index 0000000000..6c36caceae --- /dev/null +++ b/third_party/dav1d/src/arm/32/filmgrain16.S @@ -0,0 +1,2137 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +function get_grain_4_neon + push {r11,lr} + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[2]}, [r11] + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon + push {r0, lr} +.if \n == 1 + mvn lr, r5 // grain_min = ~grain_max +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + lsl r12, r12, #1 + vext.8 q0, q0, q0, #2 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add 
r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 + sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.16 d1[3], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub r12, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + + vext.8 q0, q8, q9, #14 // top left, top mid + vext.8 q1, q9, q10, #2 // top left, top mid + + vmull.s16 q2, d18, d28 + vmlal.s16 q2, d0, d27 + vmlal.s16 q2, d2, d29 + vmull.s16 q3, d19, d28 + vmlal.s16 q3, d1, d27 + vmlal.s16 q3, d3, d29 + + vmov q8, q9 + vmov q9, q10 + + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff +.ifc \lag\()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag\()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d12, d12, d13 + vpadd.i16 d13, d14, d15 + vadd.i16 q0, q0, q6 + vpop {q6-q7} + vrshr.s16 q0, q0, #2 +.endif +.ifc \type, uv_422 + vld1.16 {q0, q1}, [r11]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vrshr.s16 q0, q0, #1 +.endif +.ifc \type, uv_444 + vld1.16 {q0}, [r11]! 
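+        // 4:4:4: the co-located luma grain needs no subsampling and is used
+        // as-is; it is accumulated with the luma->chroma coefficient below.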
+.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff + vmovl.s8 q6, d13 +.endif + vmlal.s16 q2, d0, d13 + vmlal.s16 q3, d1, d13 +.endif +.if \uv_layout && \elems == 8 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: + push {r11} +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s16 r10, d1[1] +.endif +.ifnc \lag, lag1 + vmov.s16 r8, d1[2] +.endif + vmov.s16 r6, d1[3] + + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 +.ifc \edge, right + mov r1, #3 + bl output_\lag\()_neon + read_shift_rand r12, 11 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r12] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #2 +.else + mov r1, #4 + bl output_\lag\()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #14 +.endif + vst1.16 {q0}, [r0]! 
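+        // The 8 grain values for this chunk (AR-filtered output, plus plain
+        // random lanes at the block edges) have been stored; return to the
+        // per-row loop in the caller.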
+ pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag1_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #1*GRAIN_WIDTH*2 + vld1.8 {q9}, [r12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH*2 - 16 + sub lr, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + vld1.16 {q13}, [lr] + + vdup.8 d10, d28[0] + vext.8 q0, q8, q9, #12 // top left, top mid + vdup.8 d12, d28[1] + vext.8 q1, q8, q9, #14 + vdup.8 d14, d28[3] + vext.8 q4, q9, q10, #2 // top mid, top right + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmull.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmull.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d28[4] + vext.8 q0, q9, q10, #4 // top mid, top right + vdup.8 d12, d28[5] + vext.8 q1, q11, q12, #12 // top left, top mid + vdup.8 d14, d28[6] + vext.8 q4, q11, q12, #14 + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d29[0] + vext.8 q0, q12, q13, #2 // top mid, top right + vdup.8 d12, d29[1] + vext.8 q1, q12, q13, #4 + + vdup.8 d14, d28[2] + vdup.8 d8, d28[7] + + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q4, d8 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d18, d14 + vmlal.s16 q2, d24, d8 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d19, d14 + vmlal.s16 q3, d25, d8 + + vmov q8, q9 + vmov q9, q10 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag2_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #2*GRAIN_WIDTH*2 + sub lr, r0, #1*GRAIN_WIDTH*2 + vld1.16 {q9}, [r12] // load the previous block right above + vld1.16 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. 
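+        // Load from the start of the row above instead of 3 columns to its
+        // left, and rotate the vectors into the layout sum_lag3_above_neon
+        // produces; the lanes built from the rotated-in data only feed
+        // outputs that the left-edge path replaces with plain random grain.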
+ sub r12, r0, #3*GRAIN_WIDTH*2 + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #10 + vext.8 q11, q11, q11, #10 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + movw r12, #(3*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d12, d26[0] + vext.8 q1, q11, q12, #2 + vdup.8 d14, d26[1] + vext.8 q4, q11, q12, #4 + vdup.8 d16, d26[2] + vext.8 q5, q11, q12, #6 + vdup.8 d18, d26[3] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + movw r12, #(2*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + + vmull.s16 q2, d22, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmull.s16 q3, d23, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d26[4] + vext.8 q0, q11, q12, #8 + vdup.8 d14, d26[5] + vext.8 q1, q11, q12, #10 + vdup.8 d16, d26[6] + vext.8 q4, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d18, d26[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d22, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d23, d18 + + vdup.8 d12, d27[0] + vext.8 q0, q11, q12, #2 + vdup.8 d14, d27[1] + vext.8 q1, q11, q12, #4 + vdup.8 d16, d27[2] + vext.8 q4, q11, q12, #6 + vdup.8 d18, d27[3] + vext.8 q5, q11, q12, #8 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d27[4] + vext.8 q0, q11, q12, #10 + vdup.8 d14, d27[5] + vext.8 q1, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d16, d27[6] + vdup.8 d18, d27[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vext.8 q5, q11, q12, #2 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d22, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d23, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[0] + vext.8 q0, q11, q12, #4 + vdup.8 d14, d28[1] + vext.8 q1, q11, q12, #6 + vdup.8 d16, d28[2] + vext.8 q4, q11, q12, #8 + vdup.8 d18, d28[3] + vext.8 q5, q11, q12, #10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[4] + vext.8 q0, q11, q12, #12 + vmovl.s8 q6, d12 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q3, d1, d12 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag3_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + push {r10-r11,lr} +1: + mov r10, #80 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + 
vst1.16 {q0}, [r0]! + bgt 2b + get_grain_2 d0 + subs r1, r1, #1 + vst1.32 {d0[0]}, [r0]! + bgt 1b + pop {r10-r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r10-r11,lr} +1: + mov r10, #40 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + vst1.16 {q0}, [r0]! + bgt 2b + get_grain_4 d0 + subs r1, r1, #1 + vst1.16 {d0}, [r0] + add r0, r0, #GRAIN_WIDTH*2-80 + bgt 1b + pop {r10-r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.16 {q3}, [r11]! +gen_grain_uv_lag0_8_start: + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 +gen_grain_uv_lag0_8_add: + vand q3, q3, q1 + vmull.s16 q2, d6, d22 + vmull.s16 q3, d7, d22 + vrshl.s32 q2, q2, q12 + vrshl.s32 q3, q3, q12 + vqmovn.s32 d4, q2 + vqmovn.s32 d5, q3 + vqadd.s16 q2, q2, q0 + vmin.s16 q2, q2, q9 + vmax.s16 q2, q2, q10 + vst1.16 {q2}, [r0]! + pop {r11,pc} +endfunc + +function gen_grain_uv_420_lag0_8_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2,q3}, [r11]! + vld1.16 {q4,q5}, [r12] + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d8, d8, d9 + vpadd.i16 d9, d10, d11 + vadd.i16 q2, q2, q4 + vrshr.s16 q3, q2, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + vld1.16 {q2,q3}, [r11]! + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vrshr.s16 q3, q2, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2}, [r11] + vld1.16 {q0}, [r12] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vpadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d0 + vrshr.s16 d6, d4, #2 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + vld1.16 {q2}, [r11] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vrshr.s16 d6, d4, #1 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + ldr r4, [sp, #36] + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH*2 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 +.else + clz lr, r2 +.endif + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, #FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr // bitdepth_min_8 + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #2 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #4 + vst1.32 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 +.ifc \type, uv_444 + vmovl.s8 q6, d13 +.endif + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_mid_neon // 48 + bl sum_\type\()_lag1_mid_neon // 56 + bl sum_\type\()_lag1_mid_neon // 64 + bl sum_\type\()_lag1_mid_neon // 72 + bl sum_\type\()_lag1_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! 
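+        // Ten chunks of 8 AR-filtered samples plus 2 plain random ones give
+        // 82 = GRAIN_WIDTH values per row; for uv_444, r11 advances by the
+        // matching 2 luma samples.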
+ bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_mid_neon // 48 + bl sum_\type\()_lag2_mid_neon // 56 + bl sum_\type\()_lag2_mid_neon // 64 + bl sum_\type\()_lag2_mid_neon // 72 + bl sum_\type\()_lag2_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_mid_neon // 48 + bl sum_\type\()_lag3_mid_neon // 56 + bl sum_\type\()_lag3_mid_neon // 64 + bl sum_\type\()_lag3_mid_neon // 72 + bl sum_\type\()_lag3_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + + ldr r4, [sp, #36] + mov r12, r3 + movw r11, #(3*GRAIN_WIDTH-3)*2 + mov lr, #28 + add r11, r1, r11 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 + + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #14 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_\type\()_lag0_8_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_\type\()_lag0_8_neon // 16 + bl gen_grain_\type\()_lag0_8_neon // 24 + bl gen_grain_\type\()_lag0_8_neon // 32 + bl gen_grain_\type\()_lag0_8_neon // 40 + vmov q1, q14 + bl gen_grain_\type\()_lag0_4_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 + vmovl.s8 q6, d13 + + set_height r1, \type +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off + vmov.u16 r11, \src1[0+\off] + vmov.u16 r12, \src3[0+\off] + add r11, r11, r3 + vmov.u16 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u16 r11, \src3[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u16 r12, \src2[0+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u16 lr, \src4[0+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u16 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u16 r12, \src4[2+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, d2, d3, 0 + gather_interleaved d8, d9, d0, d1, d2, d3, 1 + 
pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, lsl #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH*2 // grain_lut stride + ldr r10, [sp, #124] // bitdepth_max + + eor r4, r4, #15 // 15 - scaling_shift + vdup.16 q6, r10 // bitdepth_max + clz r10, r10 + vdup.16 q13, r4 // 15 - scaling_shift + rsb r10, r10, #24 // bitdepth_min_8 + cmp r8, #0 + vdup.16 q12, r10 // bitdepth_min_8 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i16 q14, #16 + vmov.i16 q15, #235 + vshl.s16 q14, q14, q12 + vshl.s16 q15, q15, q12 + b 2f +1: + // no clip + vmov.i16 q14, #0 + vmov q15, q6 +2: + vshr.u16 q6, q6, #1 // grain_max + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #18 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.16 d14, d24[0] + vdup.16 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + sub r2, r2, #32 // src_stride -= 32 + sub r9, r9, #32 // grain_stride -= 32 + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.16 {d0}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r6]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r8], r9 // grain_lut top old +.endif +.if \oy + vld1.16 {q4, q5}, [r6], r9 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r1, :128]! 
// src +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if !\ox && !\oy + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif +.if !\oy + vmvn.i16 q5, #0xf000 // 0x0fff +.endif + vld1.16 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vmlal.s16 q0, d16, d25 +.endif + +.if \oy +.if \ox + add r8, r8, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vmvn d0, d12 // grain_min + vqrshrn.s32 d4, q1, #5 + vmin.s16 d16, d16, d12 + vmin.s16 d4, d4, d12 + vmax.s16 d16, d16, d0 + vmax.s16 d4, d4, d0 +.endif + + vmull.s16 q0, d4, d14 + vmull.s16 q1, d5, d14 + vmull.s16 q2, d6, d14 + vmull.s16 q3, d7, d14 + vmlal.s16 q0, d16, d15 + vmlal.s16 q1, d17, d15 + vmlal.s16 q2, d18, d15 + vmlal.s16 q3, d19, d15 + vmull.s16 q8, d20, d15 + vmull.s16 q9, d21, d15 + vmull.s16 q10, d22, d15 + vmull.s16 q11, d23, d15 + vmlal.s16 q8, d8, d14 + vmlal.s16 q9, d9, d14 + vmlal.s16 q10, d10, d14 + vmlal.s16 q11, d11, d14 + vmvn q4, q6 // grain_min + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q6 + vmin.s16 q9, q1, q6 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 q10, q2, q6 + vmin.s16 q11, q3, q6 + vmax.s16 q8, q8, q4 + vmax.s16 q9, q9, q4 + vld1.16 {q2, q3}, [r1, :128], r2 // src + vmvn.i16 q5, #0xf000 // 0x0fff + vmax.s16 q10, q10, q4 + vmax.s16 q11, q11, q4 +.elseif \ox + vmvn d4, d12 // grain_min + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 d16, d16, d12 + vmax.s16 d16, d16, d4 + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q5 + vand q1, q1, q5 + vand q2, q2, q5 + vand q3, q3, q5 + + bl gather32_neon + +.if \ox || \oy + vpush {q6-q7} +.endif + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + +.if \ox || \oy + vpop {q6-q7} +.endif + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + + vst1.16 {q0, q1}, [r0, :128]! 
// dst + subs r7, r7, #1 +.if \oy + vdup.16 d14, d25[0] + vdup.16 d15, d25[1] +.endif + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r10, r11, [sp, #124] // uv, is_id + ldr r6, [sp, #136] // bitdepth_max + + clz r7, r6 + rsb r7, r7, #24 // bitdepth_min_8 + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset + vld1.16 {d30[]}, [r12] // uv_luma_mult + lsl r10, r10, r7 // uv_offset << bitdepth_min_8 + vld1.16 {d30[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + eor lr, lr, #15 // 15 - scaling_shift + + vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 + + cmp r12, #0 + vdup.16 q13, lr // 15 - scaling_shift + + beq 1f + // clip + cmp r11, #0 + mov r8, #16 + mov r9, #240 + lsl r8, r8, r7 + lsl r9, r9, r7 + beq 2f + // is_id + mov r9, #235 + lsl r9, r9, r7 + b 2f +1: + // no clip + mov r8, #0 + mov r9, r6 // bitdepth_max +2: + vmov.16 d30[3], r6 // bitdepth_max + vdup.16 d31, r8 // clip_min + + mov r10, #GRAIN_WIDTH*2 // grain_lut stride + +.if \sy + mov r6, #23 + mov r7, #22 +.else + mov r6, #27 + mov r7, #17 +.endif + vmov.16 d31[1], r9 // clip_max + + ldrd r8, r9, [sp, #116] // offsets, h + + add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + vmov.16 d31[2], r6 // overlap y [0] + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + vmov.16 d31[3], r7 // overlap y [1] + + add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for 
adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + sub r7, r7, #32 // luma_stride -= 32 + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + sub r2, r2, #32 // src_stride -= 32 + sub r10, r10, #32 // grain_stride -= 32 +.if \oy + mov r12, lr +.endif +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r8]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if \oy + vld1.16 {q4, q5}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif + vld1.16 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + add r11, r11, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vmull.s16 q8, d20, d29 + vmull.s16 q9, d21, d29 + vmull.s16 q10, d22, d29 + vmull.s16 q11, d23, d29 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vmlal.s16 q8, d8, d28 + vmlal.s16 q9, d9, d28 + vmlal.s16 q10, d10, d28 + vmlal.s16 q11, d11, d28 + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q7 + vmin.s16 q9, q1, q7 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q10, q2, q7 + vmin.s16 q11, q3, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 q10, q10, q6 + vmax.s16 q11, q11, q6 +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q4, q5}, [r1, :128]! // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d8, d29 + vmlal.s16 q7, d9, d29 + vmlal.s16 q0, d10, d29 + vmlal.s16 q1, d11, d29 + vld1.16 {q4, q5}, [r1, :128] // src + sub r1, r1, #32 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 + vmlal.s16 q0, d8, d29 + vmlal.s16 q1, d9, d29 + vmlal.s16 q2, d10, d29 + vmlal.s16 q3, d11, d29 + vdup.16 q14, d30[2] // uv_offset + vshrn.s32 d0, q0, #6 + vshrn.s32 d1, q1, #6 + vshrn.s32 d2, q2, #6 + vshrn.s32 d3, q3, #6 + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vadd.i16 q2, q0, q14 + vadd.i16 q3, q1, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmin.s16 q2, q2, q4 + vmin.s16 q3, q3, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 + vmax.s16 q2, q2, q5 + vmax.s16 q3, q3, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 + vand q2, q2, q14 + vand q3, q3, q14 +.endif + + bl gather32_neon + + vld1.16 {q0, q1}, [r1, :128]! // src + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vld1.16 {q2, q3}, [r1, :128], r2 // src + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + +.if \oy + vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmax.s16 q2, q2, q4 + vmax.s16 q3, q3, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + vmin.s16 q2, q2, q5 + vmin.s16 q3, q3, q5 + + vst1.16 {q0, q1}, [r0, :128]! 
// dst + + subs r9, r9, #1 +.if \oy + vmov.32 d31[1], lr // new coeffs for overlap y +.endif + + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if \oy + vld1.16 {q2, q3}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5], r10 // grain_lut +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d17, q1, #5 + vqrshrn.s32 d18, q2, #5 + vqrshrn.s32 d19, q3, #5 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q8, q8, q7 + vmin.s16 q9, q9, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vrshr.u16 q0, q0, #1 + vrshr.u16 q1, q1, #1 +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q2, q3}, [r1, :128], r2 // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d4, d29 + vmlal.s16 q7, d5, d29 + vmlal.s16 q0, d6, d29 + vmlal.s16 q1, d7, d29 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vdup.16 q14, d30[2] // uv_offset + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + vld1.16 {q2, q3}, [r1, :128], r2 // src + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 +.endif + + bl gather16_neon + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q2, q8 // *src + noise + vqadd.s16 q1, q3, q9 + +.if \oy + // Swap the two last coefficients of d31, place them first in d28 + vrev64.16 d28, d31 +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + + subs r9, r9, #1 +.if \oy + // Take the first two 16 bit coefficients of d28 and place them at the + // end of d31 + vtrn.32 d31, d28 +.endif + + vst1.16 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/ipred.S b/third_party/dav1d/src/arm/32/ipred.S new file mode 100644 index 0000000000..ff55d95d4a --- /dev/null +++ b/third_party/dav1d/src/arm/32/ipred.S @@ -0,0 +1,2937 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * Copyright © 2019, B Krishnan Iyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_128_8bpc_neon, export=1 + push {r4, lr} + ldr r4, [sp, #8] + clz r3, r3 + adr r2, L(ipred_dc_128_tbl) + sub r3, r3, #25 + ldr r3, [r2, r3, lsl #2] + vmov.i8 q0, #128 + add r2, r2, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r2 + + .align 2 +L(ipred_dc_128_tbl): + .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB +4: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4, pc} +8: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4, pc} +16: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vmov.i8 q1, #128 +32: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vmov.i8 q1, #128 + sub r1, r1, #32 +64: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_8bpc_neon, export=1 + push {r4, lr} + ldr lr, [sp, #8] + clz r3, r3 + adr r4, L(ipred_v_tbl) + sub r3, r3, #25 + ldr r3, [r4, r3, lsl #2] + add r2, r2, #1 + add r4, r4, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r4 + + .align 2 +L(ipred_v_tbl): + .word 640f - L(ipred_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_v_tbl) + CONFIG_THUMB +40: + vld1.32 {d0[]}, [r2] +4: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs lr, lr, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4, pc} +80: + vld1.8 {d0}, [r2] +8: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs lr, lr, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4, pc} +160: + vld1.8 {q0}, [r2] +16: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs lr, lr, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vld1.8 {q0, q1}, [r2] +32: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs lr, lr, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vld1.8 {q0, q1}, [r2]! + sub r1, r1, #32 + vld1.8 {q2, q3}, [r2] +64: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + subs lr, lr, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_8bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_h_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + sub r2, r2, #4 + mov lr, #-4 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_h_tbl): + .word 640f - L(ipred_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_h_tbl) + CONFIG_THUMB + .word 8f - L(ipred_h_tbl) + CONFIG_THUMB + .word 4f - L(ipred_h_tbl) + CONFIG_THUMB +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr + vst1.32 {d3[0]}, [r0, :32], r1 + vst1.32 {d2[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4-r5, pc} +8: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr + vst1.8 {d3}, [r0, :64], r1 + vst1.8 {d2}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d1}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4-r5, pc} +160: + add r2, r2, #3 + mov lr, #-1 +16: + vld1.8 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.8 {d2[], d3[]}, [r2], lr + vst1.8 {q0}, [r0, :128], r1 + vld1.8 {d4[], d5[]}, [r2], lr + vst1.8 {q1}, [r12, :128], r1 + vld1.8 {d6[], d7[]}, [r2], lr + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + add r2, r2, #3 + mov lr, #-1 + sub r1, r1, #16 +32: + vld1.8 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.8 {d2[], d3[]}, [r2], lr + vst1.8 {q0}, [r0, :128]! + vld1.8 {d4[], d5[]}, [r2], lr + vst1.8 {q1}, [r12, :128]! + vld1.8 {d6[], d7[]}, [r2], lr + vst1.8 {q0}, [r0, :128], r1 + vst1.8 {q1}, [r12, :128], r1 + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + add r2, r2, #3 + mov lr, #-1 + sub r1, r1, #48 +64: + vld1.8 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.8 {d2[], d3[]}, [r2], lr + vst1.8 {q0}, [r0, :128]! + vld1.8 {d4[], d5[]}, [r2], lr + vst1.8 {q1}, [r12, :128]! + vld1.8 {d6[], d7[]}, [r2], lr + vst1.8 {q0}, [r0, :128]! + vst1.8 {q1}, [r12, :128]! + vst1.8 {q0}, [r0, :128]! + vst1.8 {q1}, [r12, :128]! + vst1.8 {q0}, [r0, :128], r1 + vst1.8 {q1}, [r12, :128], r1 + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! + vst1.8 {q2}, [r0, :128]! + vst1.8 {q3}, [r12, :128]! 
+ vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_8bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_dc_top_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + add r2, r2, #1 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_top_tbl): + .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB +40: + vld1.32 {d0[]}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 d0, d0[0] +4: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 4b + pop {r4-r5, pc} +80: + vld1.8 {d0}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #3 + vdup.8 d0, d0[0] +8: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 8b + pop {r4-r5, pc} +160: + vld1.8 {d0, d1}, [r2] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] +16: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + vld1.8 {d0, d1, d2, d3}, [r2] + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d4, q0, #5 + vdup.8 q0, d4[0] + vdup.8 q1, d4[0] +32: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + vld1.8 {d0, d1, d2, d3}, [r2]! + vaddl.u8 q0, d0, d1 + vld1.8 {d4, d5, d6, d7}, [r2] + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q0, q1 + vadd.u16 q1, q2, q3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d18, q0, #6 + vdup.8 q0, d18[0] + vdup.8 q1, d18[0] + sub r1, r1, #32 +64: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_8bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + sub r2, r2, r4 + clz r3, r3 + clz lr, r4 + sub lr, lr, #25 + adr r5, L(ipred_dc_left_tbl) + sub r3, r3, #20 + ldr r3, [r5, r3, lsl #2] + ldr lr, [r5, lr, lsl #2] + add r3, r5, r3 + add r5, r5, lr + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_left_tbl): + .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + +L(ipred_dc_left_h4): + vld1.32 {d0[]}, [r2, :32] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w4): + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt L(ipred_dc_left_w4) + pop {r4-r5, pc} +L(ipred_dc_left_h8): + vld1.8 {d0}, [r2, :64] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #3 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w8): + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt L(ipred_dc_left_w8) + pop {r4-r5, pc} +L(ipred_dc_left_h16): + vld1.8 {d0, d1}, [r2, :128] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w16): + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt L(ipred_dc_left_w16) + pop {r4-r5, pc} +L(ipred_dc_left_h32): + vld1.8 {d0, d1, d2, d3}, [r2, :128] + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #5 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w32): + vmov.8 q1, q0 +1: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h64): + vld1.8 {d0, d1, d2, d3}, [r2, :128]! + vld1.8 {d4, d5, d6, d7}, [r2, :128] + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q0, q1 + vadd.u16 q1, q2, q3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshrn.u16 d0, q0, #6 + vdup.8 q0, d0[0] + bx r3 +L(ipred_dc_left_w64): + vmov.8 q1, q0 + sub r1, r1, #32 +1: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_8bpc_neon, export=1 + push {r4-r6, lr} + ldr r4, [sp, #16] + sub r2, r2, r4 + add lr, r3, r4 // width + height + clz r3, r3 + clz r12, r4 + vdup.16 q15, lr // width + height + adr r5, L(ipred_dc_tbl) + rbit lr, lr // rbit(width + height) + sub r3, r3, #20 // 25 leading bits, minus table offset 5 + sub r12, r12, #25 + clz lr, lr // ctz(width + height) + ldr r3, [r5, r3, lsl #2] + ldr r12, [r5, r12, lsl #2] + neg lr, lr // -ctz(width + height) + add r3, r5, r3 + add r5, r5, r12 + vshr.u16 q15, q15, #1 // (width + height) >> 1 + vdup.16 q14, lr // -ctz(width + height) + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_tbl): + .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB + +L(ipred_dc_h4): + vld1.32 {d0[]}, [r2, :32]! + vpaddl.u8 d0, d0 + add r2, r2, #1 + vpadd.u16 d0, d0 + bx r3 +L(ipred_dc_w4): + vld1.32 {d1[]}, [r2] + vadd.s16 d0, d0, d30 + vpaddl.u8 d1, d1 + vpadd.u16 d1, d1 + cmp r4, #4 + vadd.s16 d0, d0, d1 + vshl.u16 d0, d0, d28 + beq 1f + // h = 8/16 + movw lr, #(0x3334/2) + movw r5, #(0x5556/2) + cmp r4, #16 + it ne + movne lr, r5 + vdup.16 d30, lr + vqdmulh.s16 d0, d0, d30 +1: + vdup.8 d0, d0[0] +2: + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + subs r4, r4, #4 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[0]}, [r12, :32], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h8): + vld1.8 {d0}, [r2, :64]! + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + add r2, r2, #1 + vpadd.u16 d0, d0 + bx r3 +L(ipred_dc_w8): + vld1.8 {d2}, [r2] + vadd.s16 d0, d0, d30 + vpaddl.u8 d2, d2 + vpadd.u16 d2, d2 + vpadd.u16 d2, d2 + cmp r4, #8 + vadd.s16 d0, d0, d2 + vshl.u16 d0, d0, d28 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #(0x3334/2) + movw r5, #(0x5556/2) + it ne + movne lr, r5 + vdup.16 d24, lr + vqdmulh.s16 d0, d0, d24 +1: + vdup.8 d0, d0[0] +2: + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d0}, [r12, :64], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h16): + vld1.8 {d0, d1}, [r2, :128]! 
+ vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + add r2, r2, #1 + vpadd.u16 d0, d0 + bx r3 +L(ipred_dc_w16): + vld1.8 {d2, d3}, [r2] + vadd.s16 d0, d0, d30 + vaddl.u8 q1, d2, d3 + vadd.u16 d2, d2, d3 + vpadd.u16 d2, d2 + vpadd.u16 d2, d2 + cmp r4, #16 + vadd.s16 d0, d0, d2 + vshl.u16 d0, d0, d28 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #(0x3334/2) + movw r5, #(0x5556/2) + it ne + movne lr, r5 + vdup.16 d24, lr + vqdmulh.s16 d0, d0, d24 +1: + vdup.8 q0, d0[0] +2: + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1}, [r0, :128], r1 + vst1.8 {d0, d1}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h32): + vld1.8 {d0, d1, d2, d3}, [r2, :128]! + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + add r2, r2, #1 + vpadd.u16 d0, d0 + bx r3 +L(ipred_dc_w32): + vld1.8 {d2, d3, d4, d5}, [r2] + vadd.s16 d0, d0, d30 + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vadd.u16 q1, q1, q2 + vadd.u16 d2, d2, d3 + vpadd.u16 d2, d2 + vpadd.u16 d2, d2 + cmp r4, #32 + vadd.s16 d0, d0, d2 + vshl.u16 d4, d0, d28 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #(0x3334/2) + movw r5, #(0x5556/2) + it ne + movne lr, r5 + vdup.16 d24, lr + vqdmulh.s16 d4, d4, d24 +1: + vdup.8 q0, d4[0] + vdup.8 q1, d4[0] +2: + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h64): + vld1.8 {d0, d1, d2, d3}, [r2, :128]! + vaddl.u8 q0, d0, d1 + vld1.8 {d4, d5, d6, d7}, [r2, :128]! + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q0, q1 + vadd.u16 q1, q2, q3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + add r2, r2, #1 + vpadd.u16 d0, d0 + bx r3 +L(ipred_dc_w64): + vld1.8 {d2, d3, d4, d5}, [r2]! + vadd.s16 d0, d0, d30 + vaddl.u8 q2, d4, d5 + vaddl.u8 q1, d2, d3 + vadd.u16 d4, d4, d5 + vadd.u16 d2, d2, d3 + vld1.8 {d16, d17, d18, d19}, [r2] + vpadd.u16 d4, d4 + vpadd.u16 d2, d2 + vpadd.u16 d4, d4 + vpadd.u16 d2, d2 + vaddl.u8 q8, d16, d17 + vaddl.u8 q9, d18, d19 + vadd.u16 d16, d16, d17 + vadd.u16 d18, d18, d19 + vpadd.u16 d16, d16 + vpadd.u16 d18, d18 + vpadd.u16 d16, d16 + vpadd.u16 d18, d18 + vadd.u16 d2, d2, d4 + vadd.u16 d3, d16, d18 + cmp r4, #64 + vadd.s16 d0, d0, d2 + vadd.s16 d0, d0, d3 + vshl.u16 d18, d0, d28 + beq 1f + // h = 16/32 + movw lr, #(0x5556/2) + movt lr, #(0x3334/2) + and r5, r4, #31 + lsr lr, lr, r5 + vdup.16 d30, lr + vqdmulh.s16 d18, d18, d30 +1: + sub r1, r1, #32 + vdup.8 q0, d18[0] + vdup.8 q1, d18[0] +2: + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.8 {d0, d1, d2, d3}, [r0, :128]! + vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +endfunc + +// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_8bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + clz lr, r3 + adr r5, L(ipred_paeth_tbl) + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.8 {d4[], d5[]}, [r2] + add r8, r2, #1 + sub r2, r2, #4 + add r5, r5, lr + mov r7, #-4 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_paeth_tbl): + .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[], d7[]}, [r8] + vsubl.u8 q8, d6, d4 // top - topleft +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 + vzip.32 d0, d1 + vzip.32 d2, d3 + vaddw.u8 q9, q8, d0 + vaddw.u8 q10, q8, d2 + vqmovun.s16 d18, q9 // base + vqmovun.s16 d19, q10 + vmov d1, d2 + vabd.u8 q10, q3, q9 // tdiff + vabd.u8 q11, q2, q9 // tldiff + vabd.u8 q9, q0, q9 // ldiff + vmin.u8 q12, q10, q11 // min(tdiff, tldiff) + vcge.u8 q10, q11, q10 // tldiff >= tdiff + vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff + vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft + vbit q10, q0, q9 // ldiff <= min ? left : ... + vst1.32 {d21[1]}, [r0, :32], r1 + vst1.32 {d21[0]}, [r6, :32], r1 + subs r4, r4, #4 + vst1.32 {d20[1]}, [r0, :32], r1 + vst1.32 {d20[0]}, [r6, :32], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8] + vsubl.u8 q8, d6, d4 // top - topleft + vmov d7, d6 +8: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 + vaddw.u8 q9, q8, d0 + vaddw.u8 q10, q8, d1 + vaddw.u8 q11, q8, d2 + vaddw.u8 q12, q8, d3 + vqmovun.s16 d18, q9 // base + vqmovun.s16 d19, q10 + vqmovun.s16 d20, q11 + vqmovun.s16 d21, q12 + vabd.u8 q11, q3, q9 // tdiff + vabd.u8 q12, q3, q10 + vabd.u8 q13, q2, q9 // tldiff + vabd.u8 q14, q2, q10 + vabd.u8 q10, q1, q10 // ldiff + vabd.u8 q9, q0, q9 + vmin.u8 q15, q12, q14 // min(tdiff, tldiff) + vcge.u8 q12, q14, q12 // tldiff >= tdiff + vmin.u8 q14, q11, q13 // min(tdiff, tldiff) + vcge.u8 q11, q13, q11 // tldiff >= tdiff + vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff + vcge.u8 q9, q14, q9 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + vst1.8 {d25}, [r0, :64], r1 + vst1.8 {d24}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d23}, [r0, :64], r1 + vst1.8 {d22}, [r6, :64], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vld1.8 {d6}, [r8]! 
+ mov r12, r3 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3 +1: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 +2: + vsubl.u8 q8, d6, d4 // top - topleft + vmov d7, d6 + vaddw.u8 q9, q8, d0 + vaddw.u8 q10, q8, d1 + vaddw.u8 q11, q8, d2 + vaddw.u8 q12, q8, d3 + vqmovun.s16 d18, q9 // base + vqmovun.s16 d19, q10 + vqmovun.s16 d20, q11 + vqmovun.s16 d21, q12 + vabd.u8 q11, q3, q9 // tdiff + vabd.u8 q12, q3, q10 + vabd.u8 q13, q2, q9 // tldiff + vabd.u8 q14, q2, q10 + vabd.u8 q10, q1, q10 // ldiff + vabd.u8 q9, q0, q9 + vmin.u8 q15, q12, q14 // min(tdiff, tldiff) + vcge.u8 q12, q14, q12 // tldiff >= tdiff + vmin.u8 q14, q11, q13 // min(tdiff, tldiff) + vcge.u8 q11, q13, q11 // tldiff >= tdiff + vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff + vcge.u8 q9, q14, q9 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + subs r3, r3, #8 + vst1.8 {d25}, [r0, :64]! + vst1.8 {d24}, [r6, :64]! + vst1.8 {d23}, [r5, :64]! + vst1.8 {d22}, [lr, :64]! + ble 8f + vld1.8 {d6}, [r8]! + b 2b +8: + subs r4, r4, #4 + ble 9f + // End of horizontal loop, move pointers to next four rows + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + vld1.8 {d6}, [r8]! + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + pop {r4-r8, pc} +endfunc + +// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_8bpc_neon, export=1 + push {r4-r10, lr} + ldr r4, [sp, #32] + movrel r10, X(sm_weights) + add r12, r10, r4 + add r10, r10, r3 + clz r9, r3 + adr r5, L(ipred_smooth_tbl) + sub lr, r2, r4 + sub r9, r9, #25 + ldr r9, [r5, r9, lsl #2] + vld1.8 {d4[]}, [lr] // bottom + add r8, r2, #1 + add r5, r5, r9 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_tbl): + .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB + +40: + vld1.32 {d16[]}, [r8] // top + vld1.32 {d18[]}, [r10, :32] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vdup.8 q3, d16[3] // right + vsubl.u8 q8, d16, d4 // top-bottom + vmovl.u8 q9, d18 // weights_hor +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left + vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! 
// weights_ver + vshll.i8 q12, d6, #8 // right*256 + vshll.i8 q13, d6, #8 + vzip.32 d1, d0 // left, flipped + vzip.32 d3, d2 + vzip.32 d20, d21 // weights_ver + vzip.32 d22, d23 + vshll.i8 q14, d4, #8 // bottom*256 + vshll.i8 q15, d4, #8 + vsubl.u8 q0, d1, d6 // left-right + vsubl.u8 q1, d3, d6 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q0, q9 // (left flipped) + vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q15, q8, q11 + vhadd.u16 q12, q12, q14 + vhadd.u16 q13, q13, q15 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + vst1.32 {d24[0]}, [r0, :32], r1 + vst1.32 {d24[1]}, [r6, :32], r1 + subs r4, r4, #4 + vst1.32 {d25[0]}, [r0, :32], r1 + vst1.32 {d25[1]}, [r6, :32], r1 + bgt 4b + pop {r4-r10, pc} +80: + vld1.8 {d16}, [r8] // top + vld1.8 {d18}, [r10, :64] // weights_hor + sub r2, r2, #2 + mov r7, #-2 + vdup.8 q3, d16[7] // right + vsubl.u8 q8, d16, d4 // top-bottom + vmovl.u8 q9, d18 // weights_hor +8: + vld2.8 {d0[], d1[]}, [r2, :16], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vshll.i8 q12, d6, #8 // right*256 + vshll.i8 q13, d6, #8 + vshll.i8 q14, d4, #8 // bottom*256 + vshll.i8 q15, d4, #8 + vsubl.u8 q1, d0, d6 // left-right (left flipped) + vsubl.u8 q0, d1, d6 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q1, q9 + vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q15, q8, q11 + vhadd.u16 q12, q12, q14 + vhadd.u16 q13, q13, q15 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + subs r4, r4, #2 + vst1.8 {d24}, [r0, :64], r1 + vst1.8 {d25}, [r6, :64], r1 + bgt 8b + pop {r4-r10, pc} +160: +320: +640: + add lr, r2, r3 + sub r2, r2, #2 + mov r7, #-2 + vld1.8 {d6[], d7[]}, [lr] // right + sub r1, r1, r3 + mov r9, r3 + +1: + vld2.8 {d0[], d1[]}, [r2, :16], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vsubl.u8 q1, d0, d6 // left-right (left flipped) + vsubl.u8 q0, d1, d6 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 +2: + vld1.8 {d16}, [r8]! // top + vld1.8 {d18}, [r10, :64]! // weights_hor + vshll.i8 q12, d6, #8 // right*256 + vshll.i8 q13, d6, #8 + vmovl.u8 q9, d18 // weights_hor + vshll.i8 q14, d4, #8 // bottom*256 + vshll.i8 q15, d4, #8 + vsubl.u8 q8, d16, d4 // top-bottom + vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q1, q9 + vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q15, q8, q11 + vhadd.u16 q12, q12, q14 + vhadd.u16 q13, q13, q15 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + subs r3, r3, #8 + vst1.8 {d24}, [r0, :64]! + vst1.8 {d25}, [r6, :64]! 
+ bgt 2b + subs r4, r4, #2 + ble 9f + sub r8, r8, r9 + sub r10, r10, r9 + add r0, r0, r1 + add r6, r6, r1 + mov r3, r9 + b 1b +9: + pop {r4-r10, pc} +endfunc + +// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_8bpc_neon, export=1 + push {r4-r7, lr} + ldr r4, [sp, #20] + movrel r7, X(sm_weights) + add r7, r7, r4 + clz lr, r3 + adr r5, L(ipred_smooth_v_tbl) + sub r12, r2, r4 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.8 {d4[]}, [r12] // bottom + add r2, r2, #1 + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_v_tbl): + .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r2] // top + vsubl.u8 q3, d6, d4 // top-bottom +4: + vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver + vshll.i8 q10, d4, #8 // bottom*256 + vshll.i8 q11, d4, #8 + vzip.32 d16, d17 // weights_ver + vzip.32 d18, d19 + vmovl.u8 q8, d16 // weights_ver + vmovl.u8 q9, d18 + subs r4, r4, #4 + vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q11, q3, q9 + vrshrn.i16 d20, q10, #8 + vrshrn.i16 d21, q11, #8 + vst1.32 {d20[0]}, [r0, :32], r1 + vst1.32 {d20[1]}, [r6, :32], r1 + vst1.32 {d21[0]}, [r0, :32], r1 + vst1.32 {d21[1]}, [r6, :32], r1 + bgt 4b + pop {r4-r7, pc} +80: + vld1.8 {d6}, [r2] // top + vsubl.u8 q3, d6, d4 // top-bottom +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver + vshll.i8 q12, d4, #8 // bottom*256 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vmovl.u8 q8, d16 // weights_ver + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q13, q3, q9 + vmla.i16 q14, q3, q10 + vmla.i16 q15, q3, q11 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + vrshrn.i16 d26, q14, #8 + vrshrn.i16 d27, q15, #8 + vst1.8 {d24}, [r0, :64], r1 + vst1.8 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d26}, [r0, :64], r1 + vst1.8 {d27}, [r6, :64], r1 + bgt 8b + pop {r4-r7, pc} +160: +320: +640: + vpush {q4-q7} + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver + vmovl.u8 q4, d8 // weights_ver + vmovl.u8 q5, d10 + vmovl.u8 q6, d12 + vmovl.u8 q7, d14 +2: + vld1.8 {q3}, [r2]! // top + vshll.i8 q8, d4, #8 // bottom*256 + vshll.i8 q9, d4, #8 + vshll.i8 q10, d4, #8 + vshll.i8 q11, d4, #8 + vsubl.u8 q0, d6, d4 // top-bottom + vsubl.u8 q1, d7, d4 + vshll.i8 q12, d4, #8 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q9, q1, q4 + vmla.i16 q10, q0, q5 + vmla.i16 q11, q1, q5 + vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q13, q1, q6 + vmla.i16 q14, q0, q7 + vmla.i16 q15, q1, q7 + vrshrn.i16 d16, q8, #8 + vrshrn.i16 d17, q9, #8 + vrshrn.i16 d18, q10, #8 + vrshrn.i16 d19, q11, #8 + vrshrn.i16 d20, q12, #8 + vrshrn.i16 d21, q13, #8 + vrshrn.i16 d22, q14, #8 + vrshrn.i16 d23, q15, #8 + subs r3, r3, #16 + vst1.8 {q8}, [r0, :128]! + vst1.8 {q9}, [r6, :128]! 
+ vst1.8 {q10}, [r5, :128]! + vst1.8 {q11}, [lr, :128]! + bgt 2b + subs r4, r4, #4 + ble 9f + sub r2, r2, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r7, pc} +endfunc + +// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_8bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + movrel r8, X(sm_weights) + add r8, r8, r3 + clz lr, r3 + adr r5, L(ipred_smooth_h_tbl) + add r12, r2, r3 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.8 {d4[]}, [r12] // right + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_h_tbl): + .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r8, :32] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vmovl.u8 q3, d6 // weights_hor +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left + vshll.i8 q8, d4, #8 // right*256 + vshll.i8 q9, d4, #8 + vzip.32 d3, d2 // left, flipped + vzip.32 d1, d0 + vsubl.u8 q1, d3, d4 // left-right + vsubl.u8 q0, d1, d4 + subs r4, r4, #4 + vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor + vmla.i16 q9, q0, q3 + vrshrn.i16 d16, q8, #8 + vrshrn.i16 d17, q9, #8 + vst1.32 {d16[0]}, [r0, :32], r1 + vst1.32 {d16[1]}, [r6, :32], r1 + vst1.32 {d17[0]}, [r0, :32], r1 + vst1.32 {d17[1]}, [r6, :32], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8, :64] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vmovl.u8 q3, d6 // weights_hor +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left + vshll.i8 q12, d4, #8 // right*256 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vsubl.u8 q11, d22, d4 // left-right + vsubl.u8 q10, d20, d4 + vsubl.u8 q9, d18, d4 + vsubl.u8 q8, d16, d4 + vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q10, q3 // (left flipped) + vmla.i16 q14, q9, q3 + vmla.i16 q15, q8, q3 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + vrshrn.i16 d26, q14, #8 + vrshrn.i16 d27, q15, #8 + vst1.8 {d24}, [r0, :64], r1 + vst1.8 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d26}, [r0, :64], r1 + vst1.8 {d27}, [r6, :64], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vpush {q4-q7} + sub r2, r2, #4 + mov r7, #-4 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left + vsubl.u8 q4, d8, d4 // left-right + vsubl.u8 q5, d10, d4 + vsubl.u8 q6, d12, d4 + vsubl.u8 q7, d14, d4 +2: + vld1.8 {q1}, [r8, :128]! 
// weights_hor + vshll.i8 q8, d4, #8 // right*256 + vshll.i8 q9, d4, #8 + vshll.i8 q10, d4, #8 + vshll.i8 q11, d4, #8 + vmovl.u8 q0, d2 // weights_hor + vmovl.u8 q1, d3 + vshll.i8 q12, d4, #8 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor + vmla.i16 q9, q7, q1 // (left flipped) + vmla.i16 q10, q6, q0 + vmla.i16 q11, q6, q1 + vmla.i16 q12, q5, q0 + vmla.i16 q13, q5, q1 + vmla.i16 q14, q4, q0 + vmla.i16 q15, q4, q1 + vrshrn.i16 d16, q8, #8 + vrshrn.i16 d17, q9, #8 + vrshrn.i16 d18, q10, #8 + vrshrn.i16 d19, q11, #8 + vrshrn.i16 d20, q12, #8 + vrshrn.i16 d21, q13, #8 + vrshrn.i16 d22, q14, #8 + vrshrn.i16 d23, q15, #8 + subs r3, r3, #16 + vst1.8 {q8}, [r0, :128]! + vst1.8 {q9}, [r6, :128]! + vst1.8 {q10}, [r5, :128]! + vst1.8 {q11}, [lr, :128]! + bgt 2b + subs r4, r4, #4 + ble 9f + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height); +function ipred_filter_8bpc_neon, export=1 + push {r4-r8, lr} + movw r12, #511 + ldrd r4, r5, [sp, #24] + and r5, r5, r12 // 511 + movrel r6, X(filter_intra_taps) + lsl r5, r5, #6 + add r6, r6, r5 + vld1.8 {d20, d21, d22, d23}, [r6, :128]! + clz lr, r3 + adr r5, L(ipred_filter_tbl) + vld1.8 {d27, d28, d29}, [r6, :64] + sub lr, lr, #26 + ldr lr, [r5, lr, lsl #2] + vmovl.s8 q8, d20 + vmovl.s8 q9, d21 + add r5, r5, lr + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + add r6, r0, r1 + lsl r1, r1, #1 + vmovl.s8 q12, d27 + vmovl.s8 q13, d28 + vmovl.s8 q14, d29 + add r8, r2, #1 + sub r2, r2, #2 + mov r7, #-2 + bx r5 + + .align 2 +L(ipred_filter_tbl): + .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB + .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB + .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB + .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB + +40: + vld1.32 {d0[]}, [r8] // top (0-3) + vmovl.u8 q0, d0 // top (0-3) +4: + vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmovl.u8 q1, d2 // left (0-1) + topleft (2) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s16 d4, q2, #4 + subs r4, r4, #2 + vst1.32 {d4[0]}, [r0, :32], r1 + vmovl.u8 q0, d4 + vst1.32 {d4[1]}, [r6, :32], r1 + vmov d0, d1 // move top from [4-7] to [0-3] + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d0}, [r8] // top (0-7) + vmovl.u8 q0, d0 // top (0-7) +8: + vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmovl.u8 q1, d2 // left (0-1) + topleft (2) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d4, q2, #4 + vmovl.u8 
q1, d4 // first block, in 16 bit + vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) + vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6) + vqrshrun.s16 d5, q3, #4 + vzip.32 d4, d5 + subs r4, r4, #2 + vst1.8 {d4}, [r0, :64], r1 + vmovl.u8 q0, d5 + vst1.8 {d5}, [r6, :64], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: + vpush {q4-q5} + sub r1, r1, r3 + mov lr, r3 + +1: + vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2) + vmovl.u8 q0, d0 // left (0-1) + topleft (2) +2: + vld1.8 {q2}, [r8]! // top(0-15) + vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) + vmovl.u8 q1, d4 // top(0-7) + vmovl.u8 q2, d5 // top(8-15) + vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) + vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) + vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) + + vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) + vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) + vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d6, q3, #4 + vmovl.u8 q0, d6 // first block, in 16 bit + vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) + vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) + vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) + vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) + vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d7, q4, #4 + vmovl.u8 q0, d7 // second block, in 16 bit + vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) + vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) + vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1) + vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2) + vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d8, q5, #4 + vmovl.u8 q0, d8 // third block, in 16 bit + vmov.u8 r12, d5[6] + vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4) + vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0) + vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6) + vmov.8 d0[4], r12 + + subs r3, r3, #16 + vqrshrun.s16 d9, q15, #4 + + vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]! + vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]! + ble 8f + vmov.u8 r12, d9[7] + vmov.8 d0[0], r12 + vmov.u8 r12, d9[3] + vmov.8 d0[2], r12 + b 2b +8: + subs r4, r4, #2 + + ble 9f + sub r8, r6, lr + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + b 1b +9: + vpop {q4-q5} + pop {r4-r8, pc} +endfunc + +// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_8bpc_neon, export=1 + push {r4-r5, lr} + ldrd r4, r5, [sp, #12] + vld1.16 {q0}, [r2, :128] + clz lr, r4 + adr r12, L(pal_pred_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vmovn.i16 d0, q0 + add r12, r12, lr + add r2, r0, r1 + bx r12 + + .align 2 +L(pal_pred_tbl): + .word 640f - L(pal_pred_tbl) + CONFIG_THUMB + .word 320f - L(pal_pred_tbl) + CONFIG_THUMB + .word 160f - L(pal_pred_tbl) + CONFIG_THUMB + .word 80f - L(pal_pred_tbl) + CONFIG_THUMB + .word 40f - L(pal_pred_tbl) + CONFIG_THUMB + +40: + lsl r1, r1, #1 +4: + vld1.8 {q1}, [r3, :128]! 
+ subs r5, r5, #4 + vtbl.8 d2, {d0}, d2 + vtbl.8 d3, {d0}, d3 + vst1.32 {d2[0]}, [r0, :32], r1 + vst1.32 {d2[1]}, [r2, :32], r1 + vst1.32 {d3[0]}, [r0, :32], r1 + vst1.32 {d3[1]}, [r2, :32], r1 + bgt 4b + pop {r4-r5, pc} +80: + lsl r1, r1, #1 +8: + vld1.8 {q1, q2}, [r3, :128]! + subs r5, r5, #4 + vtbl.8 d2, {d0}, d2 + vtbl.8 d3, {d0}, d3 + vst1.8 {d2}, [r0, :64], r1 + vtbl.8 d4, {d0}, d4 + vst1.8 {d3}, [r2, :64], r1 + vtbl.8 d5, {d0}, d5 + vst1.8 {d4}, [r0, :64], r1 + vst1.8 {d5}, [r2, :64], r1 + bgt 8b + pop {r4-r5, pc} +160: + lsl r1, r1, #1 +16: + vld1.8 {q8, q9}, [r3, :128]! + subs r5, r5, #4 + vld1.8 {q10, q11}, [r3, :128]! + vtbl.8 d16, {d0}, d16 + vtbl.8 d17, {d0}, d17 + vtbl.8 d18, {d0}, d18 + vtbl.8 d19, {d0}, d19 + vtbl.8 d20, {d0}, d20 + vtbl.8 d21, {d0}, d21 + vst1.8 {q8}, [r0, :128], r1 + vtbl.8 d22, {d0}, d22 + vst1.8 {q9}, [r2, :128], r1 + vtbl.8 d23, {d0}, d23 + vst1.8 {q10}, [r0, :128], r1 + vst1.8 {q11}, [r2, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + lsl r1, r1, #1 +32: + vld1.8 {q8, q9}, [r3, :128]! + subs r5, r5, #2 + vld1.8 {q10, q11}, [r3, :128]! + vtbl.8 d16, {d0}, d16 + vtbl.8 d17, {d0}, d17 + vtbl.8 d18, {d0}, d18 + vtbl.8 d19, {d0}, d19 + vtbl.8 d20, {d0}, d20 + vtbl.8 d21, {d0}, d21 + vst1.8 {q8, q9}, [r0, :128], r1 + vtbl.8 d22, {d0}, d22 + vtbl.8 d23, {d0}, d23 + vst1.8 {q10, q11}, [r2, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #32 +64: + vld1.8 {q8, q9}, [r3, :128]! + subs r5, r5, #1 + vld1.8 {q10, q11}, [r3, :128]! + vtbl.8 d16, {d0}, d16 + vtbl.8 d17, {d0}, d17 + vtbl.8 d18, {d0}, d18 + vtbl.8 d19, {d0}, d19 + vtbl.8 d20, {d0}, d20 + vtbl.8 d21, {d0}, d21 + vst1.8 {q8, q9}, [r0, :128]! + vtbl.8 d22, {d0}, d22 + vtbl.8 d23, {d0}, d23 + vst1.8 {q10, q11}, [r0, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_128_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz lr, r3 + adr r12, L(ipred_cfl_128_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vmov.i16 q0, #128 // dc + vdup.i16 q1, r6 // alpha + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + +L(ipred_cfl_splat_w4): + vld1.16 {q2, q3}, [r5, :128]! + vmul.i16 q2, q2, q1 // diff = ac * alpha + vmul.i16 q3, q3, q1 + vshr.s16 q8, q2, #15 // sign = diff >> 15 + vshr.s16 q9, q3, #15 + vadd.i16 q2, q2, q8 // diff + sign + vadd.i16 q3, q3, q9 + vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshr.s16 q3, q3, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign()) + vqmovun.s16 d5, q3 + vst1.32 {d4[0]}, [r0, :32], r1 + vst1.32 {d4[1]}, [r6, :32], r1 + subs r4, r4, #4 + vst1.32 {d5[0]}, [r0, :32], r1 + vst1.32 {d5[1]}, [r6, :32], r1 + bgt L(ipred_cfl_splat_w4) + pop {r4-r8, pc} +L(ipred_cfl_splat_w8): + vld1.16 {q8, q9}, [r5, :128]! + vld1.16 {q10, q11}, [r5, :128]! 
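+        // In effect: dst = iclip_pixel(dc + apply_sign((abs(ac*alpha) + 32) >> 6, ac*alpha));
+        // the sign/add/vrshr sequence below implements the symmetric rounding.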
+ vmul.i16 q8, q8, q1 // diff = ac * alpha + vmul.i16 q9, q9, q1 + vmul.i16 q10, q10, q1 + vmul.i16 q11, q11, q1 + vshr.s16 q12, q8, #15 // sign = diff >> 15 + vshr.s16 q13, q9, #15 + vshr.s16 q14, q10, #15 + vshr.s16 q15, q11, #15 + vadd.i16 q8, q8, q12 // diff + sign + vadd.i16 q9, q9, q13 + vadd.i16 q10, q10, q14 + vadd.i16 q11, q11, q15 + vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshr.s16 q9, q9, #6 + vrshr.s16 q10, q10, #6 + vrshr.s16 q11, q11, #6 + vadd.i16 q8, q8, q0 // dc + apply_sign() + vadd.i16 q9, q9, q0 + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q0 + vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 + vst1.8 {d16}, [r0, :64], r1 + vst1.8 {d17}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d18}, [r0, :64], r1 + vst1.8 {d19}, [r6, :64], r1 + bgt L(ipred_cfl_splat_w8) + pop {r4-r8, pc} +L(ipred_cfl_splat_w16): + add r12, r5, r3, lsl #1 + sub r1, r1, r3 + mov lr, r3 +1: + vld1.16 {q8, q9}, [r5, :128]! + vmul.i16 q8, q8, q1 // diff = ac * alpha + vld1.16 {q10, q11}, [r12, :128]! + vmul.i16 q9, q9, q1 + vmul.i16 q10, q10, q1 + vmul.i16 q11, q11, q1 + vshr.s16 q12, q8, #15 // sign = diff >> 15 + vshr.s16 q13, q9, #15 + vshr.s16 q14, q10, #15 + vshr.s16 q15, q11, #15 + vadd.i16 q8, q8, q12 // diff + sign + vadd.i16 q9, q9, q13 + vadd.i16 q10, q10, q14 + vadd.i16 q11, q11, q15 + vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshr.s16 q9, q9, #6 + vrshr.s16 q10, q10, #6 + vrshr.s16 q11, q11, #6 + vadd.i16 q8, q8, q0 // dc + apply_sign() + vadd.i16 q9, q9, q0 + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q0 + vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 + subs r3, r3, #16 + vst1.16 {q8}, [r0, :128]! + vst1.16 {q9}, [r6, :128]! 
+ bgt 1b + subs r4, r4, #2 + add r5, r5, lr, lsl #1 + add r12, r12, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + bgt 1b + pop {r4-r8, pc} +endfunc + +// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_top_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz lr, r3 + adr r12, L(ipred_cfl_top_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vdup.16 q1, r6 // alpha + add r2, r2, #1 + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_cfl_top_tbl): + .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + +4: + vld1.32 {d0[]}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) +8: + vld1.8 {d0}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) +16: + vld1.8 {q0}, [r2] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +32: + vld1.8 {q2, q3}, [r2] + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q2, q3 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #5 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_left_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + sub r2, r2, r4 + clz lr, r3 + clz r8, r4 + adr r12, L(ipred_cfl_splat_tbl) + adr r7, L(ipred_cfl_left_tbl) + sub lr, lr, #26 + sub r8, r8, #26 + ldr lr, [r12, lr, lsl #2] + ldr r8, [r7, r8, lsl #2] + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r7, r7, r8 + add r6, r0, r1 + lsl r1, r1, #1 + bx r7 + + .align 2 +L(ipred_cfl_left_tbl): + .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + +L(ipred_cfl_left_h4): + vld1.32 {d0[]}, [r2, :32] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h8): + vld1.8 {d0}, [r2, :64] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h16): + vld1.8 {q0}, [r2, :128] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h32): + vld1.8 {q2, q3}, [r2, :128] + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q2, q3 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #5 + vdup.16 q0, d0[0] + bx r12 +endfunc + +// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + sub r2, r2, r4 + add r8, r3, r4 // width + height 
+ vdup.16 q1, r6 // alpha + clz lr, r3 + clz r6, r4 + vdup.16 d16, r8 // width + height + adr r7, L(ipred_cfl_tbl) + rbit r8, r8 // rbit(width + height) + sub lr, lr, #22 // 26 leading bits, minus table offset 4 + sub r6, r6, #26 + clz r8, r8 // ctz(width + height) + ldr lr, [r7, lr, lsl #2] + ldr r6, [r7, r6, lsl #2] + neg r8, r8 // -ctz(width + height) + add r12, r7, lr + add r7, r7, r6 + vshr.u16 d16, d16, #1 // (width + height) >> 1 + vdup.16 d17, r8 // -ctz(width + height) + add r6, r0, r1 + lsl r1, r1, #1 + bx r7 + + .align 2 +L(ipred_cfl_tbl): + .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB + +L(ipred_cfl_h4): + vld1.32 {d0[]}, [r2, :32]! + vpaddl.u8 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w4): + vld1.32 {d1[]}, [r2] + vadd.i16 d0, d0, d16 + vpaddl.u8 d1, d1 + vpadd.u16 d1, d1 + cmp r4, #4 + vadd.i16 d0, d0, d1 + vshl.u16 d0, d0, d17 + beq 1f + // h = 8/16 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + cmp r4, #16 + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + vld1.8 {d0}, [r2, :64]! + vpaddl.u8 d0, d0 + vpadd.i16 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w8): + vld1.8 {d1}, [r2] + vadd.i16 d0, d0, d16 + vpaddl.u8 d1, d1 + vpadd.i16 d1, d1 + vpadd.i16 d1, d1 + cmp r4, #8 + vadd.i16 d0, d0, d1 + vshl.u16 d0, d0, d17 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + vld1.8 {q0}, [r2, :128]! + vaddl.u8 q0, d0, d1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w16): + vld1.8 {q2}, [r2] + vadd.i16 d0, d0, d16 + vaddl.u8 q2, d4, d5 + vadd.i16 d4, d4, d5 + vpadd.i16 d4, d4 + vpadd.i16 d4, d4 + cmp r4, #16 + vadd.i16 d0, d0, d4 + vshl.u16 d0, d0, d17 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + vld1.8 {q2, q3}, [r2, :128]! 
+ vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w32): + vld1.8 {q2, q3}, [r2] + vadd.i16 d0, d0, d16 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q2, q2, q3 + vadd.i16 d4, d4, d5 + vpadd.i16 d4, d4 + vpadd.i16 d4, d4 + cmp r4, #32 + vadd.i16 d0, d0, d4 + vshl.u16 d0, d0, d17 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_420_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_420_tbl): + .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d2}, [r12, :64], r2 + vld1.8 {d1}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.i16 q0, q0, q1 + vshl.i16 q0, q0, #1 + subs r8, r8, #2 + vst1.16 {q0}, [r0, :128]! + vadd.i16 q8, q8, q0 + bgt 1b + cmp r4, #0 + vmov d0, d1 + vmov d2, d1 + vmov d3, d1 +L(ipred_cfl_ac_420_w4_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q8, q8, q1 + bgt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + vadd.i16 q0, q8, q9 + vadd.i16 q1, q10, q11 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + vadd.i32 q0, q1 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] +L(ipred_cfl_ac_420_w4_subtract_dc): +6: // Subtract dc from ac + vld1.16 {q0, q1}, [r0, :128] + subs r6, r6, #4 + vsub.i16 q0, q0, q8 + vsub.i16 q1, q1, q8 + vst1.16 {q0, q1}, [r0, :128]! + bgt 6b + pop {r4-r8, pc} + +L(ipred_cfl_ac_420_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q2, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov q0, q1 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d2}, [r12, :64], r2 + vld1.16 {d1}, [r1, :64], r2 + vld1.16 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.i16 q0, q0, q1 + vshl.i16 q0, q0, #1 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov q0, q1 + +L(ipred_cfl_ac_420_w8_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl r6, r6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr r7, L(ipred_cfl_ac_420_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_420_w16_tbl): + .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w16_wpad0): +1: // Copy and subsample input, without padding + vld1.8 {q0, q1}, [r1, :128], r2 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q12, q13}, [r1, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q12, q12 + vpaddl.u8 q13, q13 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + vshl.i16 q2, q12, #1 + vshl.i16 q3, q13, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): +1: // Copy and subsample input, padding 4 + vldr d2, [r1, #16] + vld1.8 {q0}, [r1, :128], r2 + vldr d6, [r12, #16] + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 d2, d2 + vldr d26, [r1, #16] + vpaddl.u8 q0, q0 + vld1.8 {q12}, [r1, :128], r2 + vpaddl.u8 d6, d6 + vldr d30, [r12, #16] + vpaddl.u8 q2, q2 + vld1.8 {q14}, [r12, :128], r2 + vpaddl.u8 d26, d26 + vpaddl.u8 q12, q12 + vpaddl.u8 d30, d30 + vpaddl.u8 q14, q14 + vadd.i16 d2, d2, d6 + vadd.i16 q0, q0, q2 + vadd.i16 d26, d26, d30 + vadd.i16 q12, q12, q14 + vshl.i16 d2, d2, #1 + vshl.i16 q0, q0, #1 + vshl.i16 d6, d26, #1 + vshl.i16 q2, q12, #1 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q2, q2, #1 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d4}, [r1, :64], r2 + vpaddl.u8 q0, q0 + vld1.8 {d5}, [r12, :64], r2 + vpaddl.u8 q2, q2 + vadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d5 + vshl.i16 d0, d0, #1 + vshl.i16 d4, d4, #1 + vdup.16 q1, d0[3] + vdup.16 q3, d4[3] + vdup.16 d1, d0[3] + vdup.16 d5, d4[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl r6, r6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc + +// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_422_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_422_tbl): + .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d2}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! 
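+        // 4:2:2 only subsamples horizontally: each vpaddl pair is scaled by 4
+        // (<<2), giving the same 3 fractional bits as the 4:2:0 path (two-row
+        // sum, <<1) and the 4:4:4 path (<<3).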
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d2}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vdup.16 d7, d3[3] + vmov d6, d3 + vdup.16 d5, d2[3] + vmov d4, d2 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr r7, L(ipred_cfl_ac_422_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_422_w16_tbl): + .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w16_wpad0): +1: // Copy and subsample input, without padding + vld1.8 {q0, q1}, [r1, :128], r2 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): +1: // Copy and subsample input, padding 4 + vldr d2, [r1, #16] + vld1.8 {q0}, [r1, :128], r2 + vldr d6, [r12, #16] + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 d2, d2 + vpaddl.u8 q0, q0 + vpaddl.u8 d6, d6 + vpaddl.u8 q2, q2 + vshl.i16 d2, d2, #2 + vshl.i16 q0, q0, #2 + vshl.i16 d6, d6, #2 + vshl.i16 q2, q2, #2 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q2, q2 + vshl.i16 q0, q0, #2 + vshl.i16 q2, q2, #2 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vshl.i16 q0, q0, #2 + vdup.16 q3, d1[3] + vdup.16 q1, d0[3] + vdup.16 d5, d1[3] + vmov d4, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) +endfunc + +// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_444_tbl) + sub r8, r8, #26 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_444_tbl): + .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + vld1.32 {d0[]}, [r1, :32], r2 + vld1.32 {d0[1]}, [r12, :32], r2 + vld1.32 {d2[]}, [r1, :32], r2 + vld1.32 {d2[1]}, [r12, :32], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q1, d2, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d2}, [r12, :64], r2 + vld1.16 {d4}, [r1, :64], r2 + vshll.u8 q0, d0, #3 + vld1.16 {d6}, [r12, :64], r2 + vshll.u8 q1, d2, #3 + vshll.u8 q2, d4, #3 + vshll.u8 q3, d6, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cmp r3, #0 + bne L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + vld1.8 {q1}, [r1, :128], r2 + vld1.8 {q3}, [r12, :128], r2 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q2, d6, #3 + vshll.u8 q3, d7, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d4}, [r12, :64], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q2, d4, #3 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
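+        // The horizontal padding replicates the last widened sample
+        // (d1[3]/d5[3]) across the padded right half of each row.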
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr r7, L(ipred_cfl_ac_444_w32_tbl) + ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_444_w32_tbl): + .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + vld1.8 {q2, q3}, [r1, :128], r2 + vld1.8 {q13, q14}, [r12, :128], r2 + vshll.u8 q0, d4, #3 + vshll.u8 q1, d5, #3 + vshll.u8 q2, d6, #3 + vshll.u8 q3, d7, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vshll.u8 q0, d28, #3 + vshll.u8 q1, d29, #3 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + vldr d4, [r1, #16] + vld1.8 {q1}, [r1, :128], r2 + vldr d28, [r12, #16] + vld1.8 {q13}, [r12, :128], r2 + vshll.u8 q2, d4, #3 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vshll.u8 q0, d28, #3 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vdup.16 q1, d1[3] + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + vld1.8 {q1}, [r1, :128], r2 + vld1.8 {q13}, [r12, :128], r2 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + vdup.16 q2, d3[3] + vdup.16 q3, d3[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vdup.16 q0, d27[3] + vdup.16 q1, d27[3] + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d24}, [r12, :64], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q12, d24, #3 + subs r8, r8, #2 + vdup.16 q1, d1[3] + vdup.16 q2, d1[3] + vdup.16 q3, d1[3] + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vdup.16 q13, d25[3] + vdup.16 q0, d25[3] + vdup.16 q1, d25[3] + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + +L(ipred_cfl_ac_444_w32_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #1 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl r6, r6, #3 + // Aggregate the sums, with wider intermediates earlier than in + // ipred_cfl_ac_420_w8_calc_subtract_dc. + vpaddl.u16 q0, q8 + vpaddl.u16 q1, q9 + vpaddl.u16 q2, q10 + vpaddl.u16 q3, q11 + vadd.i32 q0, q0, q1 + vadd.i32 q2, q2, q3 + vadd.i32 q0, q0, q2 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] + b L(ipred_cfl_ac_420_w4_subtract_dc) +endfunc diff --git a/third_party/dav1d/src/arm/32/ipred16.S b/third_party/dav1d/src/arm/32/ipred16.S new file mode 100644 index 0000000000..993d9500aa --- /dev/null +++ b/third_party/dav1d/src/arm/32/ipred16.S @@ -0,0 +1,3254 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, B Krishnan Iyer + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + push {r4, lr} + ldr r4, [sp, #8] + ldr r12, [sp, #24] + clz r3, r3 + adr r2, L(ipred_dc_128_tbl) + sub r3, r3, #25 + vdup.16 q0, r12 + ldr r3, [r2, r3, lsl #2] + add r12, r0, r1 + vrshr.u16 q0, q0, #1 + add r2, r2, r3 + lsl r1, r1, #1 + bx r2 + + .align 2 +L(ipred_dc_128_tbl): + .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4, pc} +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4, pc} +160: + vmov q1, q0 +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vmov q1, q0 + sub r1, r1, #32 +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vmov q1, q0 + sub r1, r1, #96 +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + push {r4, lr} + ldr lr, [sp, #8] + clz r3, r3 + adr r4, L(ipred_v_tbl) + sub r3, r3, #25 + ldr r3, [r4, r3, lsl #2] + add r2, r2, #2 + add r4, r4, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r4 + + .align 2 +L(ipred_v_tbl): + .word 640f - L(ipred_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_v_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r2] +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs lr, lr, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4, pc} +80: + vld1.16 {q0}, [r2] +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4, pc} +160: + vld1.16 {q0, q1}, [r2] +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vld1.16 {q0, q1}, [r2]! + sub r1, r1, #32 + vld1.16 {q2, q3}, [r2] +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vld1.16 {q0, q1}, [r2]! + sub r1, r1, #96 + vld1.16 {q2, q3}, [r2]! + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r2]! +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128]! + vst1.16 {d4, d5, d6, d7}, [r12, :128]! + subs lr, lr, #2 + vst1.16 {d16, d17, d18, d19}, [r0, :128]! + vst1.16 {d16, d17, d18, d19}, [r12, :128]! 
+ vst1.16 {d20, d21, d22, d23}, [r0, :128], r1 + vst1.16 {d20, d21, d22, d23}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_h_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + sub r2, r2, #2 + mov lr, #-2 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_h_tbl): + .word 640f - L(ipred_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_h_tbl) + CONFIG_THUMB + .word 8f - L(ipred_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_h_tbl) + CONFIG_THUMB +40: + sub r2, r2, #6 + mov lr, #-8 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr + vst1.16 {d3}, [r0, :64], r1 + vst1.16 {d2}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d1}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4-r5, pc} +8: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128], r1 + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128], r1 + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + sub r1, r1, #16 +16: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128]! + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + sub r1, r1, #48 +32: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128]! + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vst1.16 {q1}, [r12, :128]! + vst1.16 {q0}, [r0, :128]! + vst1.16 {q1}, [r12, :128]! + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #96 +64: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #2 + vld1.16 {d4[], d5[]}, [r2], lr + vmov q1, q0 + vmov q3, q2 + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! 
+ vst1.16 {q0, q1}, [r0, :128], r1 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_dc_top_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + add r2, r2, #2 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_top_tbl): + .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r2] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 d0, d0[0] +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4-r5, pc} +80: + vld1.16 {d0, d1}, [r2] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + vld1.16 {d0, d1, d2, d3}, [r2] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d4, d0, #4 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + vld1.16 {d0, d1, d2, d3}, [r2]! + vld1.16 {d4, d5, d6, d7}, [r2] + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d18, q0, #5 + vdup.16 q0, d18[0] + vdup.16 q1, d18[0] + sub r1, r1, #32 +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + vld1.16 {d0, d1, d2, d3}, [r2]! + vld1.16 {d4, d5, d6, d7}, [r2]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2]! + vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpadd.i32 d0, d0, d0 + vrshrn.i32 d18, q0, #6 + vdup.16 q0, d18[0] + vdup.16 q1, d18[0] + sub r1, r1, #96 +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + sub r2, r2, r4, lsl #1 + clz r3, r3 + clz lr, r4 + sub lr, lr, #25 + adr r5, L(ipred_dc_left_tbl) + sub r3, r3, #20 + ldr r3, [r5, r3, lsl #2] + ldr lr, [r5, lr, lsl #2] + add r3, r5, r3 + add r5, r5, lr + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_left_tbl): + .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + +L(ipred_dc_left_h4): + vld1.16 {d0}, [r2, :64] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w4): + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt L(ipred_dc_left_w4) + pop {r4-r5, pc} +L(ipred_dc_left_h8): + vld1.16 {d0, d1}, [r2, :128] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w8): + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt L(ipred_dc_left_w8) + pop {r4-r5, pc} +L(ipred_dc_left_h16): + vld1.16 {d0, d1, d2, d3}, [r2, :128] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w16): + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h32): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128] + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w32): + sub r1, r1, #32 + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h64): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2, :128]! 
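+        // Sum all 64 left-edge pixels; the vrshrn.i32 #6 below produces the
+        // rounded average, which is then broadcast across the whole row.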
+ vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpadd.i32 d0, d0, d0 + vrshrn.i32 d0, q0, #6 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w64): + sub r1, r1, #96 + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + push {r4-r6, lr} + ldr r4, [sp, #16] + sub r2, r2, r4, lsl #1 + add lr, r3, r4 // width + height + clz r3, r3 + clz r12, r4 + vdup.32 q15, lr // width + height + adr r5, L(ipred_dc_tbl) + rbit lr, lr // rbit(width + height) + sub r3, r3, #20 // 25 leading bits, minus table offset 5 + sub r12, r12, #25 + clz lr, lr // ctz(width + height) + ldr r3, [r5, r3, lsl #2] + ldr r12, [r5, r12, lsl #2] + neg lr, lr // -ctz(width + height) + add r3, r5, r3 + add r5, r5, r12 + vshr.u32 q15, q15, #1 // (width + height) >> 1 + vdup.32 q14, lr // -ctz(width + height) + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_tbl): + .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB + +L(ipred_dc_h4): + vld1.16 {d0}, [r2, :64]! + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w4): + vld1.16 {d2}, [r2] + vadd.i32 d0, d0, d30 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #4 + vadd.i32 d0, d0, d2 + vshl.u32 d0, d0, d28 + beq 1f + // h = 8/16 + cmp r4, #16 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d0, d0, d24 + vshr.u32 d0, d0, #17 +1: + vdup.16 d0, d0[0] +2: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h8): + vld1.16 {d0, d1}, [r2, :128]! 
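+        // Same scheme as the w4 case above: sum the left edge here, add the
+        // top-row sum in the wN handler, and for rectangular blocks finish the
+        // division by 3 or 5 via a 0xAAAB/0x6667 multiply and a >>17.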
+ vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w8): + vld1.16 {d2, d3}, [r2] + vadd.i32 d0, d0, d30 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #8 + vadd.i32 d0, d0, d2 + vshl.u32 d0, d0, d28 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d0, d0, d24 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] +2: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h16): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w16): + vld1.16 {d2, d3, d4, d5}, [r2] + vadd.i32 d0, d0, d30 + vadd.i16 q1, q1, q2 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d1 + vpaddl.u16 d2, d2 + cmp r4, #16 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h32): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w32): + vld1.16 {d2, d3, d4, d5}, [r2]! + vadd.i32 d0, d0, d30 + vld1.16 {d16, d17, d18, d19}, [r2] + vadd.i16 q1, q1, q2 + vadd.i16 q8, q8, q9 + vadd.i16 q1, q1, q8 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #32 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + sub r1, r1, #32 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +L(ipred_dc_h64): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2, :128]! + vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2, :128]! + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + add r2, r2, #2 + vpadd.i32 d0, d0, d0 + bx r3 +L(ipred_dc_w64): + vld1.16 {d2, d3, d4, d5}, [r2]! + vadd.i32 d0, d0, d30 + vld1.16 {d16, d17, d18, d19}, [r2]! + vadd.i16 q1, q1, q2 + vld1.16 {d20, d21, d22, d23}, [r2]! + vadd.i16 q8, q8, q9 + vld1.16 {d24, d25, d26, d27}, [r2]! 
+ vadd.i16 q10, q10, q11 + vadd.i16 q12, q12, q13 + vadd.i16 q1, q1, q8 + vadd.i16 q10, q10, q12 + vadd.i16 q1, q1, q10 + vadd.i16 d2, d2, d3 + vpaddl.u16 d2, d2 + vpadd.i32 d2, d2, d2 + cmp r4, #64 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 16/32 + cmp r4, #16 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + sub r1, r1, #96 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + push {r4-r6, lr} + vpush {q4} + ldr r4, [sp, #32] + clz lr, r3 + adr r12, L(ipred_paeth_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r2] + add r6, r2, #2 + sub r2, r2, #4 + add r12, r12, lr + mov r5, #-4 + add lr, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_paeth_tbl): + .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB + +40: + sub r2, r2, #4 + mov r5, #-8 + vld1.16 {d6}, [r6] + vsub.i16 d16, d6, d4 // top - topleft + vmov d7, d6 + vmov d17, d16 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5 + vadd.i16 q9, q8, q0 // base + vadd.i16 q10, q8, q1 + vabd.s16 q11, q3, q9 // tdiff + vabd.s16 q12, q3, q10 + vabd.s16 q13, q2, q9 // tldiff + vabd.s16 q14, q2, q10 + vabd.s16 q9, q0, q9 // ldiff + vabd.s16 q10, q1, q10 + vmin.u16 q15, q11, q13 // min(tdiff, tldiff) + vmin.u16 q4, q12, q14 + vcge.u16 q11, q13, q11 // tldiff >= tdiff + vcge.u16 q12, q14, q12 + vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff + vcge.u16 q10, q4, q10 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d24}, [lr, :64], r1 + subs r4, r4, #4 + vst1.16 {d23}, [r0, :64], r1 + vst1.16 {d22}, [lr, :64], r1 + bgt 4b + vpop {q4} + pop {r4-r6, pc} +80: +160: +320: +640: + vld1.16 {q3}, [r6]! + mov r12, r3 + sub r1, r1, r3, lsl #1 +1: + vld2.16 {d0[], d2[]}, [r2, :32], r5 + vmov d1, d0 + vmov d3, d2 +2: + vsub.i16 q8, q3, q2 // top - topleft + vadd.i16 q9, q8, q0 // base + vadd.i16 q10, q8, q1 + vabd.s16 q11, q3, q9 // tdiff + vabd.s16 q12, q3, q10 + vabd.s16 q13, q2, q9 // tldiff + vabd.s16 q14, q2, q10 + vabd.s16 q9, q0, q9 // ldiff + vabd.s16 q10, q1, q10 + vmin.u16 q15, q11, q13 // min(tdiff, tldiff) + vmin.u16 q4, q12, q14 + vcge.u16 q11, q13, q11 // tldiff >= tdiff + vcge.u16 q12, q14, q12 + vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff + vcge.u16 q10, q4, q10 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + subs r3, r3, #8 + vst1.16 {q12}, [r0, :128]! + vst1.16 {q11}, [lr, :128]! + ble 8f + vld1.16 {q3}, [r6]! 
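+        // (Paeth selection above: base = left + top - topleft; the candidate
+        // with the smallest absolute difference from base is chosen, preferring
+        // left, then top, then topleft.)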
+ b 2b +8: + subs r4, r4, #2 + ble 9f + // End of horizontal loop, move pointers to next two rows + sub r6, r6, r12, lsl #1 + add r0, r0, r1 + add lr, lr, r1 + vld1.16 {q3}, [r6]! + mov r3, r12 + b 1b +9: + vpop {q4} + pop {r4-r6, pc} +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_16bpc_neon, export=1 + push {r4-r10, lr} + ldr r4, [sp, #32] + movrel r10, X(sm_weights) + add r12, r10, r4 + add r10, r10, r3 + clz r9, r3 + adr r5, L(ipred_smooth_tbl) + sub lr, r2, r4, lsl #1 + sub r9, r9, #25 + ldr r9, [r5, r9, lsl #2] + vld1.16 {d4[], d5[]}, [lr] // bottom + add r8, r2, #2 + add r5, r5, r9 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_tbl): + .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB + +40: + vld1.16 {d16}, [r8] // top + vld1.32 {d18[]}, [r10, :32] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vdup.16 q3, d16[3] // right + vsub.i16 q8, q8, q2 // top-bottom + vmovl.u8 q9, d18 // weights_hor + vadd.i16 d19, d4, d6 // bottom+right +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left + vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver + vshll.u16 q12, d19, #8 // (bottom+right)*256 + vshll.u16 q13, d19, #8 + vshll.u16 q14, d19, #8 + vshll.u16 q15, d19, #8 + vzip.32 d20, d21 // weights_ver + vzip.32 d22, d23 + vsub.i16 q1, q1, q3 // left-right + vsub.i16 q0, q0, q3 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d2, d18 // (left flipped) + vmlal.s16 q14, d1, d18 + vmlal.s16 q15, d0, d18 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d16, d21 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d16, d23 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d27}, [r6, :64], r1 + bgt 4b + pop {r4-r10, pc} +80: + vld1.16 {q8}, [r8] // top + vld1.8 {d18}, [r10, :64] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vdup.16 q3, d17[3] // right + vsub.i16 q8, q8, q2 // top-bottom + vmovl.u8 q9, d18 // weights_hor + vadd.i16 d3, d4, d6 // bottom+right +8: + vld2.16 {d0[], d1[]}, [r2, :32], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! 
// weights_ver + vshll.u16 q12, d3, #8 // (bottom+right)*256 + vshll.u16 q13, d3, #8 + vshll.u16 q14, d3, #8 + vshll.u16 q15, d3, #8 + vsub.i16 q0, q0, q3 // left-right + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d1, d19 // (left flipped) + vmlal.s16 q14, d0, d18 + vmlal.s16 q15, d0, d19 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d17, d20 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d17, d22 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + subs r4, r4, #2 + vst1.16 {q12}, [r0, :128], r1 + vst1.16 {q13}, [r6, :128], r1 + bgt 8b + pop {r4-r10, pc} +160: +320: +640: + add lr, r2, r3, lsl #1 + sub r2, r2, #4 + mov r7, #-4 + vld1.16 {d6[], d7[]}, [lr] // right + sub r1, r1, r3, lsl #1 + mov r9, r3 + vadd.i16 d3, d4, d6 // bottom+right + +1: + vld2.16 {d0[], d1[]}, [r2, :32], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vsub.i16 q0, q0, q3 // left-right + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 +2: + vld1.8 {d18}, [r10, :64]! // weights_hor + vld1.16 {q8}, [r8]! // top + vshll.u16 q12, d3, #8 // (bottom+right)*256 + vshll.u16 q13, d3, #8 + vmovl.u8 q9, d18 // weights_hor + vshll.u16 q14, d3, #8 + vshll.u16 q15, d3, #8 + vsub.i16 q8, q8, q2 // top-bottom + vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d1, d19 // (left flipped) + vmlal.s16 q14, d0, d18 + vmlal.s16 q15, d0, d19 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d17, d20 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d17, d22 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + subs r3, r3, #8 + vst1.16 {q12}, [r0, :128]! + vst1.16 {q13}, [r6, :128]! + bgt 2b + subs r4, r4, #2 + ble 9f + sub r8, r8, r9, lsl #1 + sub r10, r10, r9 + add r0, r0, r1 + add r6, r6, r1 + mov r3, r9 + b 1b +9: + pop {r4-r10, pc} +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_16bpc_neon, export=1 + push {r4-r7, lr} + ldr r4, [sp, #20] + movrel r7, X(sm_weights) + add r7, r7, r4 + clz lr, r3 + adr r5, L(ipred_smooth_v_tbl) + sub r12, r2, r4, lsl #1 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r12] // bottom + add r2, r2, #2 + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_v_tbl): + .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + +40: + vld1.16 {d6}, [r2] // top + vsub.i16 d6, d6, d4 // top-bottom + vmov d7, d6 +4: + vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! 
// weights_ver + vzip.32 d16, d17 // weights_ver + vzip.32 d18, d19 + vshll.u8 q8, d16, #7 // weights_ver << 7 + vshll.u8 q9, d18, #7 + vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q11, q3, q9 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {d20}, [r0, :64], r1 + vst1.16 {d21}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d22}, [r0, :64], r1 + vst1.16 {d23}, [r6, :64], r1 + bgt 4b + pop {r4-r7, pc} +80: + vld1.16 {q3}, [r2] // top + vsub.i16 q3, q3, q2 // top-bottom +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver + vshll.u8 q8, d16, #7 // weights_ver << 7 + vshll.u8 q9, d18, #7 + vshll.u8 q10, d20, #7 + vshll.u8 q11, d22, #7 + vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q9, q3, q9 + vqrdmulh.s16 q10, q3, q10 + vqrdmulh.s16 q11, q3, q11 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {q8}, [r0, :128], r1 + vst1.16 {q9}, [r6, :128], r1 + subs r4, r4, #4 + vst1.16 {q10}, [r0, :128], r1 + vst1.16 {q11}, [r6, :128], r1 + bgt 8b + pop {r4-r7, pc} +160: +320: +640: + vpush {q4-q7} + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3, lsl #1 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver + vshll.u8 q4, d8, #7 // weights_ver << 7 + vshll.u8 q5, d10, #7 + vshll.u8 q6, d12, #7 + vshll.u8 q7, d14, #7 +2: + vld1.16 {q0, q1}, [r2]! // top + vsub.i16 q0, q0, q2 // top-bottom + vsub.i16 q1, q1, q2 + vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q9, q1, q4 + vqrdmulh.s16 q10, q0, q5 + vqrdmulh.s16 q11, q1, q5 + vqrdmulh.s16 q12, q0, q6 + vqrdmulh.s16 q13, q1, q6 + vqrdmulh.s16 q14, q0, q7 + vqrdmulh.s16 q15, q1, q7 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q2 + vadd.i16 q14, q14, q2 + vadd.i16 q15, q15, q2 + subs r3, r3, #16 + vst1.16 {q8, q9}, [r0, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r5, :128]! + vst1.16 {q14, q15}, [lr, :128]! 
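+ // 16 pixels stored to each of the four rows; loop over the remaining width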
+ bgt 2b + subs r4, r4, #4 + ble 9f + sub r2, r2, r12, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r7, pc} +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_16bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + movrel r8, X(sm_weights) + add r8, r8, r3 + clz lr, r3 + adr r5, L(ipred_smooth_h_tbl) + add r12, r2, r3, lsl #1 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r12] // right + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_h_tbl): + .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r8, :32] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vshll.u8 q3, d6, #7 // weights_hor << 7 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left + vsub.i16 q0, q0, q2 // left-right + vsub.i16 q1, q1, q2 + subs r4, r4, #4 + vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q0, q3 // (left flipped) + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vst1.16 {d17}, [r0, :64], r1 + vst1.16 {d16}, [r6, :64], r1 + vst1.16 {d19}, [r0, :64], r1 + vst1.16 {d18}, [r6, :64], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8, :64] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vshll.u8 q3, d6, #7 // weights_hor << 7 +8: + vld1.16 {d23}, [r2, :64], r7 // left + subs r4, r4, #4 + vsub.i16 d23, d23, d4 // left-right + vdup.16 q8, d23[3] // flip left + vdup.16 q9, d23[2] + vdup.16 q10, d23[1] + vdup.16 q11, d23[0] + vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q9, q3 + vqrdmulh.s16 q10, q10, q3 + vqrdmulh.s16 q11, q11, q3 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {q8}, [r0, :128], r1 + vst1.16 {q9}, [r6, :128], r1 + vst1.16 {q10}, [r0, :128], r1 + vst1.16 {q11}, [r6, :128], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vpush {q4-q7} + sub r2, r2, #8 + mov r7, #-8 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3, lsl #1 + mov r12, r3 + +1: + vld1.16 {d15}, [r2, :64], r7 // left + vsub.i16 d15, d15, d4 // left-right + vdup.16 q4, d15[3] // flip left + vdup.16 q5, d15[2] + vdup.16 q6, d15[1] + vdup.16 q7, d15[0] +2: + vld1.8 {q1}, [r8, :128]! // weights_hor + subs r3, r3, #16 + vshll.u8 q0, d2, #7 // weights_hor << 7 + vshll.u8 q1, d3, #7 + vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q1, q4 + vqrdmulh.s16 q10, q0, q5 + vqrdmulh.s16 q11, q1, q5 + vqrdmulh.s16 q12, q0, q6 + vqrdmulh.s16 q13, q1, q6 + vqrdmulh.s16 q14, q0, q7 + vqrdmulh.s16 q15, q1, q7 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q2 + vadd.i16 q14, q14, q2 + vadd.i16 q15, q15, q2 + vst1.16 {q8, q9}, [r0, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r5, :128]! + vst1.16 {q14, q15}, [lr, :128]! 
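+ // Same four-row layout as smooth_v; continue until the 16/32/64 wide row is done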
+ bgt 2b + subs r4, r4, #4 + ble 9f + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon, export=1 + movw r12, #511 + ldrd r4, r5, [sp, #88] + and r5, r5, r12 // 511 + movrel r6, X(filter_intra_taps) + lsl r5, r5, #6 + add r6, r6, r5 + vld1.8 {d20, d21, d22, d23}, [r6, :128]! + clz lr, r3 + adr r5, L(ipred_filter\bpc\()_tbl) + vld1.8 {d27, d28, d29}, [r6, :64] + sub lr, lr, #26 + ldr lr, [r5, lr, lsl #2] + vmovl.s8 q8, d20 + vmovl.s8 q9, d21 + add r5, r5, lr + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + add r6, r0, r1 + lsl r1, r1, #1 + vmovl.s8 q12, d27 + vmovl.s8 q13, d28 + vmovl.s8 q14, d29 + mov r7, #-4 + vdup.16 q15, r8 + add r8, r2, #2 + sub r2, r2, #4 +.if \bpc == 10 + vmov.i16 q7, #0 +.endif + bx r5 + + .align 2 +L(ipred_filter\bpc\()_tbl): + .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r8] // top (0-3) +4: + vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) +.if \bpc == 10 + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vrshr.s16 q2, q2, #4 + vmax.s16 q2, q2, q7 +.else + vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) + vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s32 d4, q2, #4 + vqrshrun.s32 d5, q3, #4 +.endif + vmin.s16 q2, q2, q15 + subs r4, r4, #2 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r6, :64], r1 + vmov d0, d5 // move top from [4-7] to [0-3] + bgt 4b + vpop {q4-q7} + pop {r4-r8, pc} +80: + vld1.16 {q0}, [r8] // top (0-7) +8: + vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) +.if \bpc == 10 + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) + vrshr.s16 
q2, q2, #4 + vmax.s16 q2, q2, q7 + vmin.s16 q2, q2, q15 + vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) + vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6) + vrshr.s16 q3, q3, #4 + vmax.s16 q3, q3, q7 +.else + vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) + vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s32 d4, q2, #4 + vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1) + vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2) + vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3) + vqrshrun.s32 d5, q3, #4 + vmin.s16 q2, q2, q15 + vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4) + vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0) + vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5) + vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6) + vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1) + vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2) + vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3) + vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4) + vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0) + vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5) + vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d6, q4, #4 + vqrshrun.s32 d7, q5, #4 +.endif + vmin.s16 q3, q3, q15 + vswp d5, d6 + subs r4, r4, #2 + vst1.16 {q2}, [r0, :128], r1 + vmov q0, q3 + vst1.16 {q3}, [r6, :128], r1 + bgt 8b + vpop {q4-q7} + pop {r4-r8, pc} +160: +320: + sub r1, r1, r3, lsl #1 + mov lr, r3 + +1: + vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2) +2: + vld1.16 {q1, q2}, [r8]! 
// top(0-15) +.if \bpc == 10 + vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) + vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) + vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) + + vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) + vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) + vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) + vrshr.s16 q3, q3, #4 + vmax.s16 q3, q3, q7 + vmin.s16 q3, q3, q15 + vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) + vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) + vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5) + vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6) + + vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) + vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) + vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) + vrshr.s16 q4, q4, #4 + vmax.s16 q4, q4, q7 + vmin.s16 q4, q4, q15 + vmov q0, q4 + vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) + vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) + vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1) + vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2) + vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3) + vrshr.s16 q5, q5, #4 + vmax.s16 q5, q5, q7 + vmin.s16 q5, q5, q15 + vmov q0, q5 + vmov.u16 r12, d5[3] + vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4) + vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0) + vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6) + vmov.16 d0[2], r12 + subs r3, r3, #16 + vrshr.s16 q6, q6, #4 +.else + vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6) + vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4) + vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0) + vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5) + vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6) + vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1) + vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2) + vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3) + vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4) + vqrshrun.s32 d6, q3, #4 + vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1) + vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2) + vqrshrun.s32 d7, q4, #4 + vmin.s16 q3, q3, q15 + vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3) + vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4) + vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0) + vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5) + vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6) + vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1) + vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2) + vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3) + vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4) + vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0) + vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5) + vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d8, q5, #4 + vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1) + vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2) + vmlal.s16 q7, d22, d4[2] // 
p3(top[2]) * filter(3) + vqrshrun.s32 d9, q6, #4 + vmin.s16 q0, q4, q15 + vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4) + vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0) + vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6) + vmin.s16 q4, q4, q15 + vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1) + vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2) + vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3) + vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4) + vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0) + vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d10, q7, #4 + vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1) + vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2) + vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3) + vqrshrun.s32 d11, q6, #4 + vmin.s16 q0, q5, q15 + vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4) + vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0) + vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6) + vmin.s16 q5, q5, q15 + vmov.u16 r12, d5[3] + vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1) + vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2) + vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3) + vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4) + vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0) + vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6) + vmov.16 d0[2], r12 + vqrshrun.s32 d12, q1, #4 + subs r3, r3, #16 + vqrshrun.s32 d13, q7, #4 +.endif + vswp q4, q5 +.if \bpc == 10 + vmax.s16 q6, q6, q7 +.endif + vswp d7, d10 + vmin.s16 q6, q6, q15 + + vswp d9, d12 + + vst1.16 {q3, q4}, [r0, :128]! + vst1.16 {q5, q6}, [r6, :128]! + ble 8f + vmov.u16 r12, d13[3] + vmov.16 d0[0], r12 + vmov.u16 r12, d9[3] + vmov.16 d0[1], r12 + b 2b +8: + subs r4, r4, #2 + + ble 9f + sub r8, r6, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc +.endm + +filter_fn 10 +filter_fn 12 + +function ipred_filter_16bpc_neon, export=1 + push {r4-r8, lr} + vpush {q4-q7} + movw r12, 0x3ff + ldr r8, [sp, #104] + cmp r8, r12 + ble ipred_filter_10bpc_neon + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + vld1.16 {q14}, [r2, :128] + clz lr, r4 + adr r12, L(pal_pred_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vmov.i16 q15, #0x100 + add r12, r12, lr + add r2, r0, r1 + bx r12 + + .align 2 +L(pal_pred_tbl): + .word 640f - L(pal_pred_tbl) + CONFIG_THUMB + .word 320f - L(pal_pred_tbl) + CONFIG_THUMB + .word 160f - L(pal_pred_tbl) + CONFIG_THUMB + .word 80f - L(pal_pred_tbl) + CONFIG_THUMB + .word 40f - L(pal_pred_tbl) + CONFIG_THUMB + +40: + lsl r1, r1, #1 +4: + vld1.8 {q1}, [r3, :128]! + subs r5, r5, #4 + // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 
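+ // (doubling gives the low-byte offset of each 16-bit palette entry; adding
+ // 0x100 per 16-bit lane turns the duplicated index into 2*a+1, the high-byte
+ // offset, so vtbl can gather both bytes of the entry from q14)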
+ vadd.i8 q0, q1, q1 + vadd.i8 q1, q1, q1 + vzip.8 q0, q1 + vadd.i16 q0, q0, q15 + vadd.i16 q1, q1, q15 + vtbl.8 d0, {q14}, d0 + vtbl.8 d1, {q14}, d1 + vst1.16 {d0}, [r0, :64], r1 + vtbl.8 d2, {q14}, d2 + vst1.16 {d1}, [r2, :64], r1 + vtbl.8 d3, {q14}, d3 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d3}, [r2, :64], r1 + bgt 4b + pop {r4-r5, pc} +80: + lsl r1, r1, #1 +8: + vld1.8 {q1, q2}, [r3, :128]! + subs r5, r5, #4 + // Prefer doing the adds twice, instead of chaining a vmov after + // the add. + vadd.i8 q0, q1, q1 + vadd.i8 q1, q1, q1 + vadd.i8 q3, q2, q2 + vadd.i8 q2, q2, q2 + vzip.8 q0, q1 + vzip.8 q2, q3 + vadd.i16 q0, q0, q15 + vadd.i16 q1, q1, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q2, q2, q15 + vtbl.8 d1, {q14}, d1 + vadd.i16 q3, q3, q15 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vst1.16 {q0}, [r0, :128], r1 + vtbl.8 d6, {q14}, d6 + vst1.16 {q1}, [r2, :128], r1 + vtbl.8 d7, {q14}, d7 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r2, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + lsl r1, r1, #1 +16: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #4 + vld1.8 {q10, q11}, [r3, :128]! + vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128], r1 + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r2, :128], r1 + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r0, :128], r1 + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r2, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + lsl r1, r1, #1 + sub r1, r1, #32 +32: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #2 + vld1.8 {q10, q11}, [r3, :128]! + vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128]! + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r0, :128], r1 + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r2, :128]! + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r2, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #96 +64: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #1 + vld1.8 {q10, q11}, [r3, :128]! 
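+ // 64 index bytes restructured as above; one full 64-pixel row per iteration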
+ vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128]! + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r0, :128]! + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r0, :128]! + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r0, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + clz lr, r3 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_128_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vrshr.u16 q0, q15, #1 + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r12 + + .align 2 +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + +L(ipred_cfl_splat_w4): + vld1.16 {q8, q9}, [r5, :128]! + vmull.s16 q2, d16, d2 // diff = ac * alpha + vmull.s16 q3, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q8, #31 + vshr.s32 q13, q9, #31 + vadd.i32 q2, q2, q10 // diff + sign + vadd.i32 q3, q3, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q8, #6 + vrshrn.i32 d7, q9, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d6}, [r0, :64], r1 + vst1.16 {d7}, [r6, :64], r1 + bgt L(ipred_cfl_splat_w4) + pop {r4-r8, pc} +L(ipred_cfl_splat_w8): + vld1.16 {q8, q9}, [r5, :128]! 
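+ // 16 ac coefficients = two 8-wide rows per iteration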
+ subs r4, r4, #2 + vmull.s16 q2, d16, d2 // diff = ac * alpha + vmull.s16 q3, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q8, #31 + vshr.s32 q13, q9, #31 + vadd.i32 q2, q2, q10 // diff + sign + vadd.i32 q3, q3, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q8, #6 + vrshrn.i32 d7, q9, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r6, :128], r1 + bgt L(ipred_cfl_splat_w8) + pop {r4-r8, pc} +L(ipred_cfl_splat_w16): + vpush {q4-q7} + add r12, r5, r3, lsl #1 + sub r1, r1, r3, lsl #1 + mov lr, r3 +1: + vld1.16 {q6, q7}, [r5, :128]! + vmull.s16 q2, d12, d2 // diff = ac * alpha + vld1.16 {q8, q9}, [r12, :128]! + vmull.s16 q3, d13, d3 + vmull.s16 q4, d14, d2 + vmull.s16 q5, d15, d3 + vmull.s16 q6, d16, d2 + vmull.s16 q7, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q4, #31 + vshr.s32 q13, q5, #31 + vadd.i32 q2, q2, q10 // diff + sign + vshr.s32 q10, q6, #31 + vadd.i32 q3, q3, q11 + vshr.s32 q11, q7, #31 + vadd.i32 q4, q4, q12 + vshr.s32 q12, q8, #31 + vadd.i32 q5, q5, q13 + vshr.s32 q13, q9, #31 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q4, #6 + vrshrn.i32 d7, q5, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vrshrn.i32 d8, q6, #6 + vrshrn.i32 d9, q7, #6 + vadd.i16 q3, q3, q0 + vrshrn.i32 d10, q8, #6 + vrshrn.i32 d11, q9, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmax.s16 q4, q4, q14 + vmax.s16 q5, q5, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vmin.s16 q4, q4, q15 + vmin.s16 q5, q5, q15 + subs r3, r3, #16 + vst1.16 {q2, q3}, [r0, :128]! + vst1.16 {q4, q5}, [r6, :128]! 
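+ // 16 pixels of both output rows done; loop across the rest of the width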
+ bgt 1b + subs r4, r4, #2 + add r5, r5, lr, lsl #1 + add r12, r12, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + bgt 1b + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + clz lr, r3 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_top_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vdup.16 q1, r6 // alpha + add r2, r2, #2 + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r12 + + .align 2 +L(ipred_cfl_top_tbl): + .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + +4: + vld1.16 {d0}, [r2] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) +8: + vld1.16 {q0}, [r2] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) +16: + vld1.16 {q2, q3}, [r2] + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +32: + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r2] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q8, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + sub r2, r2, r4, lsl #1 + clz lr, r3 + clz r8, r4 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_splat_tbl) + adr r7, L(ipred_cfl_left_tbl) + sub lr, lr, #26 + sub r8, r8, #26 + ldr lr, [r12, lr, lsl #2] + ldr r8, [r7, r8, lsl #2] + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r7, r7, r8 + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r7 + + .align 2 +L(ipred_cfl_left_tbl): + .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + +L(ipred_cfl_left_h4): + vld1.16 {d0}, [r2, :64] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h8): + vld1.16 {q0}, [r2, :128] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h16): + vld1.16 {q2, q3}, [r2, :128] + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h32): + vld1.16 {q8, q9}, [r2, :128]! 
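+ // Sum all 32 left pixels and round-shift by 5 to get the dc value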
+ vld1.16 {q10, q11}, [r2, :128] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q8, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + bx r12 +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + sub r2, r2, r4, lsl #1 + add r8, r3, r4 // width + height + vdup.16 q1, r6 // alpha + clz lr, r3 + clz r6, r4 + vdup.32 d16, r8 // width + height + vdup.16 q15, r7 // bitdepth_max + adr r7, L(ipred_cfl_tbl) + rbit r8, r8 // rbit(width + height) + sub lr, lr, #22 // 26 leading bits, minus table offset 4 + sub r6, r6, #26 + clz r8, r8 // ctz(width + height) + ldr lr, [r7, lr, lsl #2] + ldr r6, [r7, r6, lsl #2] + neg r8, r8 // -ctz(width + height) + add r12, r7, lr + add r7, r7, r6 + vshr.u32 d16, d16, #1 // (width + height) >> 1 + vdup.32 d17, r8 // -ctz(width + height) + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r7 + + .align 2 +L(ipred_cfl_tbl): + .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB + +L(ipred_cfl_h4): + vld1.16 {d0}, [r2, :64]! + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w4): + vld1.16 {d1}, [r2] + vadd.i32 d0, d0, d16 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #4 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 8/16 + cmp r4, #16 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + vld1.16 {q0}, [r2, :128]! + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w8): + vld1.16 {q2}, [r2] + vadd.i32 d0, d0, d16 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #8 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + vld1.16 {q2, q3}, [r2, :128]! + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w16): + vld1.16 {q2, q3}, [r2] + vadd.i32 d0, d0, d16 + vadd.i16 q2, q2, q3 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #16 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + vld1.16 {q2, q3}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128]! 
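+ // Sum the 32 left pixels; the width handler reached via r12 adds the top edge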
+ vadd.i16 q2, q2, q3 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q2, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w32): + vld1.16 {q2, q3}, [r2]! + vadd.i32 d0, d0, d16 + vld1.16 {q10, q11}, [r2]! + vadd.i16 q2, q2, q3 + vadd.i16 q10, q10, q11 + vadd.i16 q2, q2, q10 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #32 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_420_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_420_tbl): + .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #1 + subs r8, r8, #2 + vst1.16 {q0}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + bgt 1b + cmp r4, #0 + vmov d0, d1 + vmov d2, d1 + vmov d3, d1 +L(ipred_cfl_ac_420_w4_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + vadd.i32 q8, q8, q9 + vadd.i32 q10, q10, q11 + vadd.i32 q0, q8, q10 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] +6: // Subtract dc from ac + vld1.16 {q0, q1}, [r0, :128] + subs r6, r6, #4 + vsub.i16 q0, q0, q8 + vsub.i16 q1, q1, q8 + vst1.16 {q0, q1}, [r0, :128]! + bgt 6b + pop {r4-r8, pc} + +L(ipred_cfl_ac_420_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vld1.16 {q12, q13}, [r1, :128], r2 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q3 + vpadd.i16 d2, d24, d25 + vpadd.i16 d3, d26, d27 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
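+ // Accumulate the ac sums; their average (dc) is subtracted from the buffer at the end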
+ vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov q0, q1 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #1 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov q0, q1 + +L(ipred_cfl_ac_420_w8_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl r6, r6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr r7, L(ipred_cfl_ac_420_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_420_w16_tbl): + .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w16_wpad0): + sub r2, r2, #32 +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q12, q13}, [r12, :128]! + vld1.16 {q2, q3}, [r1, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vld1.16 {q12, q13}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vadd.i16 q2, q2, q12 + vadd.i16 q3, q3, q13 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): + sub r2, r2, #32 +1: // Copy and subsample input, padding 4 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q12, q13}, [r12, :128]! + vld1.16 {q2}, [r1, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vld1.16 {q12}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vadd.i16 q2, q2, q12 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vshl.i16 q0, q0, #1 + vshl.i16 d2, d2, #1 + subs r8, r8, #1 + vdup.16 d3, d2[3] + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q12, q13}, [r12, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vshl.i16 q0, q0, #1 + subs r8, r8, #1 + vdup.16 q1, d1[3] + vst1.16 {q0, q1}, [r0, :128]! 
+ vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q12}, [r12, :128], r2 + vadd.i16 q0, q0, q12 + vpadd.i16 d0, d0, d1 + vshl.i16 d0, d0, #1 + subs r8, r8, #1 + vdup.16 q1, d0[3] + vdup.16 d1, d0[3] + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl r6, r6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc + +// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_422_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_422_tbl): + .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vld1.16 {q12, q13}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d24, d24, d25 + vpadd.i16 d25, d26, d27 + vpadd.i16 d26, d4, d5 + vpadd.i16 d27, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q12, #2 + vshl.i16 q3, q13, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
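+ // Accumulate the second pair of rows (d4-d7) into the running sums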
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vld1.16 {q12}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d24, d24, d25 + vpadd.i16 d25, d4, d5 + vshl.i16 q0, q0, #2 + vshl.i16 q12, q12, #2 + vdup.16 d7, d25[3] + vmov d6, d25 + vdup.16 d5, d24[3] + vmov d4, d24 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr r7, L(ipred_cfl_ac_422_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_422_w16_tbl): + .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w16_wpad0): + sub r2, r2, #32 +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r12, :128]! + vld1.16 {q12, q13}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d24, d25 + vpadd.i16 d3, d26, d27 + vld1.16 {q12, q13}, [r12, :128], r2 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d6, d24, d25 + vpadd.i16 d7, d26, d27 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): + sub r2, r2, #32 +1: // Copy and subsample input, padding 4 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r12, :128]! + vld1.16 {q12}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d24, d25 + vld1.16 {q12}, [r12, :128], r2 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d6, d24, d25 + vshl.i16 q0, q0, #2 + vshl.i16 d2, d2, #2 + vshl.i16 q2, q2, #2 + vshl.i16 d6, d6, #2 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q2, q2, #2 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #2 + vdup.16 q3, d1[3] + vdup.16 q1, d0[3] + vdup.16 d5, d1[3] + vmov d4, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) +endfunc + +// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_444_tbl) + sub r8, r8, #26 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_444_tbl): + .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d1}, [r12, :64], r2 + vld1.16 {d2}, [r1, :64], r2 + vld1.16 {d3}, [r12, :64], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cmp r3, #0 + bne L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q2, q2, #3 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr r7, L(ipred_cfl_ac_444_w32_tbl) + ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 + asr r2, r2, #1 + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_444_w32_tbl): + .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w32_wpad0): + sub r2, r2, #32 +1: // Copy and expand input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): + sub r2, r2, #32 +1: // Copy and expand input, padding 8 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q3, d5[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + vld1.16 {q0, q1}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q2, d3[3] + vdup.16 q3, d3[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + vld1.16 {q0}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + subs r8, r8, #1 + vdup.16 q1, d1[3] + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q2, d1[3] + vdup.16 q3, d1[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + +L(ipred_cfl_ac_444_w32_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl r6, r6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S new file mode 100644 index 0000000000..ceea025e45 --- /dev/null +++ b/third_party/dav1d/src/arm/32/itx.S @@ -0,0 +1,3343 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +const idct_coeffs, align=4 + // idct4 + .short 2896, 2896*8, 1567, 3784 + // idct8 + .short 799, 4017, 3406, 2276 + // idct16 + .short 401, 4076, 3166, 2598 + .short 1931, 3612, 3920, 1189 + // idct32 + .short 201, 4091, 3035, 2751 + .short 1751, 3703, 3857, 1380 + .short 995, 3973, 3513, 2106 + .short 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .short 101*8, 4095*8, 2967*8, -2824*8 + .short 1660*8, 3745*8, 3822*8, -1474*8 + .short 4076, 401, 4017, 799 + + .short 4036*8, -700*8, 2359*8, 3349*8 + .short 3461*8, -2191*8, 897*8, 3996*8 + .short -3166, -2598, -799, -4017 + + .short 501*8, 4065*8, 3229*8, -2520*8 + .short 2019*8, 3564*8, 3948*8, -1092*8 + .short 3612, 1931, 2276, 3406 + + .short 4085*8, -301*8, 2675*8, 3102*8 + .short 3659*8, -1842*8, 1285*8, 3889*8 + .short -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + // .h[4-5] can be interpreted as .s[2] + .short 1321, 3803, 2482, 3344, 3344, 0 +endconst + +const iadst8_coeffs, align=4 + .short 4076, 401, 3612, 1931 + .short 2598, 3166, 1189, 3920 + // idct_coeffs + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +const iadst16_coeffs, align=4 + .short 4091, 201, 3973, 995 + .short 3703, 1751, 3290, 2440 + .short 2751, 3035, 2106, 3513 + .short 1380, 3857, 601, 4052 +endconst + +.macro vmull_vmlal d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s2, \c1 + vmull.s16 \d1, \s1, \c0 + vmlal.s16 \d1, \s3, \c1 +.endm + +.macro vmull_vmlsl d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s2, \c1 + vmull.s16 \d1, \s1, \c0 + vmlsl.s16 \d1, \s3, \c1 +.endm + +.macro vqrshrn_8h d0, d1, s0, s1, shift + vqrshrn.s32 \d0, \s0, \shift + vqrshrn.s32 \d1, \s1, \shift +.endm + +.macro scale_input 
c, r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s16 \r0, \r0, \c + vqrdmulh.s16 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s16 \r2, \r2, \c + vqrdmulh.s16 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s16 \r4, \r4, \c + vqrdmulh.s16 \r5, \r5, \c + vqrdmulh.s16 \r6, \r6, \c + vqrdmulh.s16 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 +.ifnb \load + vld1.8 {\load}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.8 {\store}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src, \shiftbits + load_add_store d3, q9, , , , , , \dst, \src, \shiftbits + load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits + load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits + load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits + load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits + load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits + load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits + load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits + load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits + load_add_store , , , , , , d3, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src + load_add_store d3, q9, , , , , , \dst, \src + load_add_store d4, q10, d2, q8, , , , \dst, \src + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src + load_add_store , , d4, q10, q9, d3, d2, \dst, \src + load_add_store , , d5, q11, q10, d4, d3, \dst, \src + load_add_store , , , , q11, d5, d4, \dst, \src + load_add_store , , , , , , d5, \dst, \src +.endm +.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src +.ifnb \load + vld1.32 {\load[0]}, [\src, :32], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #4 +.endif +.ifnb \load + vld1.32 {\load[1]}, [\src, :32], r1 +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \store + vst1.32 {\store[0]}, [\dst, :32], r1 +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.32 {\store[1]}, [\dst, :32], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src + load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src + load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src + load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src + load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src + load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src + load_add_store4 , , , , q15, d7, d6, \dst, \src + load_add_store4 , , , , , , d7, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src + 
load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src + load_add_store4 , , , , q11, d3, d2, \dst, \src + load_add_store4 , , , , , , d3, \dst, \src +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 {d16[]}, [r2, :16] + vdup.16 d0, r12 + vqrdmulh.s16 d16, d16, d0[0] + vst1.16 {d30[0]}, [r2, :16] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s16 d16, d16, d0[0] +.endif +.if \shift > 0 + vrshr.s16 d16, d16, #\shift +.endif + vqrdmulh.s16 d20, d16, d0[0] + mov r3, #\h + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon +1: + vld1.32 {d0[0]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[0]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + subs r3, r3, #4 + sub r0, r0, r1, lsl #2 + vaddw.u8 q10, q8, d0 + vqmovun.s16 d0, q10 + vaddw.u8 q11, q8, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q11 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon +1: + vld1.8 {d0}, [r0, :64], r1 + vld1.8 {d1}, [r0, :64], r1 + vld1.8 {d2}, [r0, :64], r1 + vaddw.u8 q10, q8, d0 + vld1.8 {d3}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + subs r3, r3, #4 + vaddw.u8 q11, q8, d1 + vqmovun.s16 d0, q10 + vaddw.u8 q12, q8, d2 + vqmovun.s16 d1, q11 + vaddw.u8 q13, q8, d3 + vst1.8 {d0}, [r0, :64], r1 + vqmovun.s16 d2, q12 + vst1.8 {d1}, [r0, :64], r1 + vqmovun.s16 d3, q13 + vst1.8 {d2}, [r0, :64], r1 + vst1.8 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon +1: + vld1.8 {q0}, [r0, :128], r1 + vld1.8 {q1}, [r0, :128], r1 + vld1.8 {q2}, [r0, :128], r1 + subs r3, r3, #4 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vld1.8 {q3}, [r0, :128], r1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #2 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q1}, [r0, :128], r1 + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon +1: + vld1.8 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.8 {q2, q3}, [r0, :128], r1 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #1 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #32 +1: + vld1.8 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.8 {q2, q3}, [r0, :128] + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, #32 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128]! 
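+ // Narrow and store the second 32 pixels of the 64-wide row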
+ vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i16 d16, d16, d17 + vsub.i16 d21, d18, d19 + vsub.i16 d20, d16, d21 + vshr.s16 d20, d20, #1 + vsub.i16 d18, d20, d17 + vsub.i16 d17, d20, d19 + vadd.i16 d19, d21, d18 + vsub.i16 d16, d16, d17 +.endm + +.macro idct_4h_x4 r0, r1, r2, r3 + vmull_vmlal q3, \r1, \r3, d0[3], d0[2] + vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] + vmull_vmlal q1, \r0, \r2, d0[0], d0[0] + vqrshrn.s32 d6, q3, #12 + vqrshrn.s32 d7, q2, #12 + vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] + vqrshrn.s32 d2, q1, #12 + vqrshrn.s32 d3, q2, #12 + vqadd.s16 \r0, d2, d6 + vqsub.s16 \r3, d2, d6 + vqadd.s16 \r1, d3, d7 + vqsub.s16 \r2, d3, d7 +.endm + +.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] + vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] + vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] + vqrshrn_8h d12, d13, q6, q7, #12 + vqrshrn_8h d14, d15, q4, q5, #12 + vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] + vqrshrn_8h d4, d5, q2, q3, #12 + vqrshrn_8h d6, d7, q4, q5, #12 + vqadd.s16 \q0, q2, q6 + vqsub.s16 \q3, q2, q6 + vqadd.s16 \q1, q3, q7 + vqsub.s16 \q2, q3, q7 +.endm + +function inv_dct_4h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_4h_x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_dct_8h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q1, d16, d18 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmull.s16 q10, d17, d0[3] + vaddw.s16 q1, q1, d19 + vmull.s16 q3, d16, d0[2] + vmlsl.s16 q3, d18, d0[0] + vmlsl.s16 q3, d19, d0[1] + + vadd.s32 q11, q2, q3 + vmul.s32 q1, q1, d1[0] + vadd.s32 q2, q2, q10 + vadd.s32 q3, q3, q10 + vsub.s32 q11, q11, q10 + + vqrshrn.s32 \o0, q2, #12 + vqrshrn.s32 \o2, q1, #12 + vqrshrn.s32 \o1, q3, #12 + vqrshrn.s32 \o3, q11, #12 +.endm + +function inv_adst_4h_x4_neon, export=1 + iadst_4x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_flipadst_4h_x4_neon, export=1 + iadst_4x4 d19, d18, d17, d16 + bx lr +endfunc + +.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q2, d16, d20 + vsubl.s16 q3, d17, d21 + vmull.s16 q4, d16, d0[0] + vmlal.s16 q4, d20, d0[1] + vmlal.s16 q4, d22, d0[2] + vmull.s16 q5, d17, d0[0] + vmlal.s16 q5, d21, d0[1] + vmlal.s16 q5, d23, d0[2] + vaddw.s16 q2, q2, d22 + vaddw.s16 q3, q3, d23 + vmull.s16 q6, d16, d0[2] + vmlsl.s16 q6, d20, d0[0] + vmlsl.s16 q6, d22, d0[1] + vmull.s16 q7, d17, d0[2] + vmlsl.s16 q7, d21, d0[0] + vmlsl.s16 q7, d23, d0[1] + + vmul.s32 q10, q2, d1[0] + vmul.s32 q11, q3, d1[0] + + vmull.s16 q2, d18, d0[3] + vmull.s16 q3, d19, d0[3] + + vadd.s32 q8, q4, q2 // out0 + vadd.s32 q9, q5, q3 + + vadd.s32 q4, q4, q6 // out3 + vadd.s32 q5, q5, q7 + + vadd.s32 q6, q6, q2 // out1 + vadd.s32 q7, q7, q3 + + vsub.s32 q4, q4, q2 // out3 + vsub.s32 q5, q5, q3 + + vqrshrn.s32 d20, q10, #12 + vqrshrn.s32 d21, q11, #12 + + vqrshrn.s32 \o0, q8, #12 + vqrshrn.s32 \o1, q9, #12 + +.ifc \o4, d18 + vmov q9, q10 +.endif + + vqrshrn.s32 \o2, q6, #12 + vqrshrn.s32 \o3, q7, #12 + + vqrshrn.s32 \o6, q4, #12 + vqrshrn.s32 \o7, q5, #12 +.endm + +function inv_adst_8h_x4_neon, export=1 
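+ // Length-4 ADST applied to 8 lanes at once; inputs and outputs in q8-q11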
+ iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_8h_x4_neon, export=1 + iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_identity_4h_x4_neon, export=1 + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q2, q8, d0[0] + vqrdmulh.s16 q3, q9, d0[0] + vqadd.s16 q8, q8, q2 + vqadd.s16 q9, q9, q3 + bx lr +endfunc + +function inv_identity_8h_x4_neon, export=1 + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q1, q8, d0[0] + vqrdmulh.s16 q2, q9, d0[0] + vqrdmulh.s16 q3, q10, d0[0] + vqadd.s16 q8, q8, q1 + vqrdmulh.s16 q1, q11, d0[0] + vqadd.s16 q9, q9, q2 + vqadd.s16 q10, q10, q3 + vqadd.s16 q11, q11, q1 + bx lr +endfunc + +.macro identity_8x4_shift1 r0, r1, r2, r3, c +.irp i, \r0, \r1, \r2, \r3 + vqrdmulh.s16 q1, \i, \c + vrhadd.s16 \i, \i, q1 +.endr +.endm + +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + vshr.s16 q8, q8, #2 + vshr.s16 q9, q9, #2 + + iwht4 + + vst1.16 {q15}, [r2, :128]! + transpose_4x4h q8, q9, d16, d17, d18, d19 + + iwht4 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + blx r4 + + vst1.16 {q15}, [r2, :128]! + transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + +L(itx_4x4_end): + sub r0, r0, r1, lsl #2 + vaddw.u8 q8, q8, d0 + vqmovun.s16 d0, q8 + vaddw.u8 q9, q9, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q9 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 {d16[]}, [r2, :16] + vdup.16 d4, r12 + vst1.16 {d30[0]}, [r2, :16] + vqrdmulh.s16 d16, d16, d4[0] + vld1.32 {d0[0]}, [r0, :32], r1 + vqrdmulh.s16 d20, d16, d4[0] + vld1.32 {d0[1]}, [r0, :32], r1 + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + vld1.32 {d1[0]}, [r0, :32], r1 + vmov q9, q8 + vld1.32 {d1[1]}, [r0, :32], r1 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4h_x4_neon + movrel_local r5, inv_\txfm2\()_4h_x4_neon + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 + + vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a + vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a + vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // 
-> t5a + vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a + vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a + vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a + vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a + vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a + + vqadd.s16 q2, \q1, \q3 // t4 + vqsub.s16 \q1, \q1, \q3 // t5a + vqadd.s16 q3, \q7, \q5 // t7 + vqsub.s16 \q3, \q7, \q5 // t6a + + vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 + vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 + vqrshrn_8h d8, d9, q4, q5, #12 // t5 + vqrshrn_8h d10, d11, q6, q7, #12 // t6 + + vqsub.s16 \q7, \q0, q3 // out7 + vqadd.s16 \q0, \q0, q3 // out0 + vqadd.s16 \q1, \q2, q5 // out1 + vqsub.s16 q6, \q2, q5 // out6 + vqadd.s16 \q2, \q4, q4 // out2 + vqsub.s16 \q5, \q4, q4 // out5 + vqadd.s16 \q3, \q6, q2 // out3 + vqsub.s16 \q4, \q6, q2 // out4 + vmov \q6, q6 // out6 +.endm + +.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4h_x4 \r0, \r2, \r4, \r6 + + vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a + vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a + vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a + vqrshrn.s32 \r1, q1, #12 // t4a + vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a + vqrshrn.s32 \r7, q2, #12 // t7a + vqrshrn.s32 \r3, q3, #12 // t5a + vqrshrn.s32 \r5, q1, #12 // taa + + vqadd.s16 d2, \r1, \r3 // t4 + vqsub.s16 \r1, \r1, \r3 // t5a + vqadd.s16 d3, \r7, \r5 // t7 + vqsub.s16 \r3, \r7, \r5 // t6a + + vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 + vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 + vqrshrn.s32 d4, q2, #12 // t5 + vqrshrn.s32 d5, q3, #12 // t6 + + vqsub.s16 \r7, \r0, d3 // out7 + vqadd.s16 \r0, \r0, d3 // out0 + vqadd.s16 \r1, \r2, d5 // out1 + vqsub.s16 d6, \r2, d5 // out6 + vqadd.s16 \r2, \r4, d4 // out2 + vqsub.s16 \r5, \r4, d4 // out5 + vqadd.s16 \r3, \r6, d2 // out3 + vqsub.s16 \r4, \r6, d2 // out4 + vmov \r6, d6 // out6 +.endm + +function inv_dct_8h_x8_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_dct_4h_x8_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] + vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] + vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] + vqrshrn_8h d16, d17, q2, q3, #12 // t0a + vqrshrn_8h d30, d31, q4, q5, #12 // t1a + vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] + vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] + vqrshrn_8h d20, d21, q6, q7, #12 // t2a + vqrshrn_8h d26, d27, q2, q3, #12 // t3a + vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] + vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] + vqrshrn_8h d24, d25, q4, q5, #12 // t4a + vqrshrn_8h d22, d23, q6, q7, #12 // t5a + vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] + vqrshrn_8h d28, d29, q2, q3, #12 // t6a + vqrshrn_8h d18, d19, q4, q5, #12 // t7a + + vqadd.s16 q2, q8, q12 // t0 + vqsub.s16 q3, q8, q12 // t4 + vqadd.s16 q4, q15, q11 // t1 + vqsub.s16 q5, q15, q11 // t5 + vqadd.s16 q6, q10, q14 // t2 + vqsub.s16 q7, q10, q14 // t6 + vqadd.s16 q10, q13, q9 // t3 + vqsub.s16 q11, 
q13, q9 // t7 + + vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2] + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] + vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] + + vqrshrn_8h d6, d7, q8, q9, #12 // t4a + vqrshrn_8h d10, d11, q12, q13, #12 // t5a + + vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] + + vqrshrn_8h d14, d15, q14, q15, #12 // t6a + vqrshrn_8h d22, d23, q8, q9, #12 // t7a + + vqadd.s16 \q0, q2, q6 // out0 + vqsub.s16 q2, q2, q6 // t2 + vqadd.s16 \q7, q4, q10 // out7 + vqsub.s16 q4, q4, q10 // t3 + vqneg.s16 \q7, \q7 // out7 + + vqadd.s16 \q1, q3, q7 // out1 + vqsub.s16 q3, q3, q7 // t6 + vqadd.s16 \q6, q5, q11 // out6 + vqsub.s16 q5, q5, q11 // t7 + vqneg.s16 \q1, \q1 // out1 + + vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) + vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) + vqrshrn_8h d4, d5, q10, q11, #12 // out3 + vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) + vqrshrn_8h d6, d7, q12, q13, #12 // out5 + vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) + vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) + + vqneg.s16 \q3, q2 // out3 + vqneg.s16 \q5, q3 // out5 +.endm + +.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal q2, d23, d16, d0[0], d0[1] + vmull_vmlsl q3, d23, d16, d0[1], d0[0] + vmull_vmlal q4, d21, d18, d0[2], d0[3] + vqrshrn.s32 d16, q2, #12 // t0a + vqrshrn.s32 d23, q3, #12 // t1a + vmull_vmlsl q5, d21, d18, d0[3], d0[2] + vmull_vmlal q6, d19, d20, d1[0], d1[1] + vqrshrn.s32 d18, q4, #12 // t2a + vqrshrn.s32 d21, q5, #12 // t3a + vmull_vmlsl q7, d19, d20, d1[1], d1[0] + vmull_vmlal q2, d17, d22, d1[2], d1[3] + vqrshrn.s32 d20, q6, #12 // t4a + vqrshrn.s32 d19, q7, #12 // t5a + vmull_vmlsl q3, d17, d22, d1[3], d1[2] + vqrshrn.s32 d22, q2, #12 // t6a + vqrshrn.s32 d17, q3, #12 // t7a + + vqadd.s16 d4, d16, d20 // t0 + vqsub.s16 d5, d16, d20 // t4 + vqadd.s16 d6, d23, d19 // t1 + vqsub.s16 d7, d23, d19 // t5 + vqadd.s16 d8, d18, d22 // t2 + vqsub.s16 d9, d18, d22 // t6 + vqadd.s16 d18, d21, d17 // t3 + vqsub.s16 d19, d21, d17 // t7 + + vmull_vmlal q8, d5, d7, d2[3], d2[2] + vmull_vmlsl q10, d5, d7, d2[2], d2[3] + vmull_vmlsl q11, d19, d9, d2[3], d2[2] + + vqrshrn.s32 d5, q8, #12 // t4a + vqrshrn.s32 d7, q10, #12 // t5a + + vmull_vmlal q8, d19, d9, d2[2], d2[3] + + vqrshrn.s32 d9, q11, #12 // t6a + vqrshrn.s32 d19, q8, #12 // t7a + + vqadd.s16 \r0, d4, d8 // out0 + vqsub.s16 d4, d4, d8 // t2 + vqadd.s16 \r7, d6, d18 // out7 + vqsub.s16 d6, d6, d18 // t3 + vqneg.s16 \r7, \r7 // out7 + + vqadd.s16 \r1, d5, d9 // out1 + vqsub.s16 d5, d5, d9 // t6 + vqadd.s16 \r6, d7, d19 // out6 + vqsub.s16 d7, d7, d19 // t7 + vqneg.s16 \r1, \r1 // out1 + + vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) + vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) + vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) + vqrshrn.s32 d4, q9, #12 // out3 + vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) + vqrshrn.s32 d5, q10, #12 // out5 + vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21) + vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19) + + vqneg.s16 \r3, d4 // out3 + vqneg.s16 \r5, d5 // out5 +.endm + +function inv_adst_8h_x8_neon, export=1 + iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, 
d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_8h_x8_neon, export=1 + iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_adst_4h_x8_neon, export=1 + iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_4h_x8_neon, export=1 + iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_8h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + vqshl.s16 q12, q12, #1 + vqshl.s16 q13, q13, #1 + vqshl.s16 q14, q14, #1 + vqshl.s16 q15, q15, #1 + bx lr +endfunc + +function inv_identity_4h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + bx lr +endfunc + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_neon + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128] + +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blx r4 + + vrshr.s16 q8, q8, #1 + vrshr.s16 q9, q9, #1 + vrshr.s16 q10, q10, #1 + vrshr.s16 q11, q11, #1 + vrshr.s16 q12, q12, #1 + vrshr.s16 q13, q13, #1 + vrshr.s16 q14, q14, #1 + vrshr.s16 q15, q15, #1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +.macro def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.ifc \txfm1, identity + b inv_txfm_identity_add_8x8_neon +.else + movrel_local r4, inv_\txfm1\()_8h_x8_neon + b inv_txfm_add_8x8_neon +.endif +endfunc +.endm + +def_fn_8x8 dct, dct +def_fn_8x8 identity, identity +def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst +def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst +def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity +def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst + +function inv_txfm_add_8x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +function inv_txfm_add_4x8_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! 
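+ // Load the second half of the coefficients and clear them in memory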
+ vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x8h q8, q9, q10, q11 + vswp d17, d20 + vswp d19, d21 + vswp d17, d18 + vswp d19, d22 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 \w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0, q1}, [r12, :128] + + vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a + vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a + vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a + vqrshrn.s32 d17, q2, #12 // t8a + vqrshrn.s32 d31, q3, #12 // t15a + vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a + vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a + vqrshrn.s32 d23, q4, #12 // t9a + vqrshrn.s32 d25, q2, #12 // t14a + vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a + vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a + vqrshrn.s32 d21, q3, #12 // t10a + vqrshrn.s32 d27, q4, #12 // t13a + vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a + vqrshrn.s32 d19, q2, #12 // t11a + vqrshrn.s32 d29, q3, #12 // t12a + + idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + vqsub.s16 d4, d17, d23 // t9 + vqadd.s16 d17, d17, d23 // t8 + vqsub.s16 d5, d31, d25 // t14 + vqadd.s16 d31, d31, d25 // t15 + vqsub.s16 d23, d19, d21 // t10 + vqadd.s16 d19, d19, d21 // t11 + vqadd.s16 d25, d29, d27 // t12 + vqsub.s16 d29, d29, d27 // t13 + + vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a + vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a + vqrshrn.s32 d21, q3, #12 // t9a + vqrshrn.s32 d27, q4, #12 // t14a + + vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a + vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a + vqrshrn.s32 d29, q3, #12 // t13a + vneg.s32 q4, q4 + vqrshrn.s32 d23, q4, #12 // t10a + + vqsub.s16 d4, d17, d19 // t11a + vqadd.s16 d17, d17, d19 // t8a + vqsub.s16 d5, d31, d25 // t12a + vqadd.s16 d31, d31, d25 // t15a + vqadd.s16 d19, d21, d23 // t9 + vqsub.s16 d21, d21, d23 // t10 + vqsub.s16 d25, d27, d29 // t13 + vqadd.s16 d27, d27, d29 // t14 + + vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11 + vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 + vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a + + vqrshrn.s32 d6, q3, #12 // t11 + vqrshrn.s32 d7, q4, #12 // t12 + vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a + vqrshrn.s32 d4, q2, #12 // t10a + vqrshrn.s32 d5, q4, #12 // t13a + + vqadd.s16 d8, d16, d31 // out0 + vqsub.s16 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s16 d23, d30, d17 // out7 + vqsub.s16 d9, d30, d17 // out8 + vqadd.s16 d17, d18, d27 // out1 + 
vqsub.s16 d30, d18, d27 // out14 + vqadd.s16 d18, d20, d5 // out2 + vqsub.s16 d29, d20, d5 // out13 + vqadd.s16 d5, d28, d19 // out6 + vqsub.s16 d25, d28, d19 // out9 + vqadd.s16 d19, d22, d7 // out3 + vqsub.s16 d28, d22, d7 // out12 + vqadd.s16 d20, d24, d6 // out4 + vqsub.s16 d27, d24, d6 // out11 + vqadd.s16 d21, d26, d4 // out5 + vqsub.s16 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.16 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + + vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 + vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 + vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 + vqrshrn.s32 d16, q2, #12 // t0 + vqrshrn.s32 d31, q3, #12 // t1 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 + vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 + vqrshrn.s32 d18, q4, #12 // t2 + vqrshrn.s32 d29, q2, #12 // t3 + vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 + vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 + vqrshrn.s32 d20, q3, #12 // t4 + vqrshrn.s32 d27, q4, #12 // t5 + vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 + vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 + vqrshrn.s32 d22, q2, #12 // t6 + vqrshrn.s32 d25, q3, #12 // t7 + vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 + vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 + vqrshrn.s32 d23, q4, #12 // t8 + vqrshrn.s32 d24, q2, #12 // t9 + vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 + vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 + vqrshrn.s32 d21, q3, #12 // t10 + vqrshrn.s32 d26, q4, #12 // t11 + vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 + vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 + vqrshrn.s32 d19, q2, #12 // t12 + vqrshrn.s32 d28, q3, #12 // t13 + vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 + vqrshrn.s32 d17, q4, #12 // t14 + vqrshrn.s32 d30, q2, #12 // t15 + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d23 // t8a + vqadd.s16 d16, d16, d23 // t0a + vqsub.s16 d3, d31, d24 // t9a + vqadd.s16 d31, d31, d24 // t1a + vqadd.s16 d23, d18, d21 // t2a + vqsub.s16 d18, d18, d21 // t10a + vqadd.s16 d24, d29, d26 // t3a + vqsub.s16 d29, d29, d26 // t11a + vqadd.s16 d21, d20, d19 // t4a + vqsub.s16 d20, d20, d19 // t12a + vqadd.s16 d26, d27, d28 // t5a + vqsub.s16 d27, d27, d28 // t13a + vqadd.s16 d19, d22, d17 // t6a + vqsub.s16 d22, d22, d17 // t14a + vqadd.s16 d28, d25, d30 // t7a + vqsub.s16 d25, d25, d30 // t15a + + vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 + vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 + vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 + vqrshrn.s32 d17, q2, #12 // t8 + vqrshrn.s32 d30, q3, #12 // t9 + vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 + vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 + vqrshrn.s32 d18, q4, #12 // t10 + vqrshrn.s32 d29, q2, #12 // t11 + vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 + vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 + vqrshrn.s32 d27, q3, #12 // t12 + vqrshrn.s32 d20, q4, #12 // t13 + vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 + vqrshrn.s32 d25, q2, #12 // t14 + vqrshrn.s32 d22, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t4 + vqadd.s16 d16, d16, d21 // t0 + vqsub.s16 d3, d31, d26 // t5 + vqadd.s16 d31, d31, d26 // t1 + vqadd.s16 d21, d23, d19 // t2 + vqsub.s16 d23, d23, d19 // t6 + vqadd.s16 d26, d24, d28 // t3 + vqsub.s16 d24, d24, d28 // t7 + vqadd.s16 d19, d17, d27 // t8a + vqsub.s16 d17, d17, d27 // t12a + vqadd.s16 d28, d30, d20 // t9a + vqsub.s16 
d30, d30, d20 // t13a + vqadd.s16 d27, d18, d25 // t10a + vqsub.s16 d18, d18, d25 // t14a + vqadd.s16 d20, d29, d22 // t11a + vqsub.s16 d29, d29, d22 // t15a + + vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a + vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a + vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a + vqrshrn.s32 d22, q2, #12 // t4a + vqrshrn.s32 d25, q3, #12 // t5a + vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a + vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 + vqrshrn.s32 d24, q4, #12 // t6a + vqrshrn.s32 d23, q2, #12 // t7a + vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 + vqrshrn.s32 d17, q3, #12 // t12 + vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 + vqrshrn.s32 d29, q4, #12 // t13 + vqrshrn.s32 d30, q2, #12 // t14 + vqrshrn.s32 d18, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s16 \o0, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 +.else + vqadd.s16 d4, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + vqneg.s16 \o15, \o15 // out15 + + vqsub.s16 d3, d29, d18 // t15a + vqadd.s16 \o13,d29, d18 // out13 + vqadd.s16 \o2, d17, d30 // out2 + vqsub.s16 d26, d17, d30 // t14a + vqneg.s16 \o13,\o13 // out13 + + vqadd.s16 \o1, d19, d27 // out1 + vqsub.s16 d27, d19, d27 // t10 + vqadd.s16 \o14,d28, d20 // out14 + vqsub.s16 d20, d28, d20 // t11 + vqneg.s16 \o1, \o1 // out1 + + vqadd.s16 \o3, d22, d24 // out3 + vqsub.s16 d22, d22, d24 // t6 + vqadd.s16 \o12,d25, d23 // out12 + vqsub.s16 d23, d25, d23 // t7 + vqneg.s16 \o3, \o3 // out3 + + vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vqrshrn.s32 d24, q12, #12 // out8 + vqrshrn.s32 d4, q2, #12 // out7 + vqrshrn.s32 d5, q3, #12 // out5 + vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vqrshrn.s32 d26, q4, #12 // out10 + + vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vqrshrn.s32 \o4, q1, #12 // out4 + vqrshrn.s32 d7, q3, #12 // out9 + vqrshrn.s32 d6, q4, #12 // out11 + vqrshrn.s32 \o6, q11, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s16 \o7, d4 // out7 + vqneg.s16 \o5, d5 // out5 + vqneg.s16 \o11,d6 // out11 + vqneg.s16 \o9, d7 // out9 +.endm + +function inv_adst_4h_x16_neon, export=1 + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_4h_x16_neon, export=1 + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_4h_x16_neon, export=1 + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q1, \i, d0[0] + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_4x16_shift2 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vshr.s16 q2, q2, #1 + vrhadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_4x16_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vrshr.s16 q2, q2, #1 + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_8x8_shift1 c + 
identity_4x16_shift1 \c +.endm + +.macro identity_8x8 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + push {lr} + vmov.i16 d7, #0 +.if \identity + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.endif +.if \scale + movw r12, #2896*8 + vdup.16 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif +.if \identity +.if \shift == -2 + identity_4x16_shift2 d0[0] +.else + identity_4x16_shift1 d0[0] +.endif +.else + blx r4 +.endif +.if \shift > 0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \i, \i, #\shift +.endr +.endif + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31 + vst1.16 {\i}, [r6, :64]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + blx r9 +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4} +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_identity_16x4_neon +.else + movrel_local r9, inv_txfm_horz_16x4_neon + movrel_local r4, inv_\txfm1\()_4h_x16_neon +.endif + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_16x4_neon + +.ifc \variant, identity_ + vmov.i16 d4, #0 +.irp i, d16, d18, d20, d22 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d17, d19, d21, d23 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, d24, d26, d28, d30 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d25, d27, d29, d31 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + + identity_4x16_shift1 d0[0] +.else + vmov.i16 q2, #0 + vmov.i16 q3, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d24, d25, d26, d27}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d28, d29, d30, d31}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! 
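+ // First pass: call the first transform via the function pointer in r4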
+ + blx r4 + + vswp d17, d20 + vswp d19, d22 + vswp d18, d20 + vswp d19, d21 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + +.ifc \variant, identity_ + vmov q8, q12 + vmov q9, q13 + vmov q10, q14 + vmov q11, q15 +.else + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 + vrshr.s16 q8, q12, #1 + vrshr.s16 q9, q13, #1 + vrshr.s16 q10, q14, #1 + vrshr.s16 q11, q15, #1 +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + add r6, r0, #8 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_4x16_neon + vmov.i16 q2, #0 + + mov r11, #32 + cmp r3, r10 + blt 1f + + add r6, r2, #16 +.ifc \variant, identity_ +.irp i, q12, q13, q14, q15 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q12, q13, q14, q15, d0[0] +.else +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + blx r4 + vrshr.s16 q12, q8, #1 + vrshr.s16 q13, q9, #1 + vrshr.s16 q14, q10, #1 + vrshr.s16 q15, q11, #1 +.endif + transpose_4x8h q12, q13, q14, q15 + vswp d27, d29 + vswp d26, d28 + vswp d27, d30 + vswp d25, d28 + + b 2f +1: +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +2: + vmov.i16 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr +.ifc \variant, identity_ + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q8, q9, q10, q11, d0[0] +.else + blx r4 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + vswp d19, d21 + vswp d18, d20 + vswp d19, d22 + vswp d17, d20 + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_416_base +def_fn_416_base identity_ + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon + mov r10, #\eob_half +.else + movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_16x8_neon + sub_sp_align 256 + +.irp i, 0, 4 + add r6, sp, #(\i*16*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #8*2 + blx r9 +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr +2: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_8x16_neon + sub_sp_align 256 + +.irp i, 0, 8 + add r6, sp, #(\i*8*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + + vmov.i16 q2, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128] + vst1.16 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.ifc \variant, identity_ + // The identity shl #1 and downshift vrshr #1 cancel out +.else + blx r4 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \j, \j, #1 +.endr +.endif + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + vst1.16 {q8, q9}, [r6, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr +2: + +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_816_base +def_fn_816_base identity_ + +.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_8h_x8_neon + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.else +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon +.else + movrel_local r4, inv_\txfm1\()_4h_x16_neon + movrel_local r9, inv_txfm_horz_scale_16x4_neon +.endif + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.endif +.if \w == 8 + mov r10, #\eob_8x8 +.else + mov r10, #\eob_4x4 +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43, 10 +def_fn_816 \w, \h, identity, identity, 43, 10 +def_fn_816 \w, \h, dct, adst, 43, 10 +def_fn_816 \w, \h, dct, flipadst, 43, 10 +def_fn_816 \w, \h, dct, identity, 8, 4 +def_fn_816 \w, \h, adst, dct, 43, 10 +def_fn_816 \w, \h, adst, adst, 43, 10 +def_fn_816 \w, \h, adst, flipadst, 43, 10 +def_fn_816 \w, \h, flipadst, dct, 43, 10 +def_fn_816 \w, \h, flipadst, adst, 43, 10 +def_fn_816 \w, \h, flipadst, flipadst, 43, 10 +def_fn_816 \w, \h, identity, dct, 64, 4 +def_fn_816 \w, \h, adst, identity, 8, 4 +def_fn_816 \w, \h, flipadst, identity, 8, 4 +def_fn_816 \w, \h, identity, adst, 64, 4 +def_fn_816 \w, \h, identity, flipadst, 64, 4 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs, 2*16 + vld1.16 {q0, q1}, [r12, :128] + sub r12, r12, #2*16 + + vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a + vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a + vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a + vqrshrn.s32 d16, q2, #12 // t16a + vqrshrn.s32 d31, q3, #12 // t31a + vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a + vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a + vqrshrn.s32 d24, q4, #12 // t17a + vqrshrn.s32 d23, q2, #12 // t30a + vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> 
t29a + vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a + vqrshrn.s32 d20, q3, #12 // t18a + vqrshrn.s32 d27, q4, #12 // t29a + vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a + vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a + vqrshrn.s32 d28, q2, #12 // t19a + vqrshrn.s32 d19, q3, #12 // t28a + vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a + vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a + vqrshrn.s32 d18, q4, #12 // t20a + vqrshrn.s32 d29, q2, #12 // t27a + vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a + vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a + vqrshrn.s32 d26, q3, #12 // t21a + vqrshrn.s32 d21, q4, #12 // t26a + vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a + vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a + vqrshrn.s32 d22, q2, #12 // t22a + vqrshrn.s32 d25, q3, #12 // t25a + vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a + vqrshrn.s32 d30, q4, #12 // t23a + vqrshrn.s32 d17, q2, #12 // t24a + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d24 // t17 + vqadd.s16 d16, d16, d24 // t16 + vqsub.s16 d3, d31, d23 // t30 + vqadd.s16 d31, d31, d23 // t31 + vqsub.s16 d24, d28, d20 // t18 + vqadd.s16 d28, d28, d20 // t19 + vqadd.s16 d23, d18, d26 // t20 + vqsub.s16 d18, d18, d26 // t21 + vqsub.s16 d20, d30, d22 // t22 + vqadd.s16 d30, d30, d22 // t23 + vqadd.s16 d26, d17, d25 // t24 + vqsub.s16 d17, d17, d25 // t25 + vqsub.s16 d22, d29, d21 // t26 + vqadd.s16 d29, d29, d21 // t27 + vqadd.s16 d25, d19, d27 // t28 + vqsub.s16 d19, d19, d27 // t29 + + vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a + vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a + vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a + vqrshrn.s32 d21, q2, #12 // t17a + vqrshrn.s32 d27, q3, #12 // t30a + vneg.s32 q4, q4 // -> t18a + vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a + vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a + vqrshrn.s32 d19, q4, #12 // t18a + vqrshrn.s32 d24, q1, #12 // t29a + vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a + vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a + vqrshrn.s32 d22, q2, #12 // t21a + vqrshrn.s32 d18, q3, #12 // t26a + vneg.s32 q4, q4 // -> t22a + vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a + vqrshrn.s32 d17, q4, #12 // t22a + vqrshrn.s32 d20, q1, #12 // t25a + + vqsub.s16 d2, d27, d24 // t29 + vqadd.s16 d27, d27, d24 // t30 + vqsub.s16 d3, d21, d19 // t18 + vqadd.s16 d21, d21, d19 // t17 + vqsub.s16 d24, d16, d28 // t19a + vqadd.s16 d16, d16, d28 // t16a + vqsub.s16 d19, d30, d23 // t20a + vqadd.s16 d30, d30, d23 // t23a + vqsub.s16 d28, d17, d22 // t21 + vqadd.s16 d17, d17, d22 // t22 + vqadd.s16 d23, d26, d29 // t24a + vqsub.s16 d26, d26, d29 // t27a + vqadd.s16 d22, d20, d18 // t25 + vqsub.s16 d20, d20, d18 // t26 + vqsub.s16 d29, d31, d25 // t28a + vqadd.s16 d31, d31, d25 // t31a + + vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a + vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a + vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 + vqrshrn.s32 d18, q2, #12 // t18a + vqrshrn.s32 d25, q3, #12 // t29a + vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 + vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 + vqrshrn.s32 d29, q4, #12 // t19 + vqrshrn.s32 d24, q1, #12 // t28 + vneg.s32 q2, q2 // -> t20 + vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 + vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a + vqrshrn.s32 d26, q2, #12 // t20 + vqrshrn.s32 d19, q3, #12 // t27 + vneg.s32 q4, q4 // -> t21a + vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a + vqrshrn.s32 d20, q4, #12 // t21a + vqrshrn.s32 d28, q1, #12 // t26a + + vqsub.s16 d2, 
d16, d30 // t23 + vqadd.s16 d16, d16, d30 // t16 = out16 + vqsub.s16 d3, d31, d23 // t24 + vqadd.s16 d31, d31, d23 // t31 = out31 + vqsub.s16 d23, d21, d17 // t22a + vqadd.s16 d17, d21, d17 // t17a = out17 + vqadd.s16 d30, d27, d22 // t30a = out30 + vqsub.s16 d21, d27, d22 // t25a + vqsub.s16 d27, d18, d20 // t21 + vqadd.s16 d18, d18, d20 // t18 = out18 + vqadd.s16 d4, d29, d26 // t19a = out19 + vqsub.s16 d26, d29, d26 // t20a + vqadd.s16 d29, d25, d28 // t29 = out29 + vqsub.s16 d25, d25, d28 // t26 + vqadd.s16 d28, d24, d19 // t28a = out28 + vqsub.s16 d24, d24, d19 // t27a + vmov d19, d4 // out19 + + vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 + vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 + vqrshrn.s32 d20, q2, #12 // t20 + vqrshrn.s32 d22, q3, #12 // t27 + + vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a + vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vqrshrn.s32 d26, q2, #12 // t26a + + vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 + vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 + vqrshrn.s32 d21, q3, #12 // t21a + vqrshrn.s32 d22, q12, #12 // t22 + vqrshrn.s32 d25, q2, #12 // t25 + + vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a + vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a + vqrshrn.s32 d23, q2, #12 // t23a + vqrshrn.s32 d24, q3, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + push {lr} + vmov.i16 d7, #0 + lsl r8, r8, #1 +.if \scale + movw r12, #2896*8 + vdup.16 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_4h_x16_neon + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! + vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! + add r6, r6, #32 +.endm + store1 d16, d20, d24, d28 + store1 d17, d21, d25, d29 + store1 d18, d22, d26, d30 + store1 d19, d23, d27, d31 +.purgem store1 + sub r6, r6, #64*4 + + vmov.i16 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_4h_x16_neon + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q13, q12, d27, d26, d25, d24 + transpose_4x4h q11, q10, d23, d22, d21, d20 + transpose_4x4h q9, q8, d19, d18, d17, d16 +.macro store2 r0, r1, r2, r3, shift + vld1.16 {q0, q1}, [r6, :128] + vqsub.s16 d7, d0, \r0 + vqadd.s16 d0, d0, \r0 + vqsub.s16 d6, d1, \r1 + vqadd.s16 d1, d1, \r1 + vqsub.s16 d5, d2, \r2 + vqadd.s16 d2, d2, \r2 + vqsub.s16 d4, d3, \r3 + vqadd.s16 d3, d3, \r3 + vrev64.16 q2, q2 + vrev64.16 q3, q3 + vrshr.s16 q0, q0, #\shift + vrshr.s16 q1, q1, #\shift + vrshr.s16 q2, q2, #\shift + vrshr.s16 q3, q3, #\shift + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! 
+.endm + + store2 d31, d27, d23, d19, \shift + store2 d30, d26, d22, d18, \shift + store2 d29, d25, d21, d17, \shift + store2 d28, d24, d20, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl inv_dct_4h_x16_neon + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl inv_dct32_odd_4h_x16_neon + + neg r9, r8 + mov r10, r6 +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.32 {d2[0]}, [r10, :32], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.32 {d2[1]}, [r10, :32], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.32 {d3[0]}, [r10, :32], r1 + \op\().s16 d5, d5, \r1 + vld1.32 {d3[1]}, [r10, :32], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vaddw.u8 q2, q2, d2 + \op\().s16 d7, d7, \r3 + vqmovun.s16 d2, q2 + vrshr.s16 q3, q3, #4 + vst1.32 {d2[0]}, [r6, :32], r1 + vaddw.u8 q3, q3, d3 + vst1.32 {d2[1]}, [r6, :32], r1 + vqmovun.s16 d3, q3 + vst1.32 {d3[0]}, [r6, :32], r1 + vst1.32 {d3[1]}, [r6, :32], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop {r10-r11,pc} +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + // Contrary to the others, this one is only ever used in increments of 8x8 + .short 43, 107, 171, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 + push {r4-r7,lr} + vmov.i16 q0, #0 + movrel_local r5, eob_32x32, 2 + + mov r6, #2*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 2 +2: + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r6 +.endr + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + load_add_store_8x8 r0, r7, shiftbits=2 + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r7,lr} + movw r6, #2896*8 + movw r7, #2*(5793-4096)*8 + vdup.i16 d0, r6 + movrel_local r5, eob_16x32\hshort, 2 + vmov.16 d0[1], r7 + + mov r6, #2*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 2 +2: + 
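+ // Inner loop: load and clear one 8x8 block of coefficients, scale it by
+ // ~1/sqrt(2) (the 2:1 rectangular transform factor, 2896/4096 via vqrdmulh),
+ // apply the identity transform, transpose and add to the destination.
+ // The eob thresholds read through r4/r5 decide when the remaining 8x8
+ // blocks in the current strip can be skipped.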
vmov.i16 q1, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x8_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s16, 1 + identity_8x8 d0[1] +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + +.if \w == 16 + load_add_store_8x8 r0, r7, shiftbits=2 +.else + load_add_store_8x8 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q0, #0 + movrel_local r4, eob_8x32 + + mov r12, #2*\h +1: + ldrh lr, [r4], #2 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r12 +.endr + +.if \w == 8 + // 8x32 + shift_8_regs vrshr.s16, 1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + cmp r3, lr +.if \w == 8 + load_add_store_8x8 r0, r5, shiftbits=2 +.else + load_add_store_8x8 r0, r5, shiftbits=3 +.endif + + blt 9f +.if \w == 8 + sub r2, r2, r12, lsl #3 + add r2, r2, #2*8 +.else + sub r0, r0, r1, lsl #3 + add r0, r0, #8 +.endif + b 1b + +9: + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r5, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32 + + mov r8, #2*32 + mov r9, #32 + mov r6, sp +1: + vmov.i16 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #2 + sub r2, r2, r8, lsl #3 + sub r9, r9, #8 + add r2, r2, #2*8 + + bl inv_dct_8h_x8_neon + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \i, \i, #2 +.endr + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #8 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + +.irp i, 0, 4 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + cmp r3, #10 + blt 1f +.endif + mov r8, #8*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + +2: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl inv_dct_8h_x8_neon + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.16 {d0, d1, d2}, [r12, :64]! 
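+ // d0-d2 now hold one group of idct64_coeffs: d0/d1 feed the vqrdmulh.s16
+ // multiplies of the first butterfly stage, while the d2 pair feeds the
+ // full-precision vmull_vmlal/vmlsl rotations further down, each rounded
+ // down by 12 bits via vqrshrn.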
+ + vqrdmulh.s16 d23, d16, d0[1] // t63a + vqrdmulh.s16 d16, d16, d0[0] // t32a + vqrdmulh.s16 d22, d17, d0[2] // t62a + vqrdmulh.s16 d17, d17, d0[3] // t33a + vqrdmulh.s16 d21, d18, d1[1] // t61a + vqrdmulh.s16 d18, d18, d1[0] // t34a + vqrdmulh.s16 d20, d19, d1[2] // t60a + vqrdmulh.s16 d19, d19, d1[3] // t35a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t33 + vqsub.s16 d26, d19, d18 // t34 + vqadd.s16 d27, d19, d18 // t35 + vqadd.s16 d28, d20, d21 // t60 + vqsub.s16 d29, d20, d21 // t61 + vqsub.s16 d30, d23, d22 // t62 + vqadd.s16 d31, d23, d22 // t63 + + vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a + vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a + vneg.s32 q2, q2 // t34a + vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a + vqrshrn.s32 d26, q2, #12 // t34a + vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a + vqrshrn.s32 d29, q3, #12 // t61a + vqrshrn.s32 d25, q4, #12 // t33a + vqrshrn.s32 d30, q2, #12 // t62a + + vqadd.s16 d16, d24, d27 // t32a + vqsub.s16 d19, d24, d27 // t35a + vqadd.s16 d17, d25, d26 // t33 + vqsub.s16 d18, d25, d26 // t34 + vqsub.s16 d20, d31, d28 // t60a + vqadd.s16 d23, d31, d28 // t63a + vqsub.s16 d21, d30, d29 // t61 + vqadd.s16 d22, d30, d29 // t62 + + vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a + vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a + vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 + vqrshrn.s32 d21, q2, #12 // t61a + vqrshrn.s32 d18, q3, #12 // t34a + vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 + vqrshrn.s32 d20, q4, #12 // t60 + vqrshrn.s32 d19, q2, #12 // t35 + + vst1.16 {d16, d17, d18, d19}, [r6, :128]! + vst1.16 {d20, d21, d22, d23}, [r6, :128]! + + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #2*4*0] // t32a + vldr d17, [r9, #2*4*8] // t39a + vldr d18, [r9, #2*4*0] // t63a + vldr d19, [r6, #2*4*8] // t56a + vldr d20, [r6, #2*4*16] // t40a + vldr d21, [r9, #2*4*24] // t47a + vldr d22, [r9, #2*4*16] // t55a + vldr d23, [r6, #2*4*24] // t48a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t39 + vqadd.s16 d26, d18, d19 // t63 + vqsub.s16 d27, d18, d19 // t56 + vqsub.s16 d28, d21, d20 // t40 + vqadd.s16 d29, d21, d20 // t47 + vqadd.s16 d30, d23, d22 // t48 + vqsub.s16 d31, d23, d22 // t55 + + vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a + vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a + vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a + vqrshrn.s32 d25, q2, #12 // t56a + vqrshrn.s32 d27, q3, #12 // t39a + vneg.s32 q4, q4 // t40a + vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a + vqrshrn.s32 d31, q4, #12 // t40a + vqrshrn.s32 d28, q2, #12 // t55a + + vqadd.s16 d16, d24, d29 // t32a + vqsub.s16 d19, d24, d29 // t47a + vqadd.s16 d17, d27, d31 // t39 + vqsub.s16 d18, d27, d31 // t40 + vqsub.s16 d20, d26, d30 // t48a + vqadd.s16 d23, d26, d30 // t63a + vqsub.s16 d21, d25, d28 // t55 + vqadd.s16 d22, d25, d28 // t56 + + vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a + vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a + vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 + vqrshrn.s32 d18, q2, #12 // t40a + vqrshrn.s32 d21, q3, #12 // t55a + vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 + vqrshrn.s32 d19, q4, #12 // t47 + vqrshrn.s32 d20, q2, #12 // t48 + + vstr d16, [r6, #2*4*0] // t32a + vstr d17, [r9, #2*4*0] // t39 + vstr d18, [r6, #2*4*8] // t40a + vstr d19, 
[r9, #2*4*8] // t47 + vstr d20, [r6, #2*4*16] // t48 + vstr d21, [r9, #2*4*16] // t55a + vstr d22, [r6, #2*4*24] // t56 + vstr d23, [r9, #2*4*24] // t63a + + add r6, r6, #2*4 + sub r9, r9, #2*4 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.16 {\i}, [\src, :64] + vst1.16 {\zero}, [\src, :64], \strd +.else + vld1.16 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.16 {q8, q9}, [\dst, :128]! + vst1.16 {q10, q11}, [\dst, :128]! + vst1.16 {q12, q13}, [\dst, :128]! + vst1.16 {q14, q15}, [\dst, :128]! +.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i16 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + movw \gpr, \val + vdup.16 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.16 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_4h_x16_neon + + store16 r6 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_4h_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + mov r9, #-8 + +.macro store_addsub r0, r1, r2, r3 + vld1.16 {d2}, [r6, :64]! + vld1.16 {d3}, [r6, :64]! + vqadd.s16 d6, d2, \r0 + vqsub.s16 \r0, d2, \r0 + vld1.16 {d4}, [r6, :64]! + vqadd.s16 d7, d3, \r1 + vqsub.s16 \r1, d3, \r1 + vld1.16 {d5}, [r6, :64]! + vqadd.s16 d2, d4, \r2 + sub r6, r6, #8*4 + vqsub.s16 \r2, d4, \r2 + vst1.16 {d6}, [r6, :64]! + vst1.16 {\r0}, [r10, :64], r9 + vqadd.s16 d3, d5, \r3 + vqsub.s16 \r3, d5, \r3 + vst1.16 {d7}, [r6, :64]! + vst1.16 {\r1}, [r10, :64], r9 + vst1.16 {d2}, [r6, :64]! + vst1.16 {\r2}, [r10, :64], r9 + vst1.16 {d3}, [r6, :64]! 
+ vst1.16 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.16 {d16}, [r7, :64] // in1 (offset 0) + vld1.16 {d17}, [r9, :64] // in31 (offset 15) + vld1.16 {d18}, [r10, :64] // in17 (offset 8) + vld1.16 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.16 {d16}, [r10, :64] // in7 (offset 3) + vld1.16 {d17}, [r11, :64] // in25 (offset 12) + vld1.16 {d18}, [r9, :64] // in23 (offset 11) + vld1.16 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.16 d16, [r10, :64] // in5 (offset 2) + vld1.16 d17, [r11, :64] // in27 (offset 13) + vld1.16 d18, [r9, :64] // in21 (offset 10) + vld1.16 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.16 d16, [r10, :64] // in3 (offset 1) + vld1.16 d17, [r11, :64] // in29 (offset 14) + vld1.16 d18, [r9, :64] // in19 (offset 9) + vld1.16 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x4_neon + vdup.16 q3, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q13, q12, d27, d26, d25, d24 + +.macro store_addsub src0, src1, src2, src3 + vqsub.s16 d3, \src0, \src1 + vqsub.s16 d2, \src2, \src3 + vqadd.s16 d0, \src0, \src1 + vqadd.s16 d1, \src2, \src3 + vrshl.s16 q1, q1, q3 + vrshl.s16 q0, q0, q3 + vrev64.16 q1, q1 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q1}, [r9, :128], r10 +.endm + store_addsub d16, d31, d20, d27 + store_addsub d17, d30, d21, d26 + store_addsub d18, d29, d22, d25 + store_addsub d19, d28, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #2 + sub r9, r9, r10, lsl #2 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! + vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + +.macro add_dest_addsub src0, src1, src2, src3 + vld1.32 {d0[0]}, [r6, :32], r1 + vld1.32 {d1[0]}, [r9, :32], r10 + vqadd.s16 d4, \src0, \src1 + vld1.32 {d0[1]}, [r6, :32] + vqadd.s16 d5, \src2, \src3 + vld1.32 {d1[1]}, [r9, :32] + vqsub.s16 d6, \src0, \src1 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vaddw.u8 q2, q2, d0 + vaddw.u8 q3, q3, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 + vst1.32 {d0[0]}, [r6, :32], r1 + vst1.32 {d1[0]}, [r9, :32], r10 + vst1.32 {d0[1]}, [r6, :32], r1 + vst1.32 {d1[1]}, [r9, :32], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_scale_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 4, 8, 12 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: + movrel_local r5, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S new file mode 100644 index 0000000000..aa6c272e71 --- /dev/null +++ b/third_party/dav1d/src/arm/32/itx16.S @@ -0,0 +1,3625 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +// A macro for cases where a thumb mov can express the constant in one +// instruction, while arm mode requires two separate movw+movt pairs. +.macro mov_const reg, val +#if CONFIG_THUMB + mov.w \reg, #\val +#else + movw \reg, #((\val) & 0xffff) + movt \reg, #(((\val) >> 16) & 0xffff) +#endif +.endm + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro vmul_vmla d0, s0, s1, c0, c1 + vmul.i32 \d0, \s0, \c0 + vmla.i32 \d0, \s1, \c1 +.endm + +.macro vmul_vmls d0, s0, s1, c0, c1 + vmul.i32 \d0, \s0, \c0 + vmls.i32 \d0, \s1, \c1 +.endm + +.macro scale_input c, 
r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s32 \r0, \r0, \c + vqrdmulh.s32 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s32 \r2, \r2, \c + vqrdmulh.s32 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s32 \r4, \r4, \c + vqrdmulh.s32 \r5, \r5, \c + vqrdmulh.s32 \r6, \r6, \c + vqrdmulh.s32 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + vld1.16 {\load}, [\src, :128], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vqadd.s16 \adddst, \adddst, \addsrc +.endif +.ifnb \max + vmax.s16 \max, \max, q6 +.endif +.ifnb \min + vmin.s16 \min, \min, q7 +.endif +.ifnb \store + vst1.16 {\store}, [\dst, :128], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + load_add_store q0, q8, , , , , , \dst, \src, \shiftbits + load_add_store q1, q9, , , , , , \dst, \src, \shiftbits + load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits + load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits + load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits + load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits + load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits + load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits + load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits + load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits + load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits + load_add_store , , , , , q15, q14, \dst, \src, \shiftbits + load_add_store , , , , , , q15, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + load_add_store q0, q8, , , , , , \dst, \src, \shiftbits + load_add_store q1, q9, , , , , , \dst, \src, \shiftbits + load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits + load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits + load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits + load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits + load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits + load_add_store , , , , , q11, q10, \dst, \src, \shiftbits + load_add_store , , , , , , q11, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4 +.ifnb \load1 + vld1.16 {\load1}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \load2 + vld1.16 {\load2}, [\src, :64], r1 +.endif +.ifnb \addsrc + vqadd.s16 \adddst, \adddst, \addsrc +.endif +.ifnb \max + vmax.s16 \max, \max, q6 +.endif +.ifnb \store1 + vst1.16 {\store1}, [\dst, :64], r1 +.endif +.ifnb \min + vmin.s16 \min, \min, q7 +.endif +.ifnb \store2 + vst1.16 {\store2}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src + load_add_store4 d2, d3, q9, , , , , , , \dst, \src + load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src + load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src + load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src + load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src + load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src + load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, 
\dst, \src + load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src + load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src + load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src + load_add_store4 , , , , , , q15, d28, d29, \dst, \src + load_add_store4 , , , , , , , d30, d31, \dst, \src +.endm +.macro load_add_store_4x8 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits + load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits + load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits + load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits + load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits + load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits + load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits + load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits + load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits +.endm +.macro load_add_store_4x4 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits + load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits + load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits + load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits + load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits + load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 q14, #0 + mov_const r12, 2896*8*(1<<16) + vld1.32 {d24[], d25[]}, [r2, :32] + vdup.32 d0, r12 + vqrdmulh.s32 q13, q12, d0[0] + vst1.32 {d28[0]}, [r2, :32] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s32 q13, q13, d0[0] +.endif +.if \shift > 0 + vqrshrn.s32 d24, q13, #\shift + vqrshrn.s32 d25, q13, #\shift +.else + vqmovn.s32 d24, q13 + vqmovn.s32 d25, q13 +.endif + vqrdmulh.s16 q12, q12, d0[1] + mov r3, #\h + vrshr.s16 q12, q12, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d1}, [r0, :64], r1 + vld1.16 {d2}, [r0, :64], r1 + vld1.16 {d3}, [r0, :64], r1 + subs r3, r3, #4 + vqadd.s16 q0, q0, q12 + sub r0, r0, r1, lsl #2 + vqadd.s16 q1, q1, q12 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmin.s16 q0, q0, q15 + vst1.16 {d0}, [r0, :64], r1 + vmin.s16 q1, q1, q15 + vst1.16 {d1}, [r0, :64], r1 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0}, [r0, :128], r1 + subs r3, r3, #4 + vld1.16 {q1}, [r0, :128], r1 + vqadd.s16 q0, q0, q12 + vld1.16 {q2}, [r0, :128], r1 + vqadd.s16 q1, q1, q12 + vld1.16 {q3}, [r0, :128], r1 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, r1, lsl #2 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vst1.16 {q0}, [r0, :128], r1 + vmin.s16 q2, q2, q15 + vst1.16 {q1}, [r0, :128], r1 + vmin.s16 q3, q3, q15 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.16 {q2, q3}, [r0, :128], r1 + vqadd.s16 q0, q0, q12 + vqadd.s16 q1, q1, q12 + vqadd.s16 q2, q2, q12 + 
vqadd.s16 q3, q3, q12 + sub r0, r0, r1, lsl #1 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vst1.16 {q0, q1}, [r0, :128], r1 + vmin.s16 q3, q3, q15 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon + sub r1, r1, #32 + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.16 {q2, q3}, [r0, :128] + vqadd.s16 q0, q0, q12 + vqadd.s16 q1, q1, q12 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, #32 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vst1.16 {q0, q1}, [r0, :128]! + vmin.s16 q3, q3, q15 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #96 + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.16 {q2, q3}, [r0, :128]! + vqadd.s16 q0, q0, q12 + vld1.16 {q8, q9}, [r0, :128]! + vqadd.s16 q1, q1, q12 + vld1.16 {q10, q11}, [r0, :128] + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + vqadd.s16 q8, q8, q12 + vqadd.s16 q9, q9, q12 + vqadd.s16 q10, q10, q12 + vqadd.s16 q11, q11, q12 + sub r0, r0, #96 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmax.s16 q8, q8, q14 + vmax.s16 q9, q9, q14 + vmax.s16 q10, q10, q14 + vmax.s16 q11, q11, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vmin.s16 q8, q8, q15 + vst1.16 {q0, q1}, [r0, :128]! + vmin.s16 q9, q9, q15 + vst1.16 {q2, q3}, [r0, :128]! + vmin.s16 q10, q10, q15 + vst1.16 {q8, q9}, [r0, :128]! + vmin.s16 q11, q11, q15 + vst1.16 {q10, q11}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i32 q8, q8, q9 + vsub.i32 q13, q10, q11 + vsub.i32 q12, q8, q13 + vshr.s32 q12, q12, #1 + vsub.i32 q10, q12, q9 + vsub.i32 q9, q12, q11 + vadd.i32 q11, q13, q10 + vsub.i32 q8, q8, q9 +.endm + +.macro idct_4s_x4 r0, r1, r2, r3 + vmul_vmla q4, \r1, \r3, d1[1], d1[0] + vmul_vmla q2, \r0, \r2, d0[0], d0[0] + vmul_vmls q3, \r1, \r3, d1[0], d1[1] + vmul_vmls q5, \r0, \r2, d0[0], d0[0] + vrshr.s32 q4, q4, #12 + vrshr.s32 q2, q2, #12 + vrshr.s32 q3, q3, #12 + vrshr.s32 q5, q5, #12 + vqadd.s32 \r0, q2, q4 + vqsub.s32 \r3, q2, q4 + vqadd.s32 \r1, q5, q3 + vqsub.s32 \r2, q5, q3 +.endm + +.macro idct_2s_x4 r0, r1, r2, r3 + vmul_vmla d6, \r1, \r3, d1[1], d1[0] + vmul_vmla d4, \r0, \r2, d0[0], d0[0] + vmul_vmls d5, \r1, \r3, d1[0], d1[1] + vmul_vmls d7, \r0, \r2, d0[0], d0[0] + vrshr.s32 d6, d6, #12 + vrshr.s32 d4, d4, #12 + vrshr.s32 d5, d5, #12 + vrshr.s32 d7, d7, #12 + vqadd.s32 \r0, d4, d6 + vqsub.s32 \r3, d4, d6 + vqadd.s32 \r1, d7, d5 + vqsub.s32 \r2, d7, d5 +.endm + +function inv_dct_4s_x4_neon + movrel_local r12, idct_coeffs + vld1.32 {d0, d1}, [r12, :128] + idct_4s_x4 q8, q9, q10, q11 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.32 {d0, d1}, [r12, :128] + + vsub.i32 q1, q8, q10 + vmul.i32 q2, q8, d0[0] + vmla.i32 q2, q10, d0[1] + vmla.i32 q2, q11, d1[0] + vmul.i32 q4, q9, d1[1] + vadd.i32 q1, q1, q11 + vmul.i32 q3, q8, d1[0] + vmls.i32 q3, q10, d0[0] + vmls.i32 q3, q11, d0[1] + + vadd.i32 \o3, q2, q3 + vmul.i32 \o2, q1, d1[1] + vadd.i32 \o0, q2, q4 + vadd.i32 \o1, q3, q4 + vsub.i32 \o3, \o3, q4 + + vrshr.s32 \o0, \o0, #12 + vrshr.s32 \o2, \o2, #12 + vrshr.s32 \o1, \o1, #12 + vrshr.s32 \o3, \o3, #12 
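+ // The four constants in d0/d1 (1321, 3803, 2482, 3344) correspond to the
+ // AV1 sinpi rotation constants at 12-bit precision; all four outputs are
+ // rounded down by 12 bits to undo that scaling.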
+.endm + +function inv_adst_4s_x4_neon + iadst_4x4 q8, q9, q10, q11 + bx lr +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 q11, q10, q9, q8 + bx lr +endfunc + +function inv_identity_4s_x4_neon + mov r12, #0 + movt r12, #(5793-4096)*8 + vdup.32 d0, r12 + vqrdmulh.s32 q1, q8, d0[0] + vqrdmulh.s32 q2, q9, d0[0] + vqrdmulh.s32 q3, q10, d0[0] + vqrdmulh.s32 q4, q11, d0[0] + vqadd.s32 q8, q8, q1 + vqadd.s32 q9, q9, q2 + vqadd.s32 q10, q10, q3 + vqadd.s32 q11, q11, q4 + bx lr +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q4-q5} + vmov.i16 q14, #0 + vmov.i16 q15, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.32 {q14, q15}, [r2, :128]! + vshr.s16 q8, q8, #2 + vld1.32 {q10, q11}, [r2, :128] + vshr.s16 q9, q9, #2 + vshr.s16 q10, q10, #2 + vshr.s16 q11, q11, #2 + + iwht4 + + vst1.32 {q14, q15}, [r2, :128] + transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + + iwht4 + + vld1.16 {d0}, [r0, :64], r1 + vqmovn.s32 d16, q8 + vld1.16 {d1}, [r0, :64], r1 + vqmovn.s32 d17, q9 + vld1.16 {d2}, [r0, :64], r1 + vqmovn.s32 d18, q10 + vld1.16 {d3}, [r0, :64], r1 + vqmovn.s32 d19, q11 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! + vld1.32 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d1}, [r0, :64], r1 + vrshr.s16 q8, q8, #4 + vld1.16 {d2}, [r0, :64], r1 + vrshr.s16 q9, q9, #4 + vld1.16 {d3}, [r0, :64], r1 + +L(itx_4x4_end): + vmvn.i16 q15, #0xfc00 // 0x3ff + sub r0, r0, r1, lsl #2 + vqadd.s16 q8, q8, q0 + vqadd.s16 q9, q9, q1 + vmax.s16 q8, q8, q14 + vmax.s16 q9, q9, q14 + vmin.s16 q8, q8, q15 + vmin.s16 q9, q9, q15 + vst1.16 {d16}, [r0, :64], r1 + vst1.16 {d17}, [r0, :64], r1 + vst1.16 {d18}, [r0, :64], r1 + vst1.16 {d19}, [r0, :64], r1 + + vpop {q4-q5} + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q4-q5} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 q14, #0 + mov_const r12, 2896*8*(1<<16) + vld1.32 {d16[], d17[]}, [r2, :32] + vdup.32 d4, r12 + vst1.32 {d28[0]}, [r2, :32] + vqrdmulh.s32 q8, q8, d4[0] + vld1.16 {d0}, [r0, :64], r1 + vqmovn.s32 d20, q8 + vqmovn.s32 d21, q8 + vld1.16 {d1}, [r0, :64], r1 + vqrdmulh.s16 q10, q10, d4[1] + vld1.16 {d2}, [r0, :64], r1 + vrshr.s16 q8, q10, #4 + vld1.16 {d3}, [r0, :64], r1 + vrshr.s16 q9, q10, #4 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4s_x4_neon + movrel r5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4s_x4 \r0, \r2, \r4, \r6 + + vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + vmin.s32 \r, \r, q5 +.endr +.irp 
r, \r0, \r2, \r4, \r6 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a + vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a + vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a + vrshr.s32 \r1, q2, #12 // t4a + vrshr.s32 \r7, q3, #12 // t7a + vrshr.s32 \r3, q6, #12 // t5a + vrshr.s32 \r5, q7, #12 // t6a + + vqadd.s32 q2, \r1, \r3 // t4 + vqsub.s32 \r1, \r1, \r3 // t5a + vqadd.s32 q3, \r7, \r5 // t7 + vqsub.s32 \r3, \r7, \r5 // t6a + +.irp r, q2, \r1, q3, \r3 + vmin.s32 \r, \r, q5 +.endr +.irp r, q2, \r1, q3, \r3 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5 + vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 + vrshr.s32 q7, q7, #12 // t5 + vrshr.s32 q5, q6, #12 // t6 + + vqsub.s32 \r7, \r0, q3 // out7 + vqadd.s32 \r0, \r0, q3 // out0 + vqadd.s32 \r1, \r2, q5 // out1 + vqsub.s32 q6, \r2, q5 // out6 + vqadd.s32 \r2, \r4, q7 // out2 + vqsub.s32 \r5, \r4, q7 // out5 + vqadd.s32 \r3, \r6, q2 // out3 + vqsub.s32 \r4, \r6, q2 // out4 + vmov \r6, q6 // out6 +.endm + +.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_2s_x4 \r0, \r2, \r4, \r6 + + vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + vmin.s32 \r, \r, d9 +.endr +.irp r, \r0, \r2, \r4, \r6 + vmax.s32 \r, \r, d8 +.endr + + vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a + vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a + vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a + vrshr.s32 \r1, d4, #12 // t4a + vrshr.s32 \r7, d5, #12 // t7a + vrshr.s32 \r3, d6, #12 // t5a + vrshr.s32 \r5, d7, #12 // t6a + + vqadd.s32 d4, \r1, \r3 // t4 + vqsub.s32 \r1, \r1, \r3 // t5a + vqadd.s32 d5, \r7, \r5 // t7 + vqsub.s32 \r3, \r7, \r5 // t6a + +.irp r, d4, \r1, d5, \r3 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, \r1, d5, \r3 + vmax.s32 \r, \r, d8 +.endr + + vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 + vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6 + vrshr.s32 d6, d6, #12 // t5 + vrshr.s32 d7, d7, #12 // t6 + + vqsub.s32 \r7, \r0, d5 // out7 + vqadd.s32 \r0, \r0, d5 // out0 + vqadd.s32 \r1, \r2, d7 // out1 + vqsub.s32 d7, \r2, d7 // out6 + vqadd.s32 \r2, \r4, d6 // out2 + vqsub.s32 \r5, \r4, d6 // out5 + vqadd.s32 \r3, \r6, d4 // out3 + vqsub.s32 \r4, \r6, d4 // out4 + vmov \r6, d7 // out6 +.endm + +function inv_dct_4s_x8_neon + movrel_local r12, idct_coeffs + vld1.32 {q0, q1}, [r12, :128] + idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 + bx lr +endfunc + +.macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
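+ // q0/q1 = the first eight iadst8 constants (4076/401, 3612/1931, 2598/3166,
+ // 1189/3920); each vmul_vmla/vmul_vmls pair below forms a 32-bit rotation
+ // a*c0 +/- b*c1, rounded down by 12 bits afterwards.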
+ + vmul_vmla q2, q15, q8, d0[0], d0[1] + vmul_vmls q3, q15, q8, d0[1], d0[0] + vmul_vmla q4, q13, q10, d1[0], d1[1] + vrshr.s32 q8, q2, #12 // t0a + vrshr.s32 q15, q3, #12 // t1a + vmul_vmls q5, q13, q10, d1[1], d1[0] + vmul_vmla q6, q11, q12, d2[0], d2[1] + vrshr.s32 q10, q4, #12 // t2a + vrshr.s32 q13, q5, #12 // t3a + vmul_vmls q7, q11, q12, d2[1], d2[0] + vmul_vmla q2, q9, q14, d3[0], d3[1] + vrshr.s32 q12, q6, #12 // t4a + vrshr.s32 q11, q7, #12 // t5a + vmul_vmls q3, q9, q14, d3[1], d3[0] + vrshr.s32 q14, q2, #12 // t6a + vrshr.s32 q9, q3, #12 // t7a + + vld1.32 {q0}, [r12] + + vqadd.s32 q2, q8, q12 // t0 + vqsub.s32 q3, q8, q12 // t4 + vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vqadd.s32 q4, q15, q11 // t1 + vqsub.s32 q5, q15, q11 // t5 + vqadd.s32 q6, q10, q14 // t2 + vqsub.s32 q7, q10, q14 // t6 + vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + vqadd.s32 q10, q13, q9 // t3 + vqsub.s32 q11, q13, q9 // t7 + +.irp r, q2, q3, q4, q5, q6, q7, q10, q11 + vmin.s32 \r, \r, q12 +.endr +.irp r, q2, q3, q4, q5, q6, q7, q10, q11 + vmax.s32 \r, \r, q14 +.endr + + vmul_vmla q8, q3, q5, d1[1], d1[0] + vmul_vmls q13, q3, q5, d1[0], d1[1] + vmul_vmls q14, q11, q7, d1[1], d1[0] + + vrshr.s32 q3, q8, #12 // t4a + vrshr.s32 q5, q13, #12 // t5a + + vmul_vmla q8, q11, q7, d1[0], d1[1] + + vrshr.s32 q7, q14, #12 // t6a + vrshr.s32 q11, q8, #12 // t7a + + vqadd.s32 \r0, q2, q6 // out0 + vqsub.s32 q2, q2, q6 // t2 + vqadd.s32 \r7, q4, q10 // out7 + vqsub.s32 q4, q4, q10 // t3 + + vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + + vqadd.s32 \r1, q3, q7 // out1 + vqsub.s32 q3, q3, q7 // t6 + vqadd.s32 \r6, q5, q11 // out6 + vqsub.s32 q5, q5, q11 // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
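+ // Only the t2/t3/t6/t7 intermediates (q2, q4, q3, q5), which still feed the
+ // final multiplies, are clamped to the row clip range below.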
+.irp r, q2, q4, q3, q5 + vmin.s32 \r, \r, q12 +.endr +.irp r, q2, q4, q3, q5 + vmax.s32 \r, \r, q10 +.endr + + vqneg.s32 \r7, \r7 // out7 + vqneg.s32 \r1, \r1 // out1 + + vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12) + vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11) + vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10) + vrshr.s32 q2, q10, #12 // out3 + vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13) + vrshr.s32 q3, q12, #12 // out5 + vrshr.s32 \r2, q10, #12 // out2 (q10 or q13) + vrshr.s32 \r4, q6, #12 // out4 (q12 or q11) + + vqneg.s32 \r3, q2 // out3 + vqneg.s32 \r5, q3 // out5 +.endm + +function inv_adst_4s_x8_neon + iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 + bx lr +endfunc + +function inv_flipadst_4s_x8_neon + iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8 + bx lr +endfunc + +function inv_identity_4s_x8_neon + vqshl.s32 q8, q8, #1 + vqshl.s32 q9, q9, #1 + vqshl.s32 q10, q10, #1 + vqshl.s32 q11, q11, #1 + vqshl.s32 q12, q12, #1 + vqshl.s32 q13, q13, #1 + vqshl.s32 q14, q14, #1 + vqshl.s32 q15, q15, #1 + bx lr +endfunc + +function inv_txfm_add_8x8_neon + vmov.i32 q0, #0 + mov r7, #8*4 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q12, #1 + vqrshrn.s32 d18, q9, #1 + vqrshrn.s32 d19, q13, #1 + vqrshrn.s32 d20, q10, #1 + vqrshrn.s32 d21, q14, #1 + vqrshrn.s32 d22, q11, #1 + vqrshrn.s32 d23, q15, #1 + + cmp r3, r10 + transpose_4x8h q8, q9, q10, q11 + + blt 1f + + sub r2, r2, r7, lsl #3 + vpush {q8-q11} + + add r2, r2, #16 + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + blx r4 + + vqrshrn.s32 d31, q15, #1 + vqrshrn.s32 d30, q11, #1 + vqrshrn.s32 d29, q14, #1 + vqrshrn.s32 d28, q10, #1 + vqrshrn.s32 d27, q13, #1 + vqrshrn.s32 d26, q9, #1 + vqrshrn.s32 d25, q12, #1 + vqrshrn.s32 d24, q8, #1 + vpop {q8-q11} + + transpose_4x8h q12, q13, q14, q15 + + b 2f + +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 + vmov.i16 q14, #0 + vmov.i16 q15, #0 + +2: + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,r10,lr} + vpush {q4-q7} + mov r10, #\eob_half + movrel_local r4, inv_\txfm1\()_4s_x8_neon + movrel r5, X(inv_\txfm2\()_8h_x8_neon) + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vdup.32 d4, r12 + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! 
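+ // All 8x4 coefficients are now in q8-q15 and the coefficient buffer has been
+ // zeroed; scale_input below multiplies by 2896/4096 (~1/sqrt(2)) via
+ // vqrdmulh, the extra scaling applied to these 2:1 rectangular transforms.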
+ + scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15 + + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + vqmovn.s32 d20, q12 + vqmovn.s32 d21, q13 + vqmovn.s32 d22, q14 + vqmovn.s32 d23, q15 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +function inv_txfm_add_4x8_neon + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + cmp r3, r10 + mov r7, #32 + blt 1f + + add r2, r2, #16 + vdup.32 d2, r12 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + scale_input d2[0], q8, q9, q10, q11 + sub r2, r2, r7, lsl #2 + + blx r4 + + sub r2, r2, #16 + + vqmovn.s32 d24, q8 + vqmovn.s32 d25, q9 + vqmovn.s32 d26, q10 + vqmovn.s32 d27, q11 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + b 2f + +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 + +2: + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + vdup.32 d2, r12 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + scale_input d2[0], q8, q9, q10, q11 + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + vmov q10, q12 + vmov q11, q13 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,r10,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov r10, #\eob_half +.endif + movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_2s_x16_neon + movrel_local r12, idct_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
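+ // q0/q1 = the idct4+idct8 constants; the even input rows go through the
+ // embedded 8-point DCT first, then the idct16 constants are loaded for the
+ // odd-row butterflies below.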
+ + idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + // idct_8 leaves the row_clip_max/min constants in d9 and d8 +.irp r, d16, d18, d20, d22, d24, d26, d28, d30 + vmin.s32 \r, \r, d9 +.endr +.irp r, d16, d18, d20, d22, d24, d26, d28, d30 + vmax.s32 \r, \r, d8 +.endr + + vld1.32 {q0, q1}, [r12, :128] + sub r12, r12, #32 + + vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a + vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a + vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a + vrshr.s32 d17, d4, #12 // t8a + vrshr.s32 d31, d5, #12 // t15a + vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a + vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a + vrshr.s32 d23, d6, #12 // t9a + vrshr.s32 d25, d4, #12 // t14a + vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a + vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a + vrshr.s32 d21, d5, #12 // t10a + vrshr.s32 d27, d6, #12 // t13a + vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a + vrshr.s32 d19, d4, #12 // t11a + vrshr.s32 d29, d5, #12 // t12a + + vld1.32 {q0}, [r12, :128] + + vqsub.s32 d4, d17, d23 // t9 + vqadd.s32 d17, d17, d23 // t8 + vqsub.s32 d5, d31, d25 // t14 + vqadd.s32 d31, d31, d25 // t15 + vqsub.s32 d23, d19, d21 // t10 + vqadd.s32 d19, d19, d21 // t11 + vqadd.s32 d25, d29, d27 // t12 + vqsub.s32 d29, d29, d27 // t13 + +.irp r, d4, d17, d5, d31, d23, d19, d25, d29 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, d17, d5, d31, d23, d19, d25, d29 + vmax.s32 \r, \r, d8 +.endr + + vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a + vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a + vrshr.s32 d21, d6, #12 // t9a + vrshr.s32 d27, d7, #12 // t14a + + vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a + vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a + vrshr.s32 d29, d6, #12 // t13a + vneg.s32 d7, d7 + vrshr.s32 d23, d7, #12 // t10a + + vqsub.s32 d4, d17, d19 // t11a + vqadd.s32 d17, d17, d19 // t8a + vqsub.s32 d5, d31, d25 // t12a + vqadd.s32 d31, d31, d25 // t15a + vqadd.s32 d19, d21, d23 // t9 + vqsub.s32 d21, d21, d23 // t10 + vqsub.s32 d25, d27, d29 // t13 + vqadd.s32 d27, d27, d29 // t14 + +.irp r, d4, d17, d5, d31, d19, d21, d25, d27 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, d17, d5, d31, d19, d21, d25, d27 + vmax.s32 \r, \r, d8 +.endr + + vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 + vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 + vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a + + vrshr.s32 d6, d6, #12 // t11 + vrshr.s32 d7, d7, #12 // t12 + vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a + vrshr.s32 d4, d4, #12 // t10a + vrshr.s32 d5, d5, #12 // t13a + + vqadd.s32 d8, d16, d31 // out0 + vqsub.s32 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s32 d23, d30, d17 // out7 + vqsub.s32 d9, d30, d17 // out8 + vqadd.s32 d17, d18, d27 // out1 + vqsub.s32 d30, d18, d27 // out14 + vqadd.s32 d18, d20, d5 // out2 + vqsub.s32 d29, d20, d5 // out13 + vqadd.s32 d5, d28, d19 // out6 + vqsub.s32 d25, d28, d19 // out9 + vqadd.s32 d19, d22, d7 // out3 + vqsub.s32 d28, d22, d7 // out12 + vqadd.s32 d20, d24, d6 // out4 + vqsub.s32 d27, d24, d6 // out11 + vqadd.s32 d21, d26, d4 // out5 + vqsub.s32 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
+ + vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0 + vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1 + vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2 + vrshr.s32 d16, d4, #12 // t0 + vrshr.s32 d31, d6, #12 // t1 + vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3 + vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4 + vrshr.s32 d18, d8, #12 // t2 + vrshr.s32 d29, d4, #12 // t3 + vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5 + vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6 + vrshr.s32 d20, d6, #12 // t4 + vrshr.s32 d27, d8, #12 // t5 + vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7 + vld1.32 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8 + vrshr.s32 d22, d4, #12 // t6 + vrshr.s32 d25, d6, #12 // t7 + vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9 + vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10 + vrshr.s32 d23, d8, #12 // t8 + vrshr.s32 d24, d4, #12 // t9 + vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11 + vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12 + vrshr.s32 d21, d6, #12 // t10 + vrshr.s32 d26, d8, #12 // t11 + vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13 + vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14 + vrshr.s32 d19, d4, #12 // t12 + vrshr.s32 d28, d6, #12 // t13 + vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15 + vrshr.s32 d17, d8, #12 // t14 + vrshr.s32 d30, d4, #12 // t15 + + vld1.32 {q0, q1}, [r12, :128] + + vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + + vqsub.s32 d5, d16, d23 // t8a + vqadd.s32 d16, d16, d23 // t0a + vqsub.s32 d7, d31, d24 // t9a + vqadd.s32 d31, d31, d24 // t1a + vqadd.s32 d23, d18, d21 // t2a + vqsub.s32 d18, d18, d21 // t10a + vqadd.s32 d24, d29, d26 // t3a + vqsub.s32 d29, d29, d26 // t11a + vqadd.s32 d21, d20, d19 // t4a + vqsub.s32 d20, d20, d19 // t12a + vqadd.s32 d26, d27, d28 // t5a + vqsub.s32 d27, d27, d28 // t13a + vqadd.s32 d19, d22, d17 // t6a + vqsub.s32 d22, d22, d17 // t14a + vqadd.s32 d28, d25, d30 // t7a + vqsub.s32 d25, d25, d30 // t15a + +.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 + vmin.s32 \r, \r, d11 +.endr +.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 + vmax.s32 \r, \r, d10 +.endr + + vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 + vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 + vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 + vrshr.s32 d17, d4, #12 // t8 + vrshr.s32 d30, d6, #12 // t9 + vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11 + vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12 + vrshr.s32 d18, d8, #12 // t10 + vrshr.s32 d29, d4, #12 // t11 + vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13 + vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14 + vrshr.s32 d27, d6, #12 // t12 + vrshr.s32 d20, d8, #12 // t13 + vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15 + vrshr.s32 d25, d4, #12 // t14 + vrshr.s32 d22, d6, #12 // t15 + + vqsub.s32 d2, d16, d21 // t4 + vqadd.s32 d16, d16, d21 // t0 + vqsub.s32 d3, d31, d26 // t5 + vqadd.s32 d31, d31, d26 // t1 + vqadd.s32 d21, d23, d19 // t2 + vqsub.s32 d23, d23, d19 // t6 + vqadd.s32 d26, d24, d28 // t3 + vqsub.s32 d24, d24, d28 // t7 + vqadd.s32 d19, d17, d27 // t8a + vqsub.s32 d17, d17, d27 // t12a + vqadd.s32 d28, d30, d20 // t9a + vqsub.s32 d30, d30, d20 // t13a + vqadd.s32 d27, d18, d25 // t10a + vqsub.s32 d18, d18, d25 // t14a + vqadd.s32 d20, d29, d22 // t11a + vqsub.s32 d29, d29, d22 // t15a + +.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 + vmin.s32 \r, 
\r, d11 +.endr +.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 + vmax.s32 \r, \r, d10 +.endr + + vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a + vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a + vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a + vrshr.s32 d22, d4, #12 // t4a + vrshr.s32 d25, d6, #12 // t5a + vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a + vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12 + vrshr.s32 d24, d8, #12 // t6a + vrshr.s32 d23, d4, #12 // t7a + vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13 + vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14 + vrshr.s32 d17, d6, #12 // t12 + vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15 + vrshr.s32 d29, d8, #12 // t13 + vrshr.s32 d30, d4, #12 // t14 + vrshr.s32 d18, d6, #12 // t15 + + vqsub.s32 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s32 \o0, d16, d21 // out0 + vqsub.s32 d21, d31, d26 // t3a + vqadd.s32 \o15,d31, d26 // out15 +.else + vqadd.s32 d4, d16, d21 // out0 + vqsub.s32 d21, d31, d26 // t3a + vqadd.s32 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + + vqsub.s32 d3, d29, d18 // t15a + vqadd.s32 \o13,d29, d18 // out13 + vqadd.s32 \o2, d17, d30 // out2 + vqsub.s32 d26, d17, d30 // t14a + + vqadd.s32 \o1, d19, d27 // out1 + vqsub.s32 d27, d19, d27 // t10 + vqadd.s32 \o14,d28, d20 // out14 + vqsub.s32 d20, d28, d20 // t11 + + vqadd.s32 \o3, d22, d24 // out3 + vqsub.s32 d22, d22, d24 // t6 + vqadd.s32 \o12,d25, d23 // out12 + vqsub.s32 d23, d25, d23 // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, d2, d21, d3, d26, d27, d20, d22, d23 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d21, d3, d26, d27, d20, d22, d23 + vmax.s32 \r, \r, d10 +.endr + + vqneg.s32 \o15, \o15 // out15 + vqneg.s32 \o13,\o13 // out13 + vqneg.s32 \o1, \o1 // out1 + vqneg.s32 \o3, \o3 // out3 + + vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vrshr.s32 d24, d24, #12 // out8 + vrshr.s32 d4, d4, #12 // out7 + vrshr.s32 d5, d6, #12 // out5 + vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vrshr.s32 d26, d8, #12 // out10 + + vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vrshr.s32 \o4, d2, #12 // out4 + vrshr.s32 d7, d6, #12 // out9 + vrshr.s32 d6, d8, #12 // out11 + vrshr.s32 \o6, d22, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s32 \o7, d4 // out7 + vqneg.s32 \o5, d5 // out5 + vqneg.s32 \o11,d6 // out11 + vqneg.s32 \o9, d7 // out9 +.endm + +function inv_adst_2s_x16_neon + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_2s_x16_neon + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_2s_x16_neon + mov r12, #0 + movt r12, #2*(5793-4096)*8 + vdup.32 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q1, \i, d0[0] + vqadd.s32 \i, \i, \i + vqadd.s32 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_8x4_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q2, \i, \c + vrshr.s32 q2, q2, #1 + vqadd.s32 \i, \i, q2 +.endr +.endm + +.macro identity_8x4 c +.irp i, q8, q9, q10, q11, q12, 
q13, q14, q15 + vqrdmulh.s32 q2, \i, \c + vqadd.s32 \i, \i, \i + vqadd.s32 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x2_neon + push {lr} + vmov.i32 d7, #0 +.if \scale + mov_const r12, 2896*8*(1<<16) + vdup.32 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + blx r4 + vqrshrn.s32 d16, q8, #\shift + vqrshrn.s32 d17, q9, #\shift + vqrshrn.s32 d18, q10, #\shift + vqrshrn.s32 d19, q11, #\shift + vqrshrn.s32 d20, q12, #\shift + vqrshrn.s32 d21, q13, #\shift + vqrshrn.s32 d22, q14, #\shift + vqrshrn.s32 d23, q15, #\shift + vuzp.16 q8, q9 + vuzp.16 q10, q11 + +.irp i, q8, q10, q9, q11 + vst1.16 {\i}, [r6, :128]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 14 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + bl inv_txfm_horz_16x2_neon +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 3, 10, 21, 36, 55, 78, 105, 256 +endconst + +const eob_16x16_identity + .short 2, 4, 6, 8, 10, 12, 14, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_2s_x16_neon + movrel r5, X(inv_\txfm2\()_4h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + cmp r3, r10 + mov r11, #16 + blt 1f + + add r6, r2, #8 + vmov.i32 d4, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r6, :64] + vst1.32 {d4}, [r6, :64], r11 +.endr + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + vuzp.16 q8, q9 + mov r6, sp + vuzp.16 q10, q11 + vpush {q8-q11} + + b 2f + +1: + vmov.i16 q8, #0 + vmov.i16 q9, #0 + mov r6, sp + vpush {q8-q9} + 
vpush {q8-q9} + +2: + vmov.i32 d4, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r2, :64] + vst1.32 {d4}, [r2, :64], r11 +.endr + + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + vuzp.16 q8, q9 + mov r6, sp + vuzp.16 q10, q11 + + vmov q12, q10 + vmov q13, q11 + + vpop {q10-q11} + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + + vpop {q10-q11} + vmov q8, q12 + vmov q9, q13 + blx r5 + add r6, r0, #16 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_4x16_neon + ldrh r9, [r10, #4] + + mov r11, #64 + cmp r3, r9 + ldrh r9, [r10, #2] + blt 1f + + add r6, r2, #48 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d28, q8, #1 + vqrshrn.s32 d29, q9, #1 + vqrshrn.s32 d30, q10, #1 + vqrshrn.s32 d31, q11, #1 + transpose_4x4h q14, q15, d28, d29, d30, d31 + + b 2f +1: + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + cmp r3, r9 + ldrh r9, [r10] + blt 1f + + add r6, r2, #32 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d24, q8, #1 + vqrshrn.s32 d25, q9, #1 + vqrshrn.s32 d26, q10, #1 + vqrshrn.s32 d27, q11, #1 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + b 2f +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 +2: + cmp r3, r9 + blt 1f + + add r6, r2, #16 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + b 2f +1: + vmov.i16 q8, #0 + vmov.i16 q9, #0 +2: + vmov.i16 q2, #0 + vpush {q8-q9} +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + vpop {q10-q11} + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon + movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_4x16 +.else + movrel_local r10, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_4x16_identity2 +.else + movrel_local r10, eob_4x16 +.endif +.endif +.else + mov r10, #\eob_16x4 + movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon + movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 3 +def_fn_416 \w, \h, identity, identity, 3 +def_fn_416 \w, \h, dct, adst, 3 +def_fn_416 \w, \h, dct, flipadst, 3 +def_fn_416 \w, \h, dct, identity, 2 +def_fn_416 \w, \h, adst, dct, 3 +def_fn_416 \w, \h, adst, adst, 3 +def_fn_416 \w, \h, adst, 
flipadst, 3 +def_fn_416 \w, \h, flipadst, dct, 3 +def_fn_416 \w, \h, flipadst, adst, 3 +def_fn_416 \w, \h, flipadst, flipadst, 3 +def_fn_416 \w, \h, identity, dct, 2 +def_fn_416 \w, \h, adst, identity, 2 +def_fn_416 \w, \h, flipadst, identity, 2 +def_fn_416 \w, \h, identity, adst, 2 +def_fn_416 \w, \h, identity, flipadst, 2 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +function inv_txfm_add_16x8_neon + sub_sp_align 256 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(8 - \i) + cmp r3, r11 + blt 1f +.if \i < 6 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #8*4 + bl inv_txfm_horz_scale_16x2_neon +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i*2) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_8x16_neon + add r10, r10, #2 + sub_sp_align 256 + ldrh r11, [r10], #4 + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*8*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #4 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + + mov_const r12, 2896*8*(1<<16) + vmov.i32 q2, #0 + vdup.32 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\j}, [r7, :128] + vst1.32 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 +.irp j, d16, d20, d17, d21, d18, d22, d19, d23 + vst1.16 {\j}, [r6, :64]! +.endr +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b +3: + +.irp i, 0, 4 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_8x16 + .short 3, 10, 21, 43, 59, 75, 91, 128 +endconst + +const eob_8x16_identity1 + .short 2, 4, 6, 64, 80, 96, 112, 128 +endconst + +const eob_8x16_identity2 + .short 2, 4, 6, 8, 10, 12, 14, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_4s_x8_neon + movrel r5, X(inv_\txfm2\()_4h_x16_neon) +.else + movrel_local r4, inv_\txfm1\()_2s_x16_neon + movrel r5, X(inv_\txfm2\()_8h_x8_neon) +.endif +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_8x16 +.else + movrel_local r10, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_8x16_identity2 +.else + movrel_local r10, eob_8x16 +.endif +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_2s_x16_neon + movrel_local r12, idct_coeffs, 4*16 + vld1.32 {q0, q1}, [r12, :128]! 
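+        // Odd half (t16..t31) of a 32-point inverse DCT over two 32-bit
+        // lanes; uses the second 16 entries of idct_coeffs and clips
+        // intermediates to the row_clip constants set up in d10/d11.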
+ + vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a + vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a + vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a + vrshr.s32 d16, d4, #12 // t16a + vrshr.s32 d31, d6, #12 // t31a + vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a + vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a + vrshr.s32 d24, d8, #12 // t17a + vrshr.s32 d23, d4, #12 // t30a + vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a + vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a + vrshr.s32 d20, d6, #12 // t18a + vrshr.s32 d27, d8, #12 // t29a + vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a + vld1.32 {q0, q1}, [r12, :128] + sub r12, r12, #4*24 + vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a + vrshr.s32 d28, d4, #12 // t19a + vrshr.s32 d19, d6, #12 // t28a + vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a + vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a + vrshr.s32 d18, d8, #12 // t20a + vrshr.s32 d29, d4, #12 // t27a + vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a + vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a + vrshr.s32 d26, d6, #12 // t21a + vrshr.s32 d21, d8, #12 // t26a + vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a + vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a + vrshr.s32 d22, d4, #12 // t22a + vrshr.s32 d25, d6, #12 // t25a + vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a + vrshr.s32 d30, d8, #12 // t23a + vrshr.s32 d17, d4, #12 // t24a + + vld1.32 {q0, q1}, [r12, :128] + + vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + + vqsub.s32 d5, d16, d24 // t17 + vqadd.s32 d16, d16, d24 // t16 + vqsub.s32 d7, d31, d23 // t30 + vqadd.s32 d31, d31, d23 // t31 + vqsub.s32 d24, d28, d20 // t18 + vqadd.s32 d28, d28, d20 // t19 + vqadd.s32 d23, d18, d26 // t20 + vqsub.s32 d18, d18, d26 // t21 + vqsub.s32 d20, d30, d22 // t22 + vqadd.s32 d30, d30, d22 // t23 + vqadd.s32 d26, d17, d25 // t24 + vqsub.s32 d17, d17, d25 // t25 + vqsub.s32 d22, d29, d21 // t26 + vqadd.s32 d29, d29, d21 // t27 + vqadd.s32 d25, d19, d27 // t28 + vqsub.s32 d19, d19, d27 // t29 + +.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 + vmin.s32 \r, \r, d11 +.endr +.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 + vmax.s32 \r, \r, d10 +.endr + + vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a + vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a + vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a + vrshr.s32 d21, d4, #12 // t17a + vrshr.s32 d27, d6, #12 // t30a + vneg.s32 d8, d8 // -> t18a + vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a + vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a + vrshr.s32 d19, d8, #12 // t18a + vrshr.s32 d24, d5, #12 // t29a + vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a + vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a + vrshr.s32 d22, d4, #12 // t21a + vrshr.s32 d18, d6, #12 // t26a + vneg.s32 d8, d8 // -> t22a + vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a + vrshr.s32 d17, d8, #12 // t22a + vrshr.s32 d20, d5, #12 // t25a + + vqsub.s32 d2, d27, d24 // t29 + vqadd.s32 d27, d27, d24 // t30 + vqsub.s32 d3, d21, d19 // t18 + vqadd.s32 d21, d21, d19 // t17 + vqsub.s32 d24, d16, d28 // t19a + vqadd.s32 d16, d16, d28 // t16a + vqsub.s32 d19, d30, d23 // t20a + vqadd.s32 d30, d30, d23 // t23a + vqsub.s32 d28, d17, d22 // t21 + vqadd.s32 d17, d17, d22 // t22 + vqadd.s32 d23, d26, d29 // t24a + vqsub.s32 d26, d26, d29 // t27a + vqadd.s32 d22, d20, d18 // t25 + vqsub.s32 d20, d20, d18 // t26 + vqsub.s32 d29, d31, d25 // t28a + vqadd.s32 
d31, d31, d25 // t31a + +.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 + vmax.s32 \r, \r, d10 +.endr + + vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a + vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a + vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 + vrshr.s32 d18, d4, #12 // t18a + vrshr.s32 d25, d6, #12 // t29a + vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28 + vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20 + vrshr.s32 d29, d8, #12 // t19 + vrshr.s32 d24, d5, #12 // t28 + vneg.s32 d4, d4 // -> t20 + vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27 + vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a + vrshr.s32 d26, d4, #12 // t20 + vrshr.s32 d19, d6, #12 // t27 + vneg.s32 d8, d8 // -> t21a + vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a + vrshr.s32 d20, d8, #12 // t21a + vrshr.s32 d28, d5, #12 // t26a + + vqsub.s32 d2, d16, d30 // t23 + vqadd.s32 d16, d16, d30 // t16 = out16 + vqsub.s32 d3, d31, d23 // t24 + vqadd.s32 d31, d31, d23 // t31 = out31 + vqsub.s32 d23, d21, d17 // t22a + vqadd.s32 d17, d21, d17 // t17a = out17 + vqadd.s32 d30, d27, d22 // t30a = out30 + vqsub.s32 d21, d27, d22 // t25a + vqsub.s32 d27, d18, d20 // t21 + vqadd.s32 d18, d18, d20 // t18 = out18 + vqadd.s32 d4, d29, d26 // t19a = out19 + vqsub.s32 d26, d29, d26 // t20a + vqadd.s32 d29, d25, d28 // t29 = out29 + vqsub.s32 d25, d25, d28 // t26 + vqadd.s32 d28, d24, d19 // t28a = out28 + vqsub.s32 d24, d24, d19 // t27a + vmov d19, d4 // out19 + +.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 + vmax.s32 \r, \r, d10 +.endr + + vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20 + vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 + vrshr.s32 d20, d4, #12 // t20 + vrshr.s32 d22, d6, #12 // t27 + + vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a + vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vrshr.s32 d26, d4, #12 // t26a + + vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22 + vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25 + vrshr.s32 d21, d6, #12 // t21a + vrshr.s32 d22, d24, #12 // t22 + vrshr.s32 d25, d4, #12 // t25 + + vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a + vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a + vrshr.s32 d23, d4, #12 // t23a + vrshr.s32 d24, d6, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x2_neon + push {lr} + vmov.i32 d7, #0 + lsl r8, r8, #1 +.if \scale + mov_const r12, 2896*8*(1<<16) + vdup.32 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_2s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in d9 and d8, + // but here we want to use full q registers for clipping. 
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmin.s32 \r, \r, q3 +.endr +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmax.s32 \r, \r, q2 +.endr + + vtrn.32 d16, d17 + vtrn.32 d18, d19 + vtrn.32 d20, d21 + vtrn.32 d22, d23 + vtrn.32 d24, d25 + vtrn.32 d26, d27 + vtrn.32 d28, d29 + vtrn.32 d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! + vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! +.endm + store1 d16, d18, d20, d22 + store1 d24, d26, d28, d30 + store1 d17, d19, d21, d23 + store1 d25, d27, d29, d31 +.purgem store1 + sub r6, r6, #64*2 + + vmov.i32 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_2s_x16_neon + vtrn.32 d31, d30 + vtrn.32 d29, d28 + vtrn.32 d27, d26 + vtrn.32 d25, d24 + vtrn.32 d23, d22 + vtrn.32 d21, d20 + vtrn.32 d19, d18 + vtrn.32 d17, d16 +.macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift + vld1.32 {q0, q1}, [r6, :128]! + vld1.32 {q2, q3}, [r6, :128] + sub r6, r6, #32 + vqsub.s32 d15, d0, \r0 + vqadd.s32 d0, d0, \r0 + vqsub.s32 d14, d1, \r1 + vqadd.s32 d1, d1, \r1 + vqsub.s32 d13, d2, \r2 + vqadd.s32 d2, d2, \r2 + vqsub.s32 d12, d3, \r3 + vqadd.s32 d3, d3, \r3 + vqsub.s32 d11, d4, \r4 + vqadd.s32 d4, d4, \r4 + vqsub.s32 d10, d5, \r5 + vqadd.s32 d5, d5, \r5 + vqsub.s32 d9, d6, \r6 + vqadd.s32 d6, d6, \r6 + vqsub.s32 d8, d7, \r7 + vqadd.s32 d7, d7, \r7 + vqrshrn.s32 d0, q0, #\shift + vqrshrn.s32 d1, q1, #\shift + vqrshrn.s32 d2, q2, #\shift + vqrshrn.s32 d3, q3, #\shift + vqrshrn.s32 d4, q4, #\shift + vqrshrn.s32 d5, q5, #\shift + vqrshrn.s32 d6, q6, #\shift + vqrshrn.s32 d7, q7, #\shift + vrev32.16 q2, q2 + vrev32.16 q3, q3 + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! 
+.endm + + store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift + store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl X(inv_dct_4h_x16_neon) + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl X(inv_dct32_odd_4h_x16_neon) + + neg r9, r8 + mov r10, r6 + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.16 {d0}, [r10, :64], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.16 {d1}, [r10, :64], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.16 {d2}, [r10, :64], r1 + \op\().s16 d5, d5, \r1 + vld1.16 {d3}, [r10, :64], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vqadd.s16 q0, q0, q2 + \op\().s16 d7, d7, \r3 + vmax.s16 q0, q0, q6 + vrshr.s16 q3, q3, #4 + vmin.s16 q0, q0, q7 + vqadd.s16 q1, q1, q3 + vst1.16 {d0}, [r6, :64], r1 + vmax.s16 q1, q1, q6 + vst1.16 {d1}, [r6, :64], r1 + vmin.s16 q1, q1, q7 + vst1.16 {d2}, [r6, :64], r1 + vst1.16 {d3}, [r6, :64], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop {r10-r11,pc} +endfunc + +const eob_32x32 + .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024 +endconst + +const eob_16x32 + .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512 +endconst + +const eob_16x32_shortside + .short 3, 10, 21, 36, 55, 78, 105, 512 +endconst + +const eob_8x32 + .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q6-q7} + movrel_local r5, eob_32x32, 2 + + mov r6, #4*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 6 +2: + vmov.i32 q0, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r6 +.endr + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q12 + vqmovn.s32 d18, q9 + vqmovn.s32 d19, q13 + vqmovn.s32 d20, q10 + vqmovn.s32 d21, q14 + vqmovn.s32 d22, q11 + vqmovn.s32 d23, q15 + transpose_4x8h q8, q9, q10, q11 + + load_add_store_8x4 r0, r7, shiftbits=2 + ldrh lr, [r4], #8 + sub r0, r0, r1, lsl #2 + cmp r3, lr + add r0, r0, #2*8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12, lsl #1 + add r0, r0, r1, lsl #2 + mls r2, r6, r12, r2 + add r2, r2, #4*4 + b 1b +9: + vpop {q6-q7} + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function 
inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + push {r4-r9,lr} + vpush {q6-q7} + mov r9, #0 + mov_const r8, 2896*8*(1<<16) + movt r9, #2*(5793-4096)*8 + movrel_local r5, eob_16x32\hshort, 2 + + mov r6, #4*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 6 +2: + vdup.i32 d0, r8 + vmov.i32 q1, #0 + vmov.32 d0[1], r9 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x4_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s32, 1 + identity_8x4 d0[1] +.endif + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q12 + vqmovn.s32 d18, q9 + vqmovn.s32 d19, q13 + vqmovn.s32 d20, q10 + vqmovn.s32 d21, q14 + vqmovn.s32 d22, q11 + vqmovn.s32 d23, q15 + transpose_4x8h q8, q9, q10, q11 + +.if \w == 16 + load_add_store_8x4 r0, r7, shiftbits=2 +.else + load_add_store_8x4 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #8 + sub r0, r0, r1, lsl #2 + cmp r3, lr + add r0, r0, #2*8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12, lsl #1 + add r0, r0, r1, lsl #2 + mls r2, r6, r12, r2 + add r2, r2, #4*4 + b 1b +9: + vpop {q6-q7} + pop {r4-r9,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q6-q7} + movrel_local r4, eob_8x32, 2 + + mov r12, #4*\h +1: + ldrh lr, [r4], #4 +.if \w == 8 + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r12 +.endr + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q12, #1 + vqrshrn.s32 d18, q9, #1 + vqrshrn.s32 d19, q13, #1 + vqrshrn.s32 d20, q10, #1 + vqrshrn.s32 d21, q14, #1 + vqrshrn.s32 d22, q11, #1 + vqrshrn.s32 d23, q15, #1 + + transpose_4x8h q8, q9, q10, q11 + + cmp r3, lr + load_add_store_8x4 r0, r5, shiftbits=2 + blt 9f + sub r2, r2, r12, lsl #3 + add r2, r2, #4*4 +.else + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q10, q11}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q12, q13}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q14, q15}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q10 + vqmovn.s32 d20, q9 + vqmovn.s32 d21, q11 + vqmovn.s32 d18, q12 + vqmovn.s32 d19, q14 + vqmovn.s32 d22, q13 + vqmovn.s32 d23, q15 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + + cmp r3, lr + load_add_store_4x8 r0, r5, shiftbits=3 + blt 9f + sub r0, r0, r1, lsl #3 + add r0, r0, #2*4 +.endif + b 1b + +9: + vpop {q6-q7} + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_2s_x16_neon + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*4) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #4*32 + bl inv_txfm_horz_scale_16x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel r5, X(inv_dct_4h_x16_neon) + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*4) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 14 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #4*16 + bl inv_txfm_horz_scale_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32, 2 + + mov r8, #4*32 + mov r9, #32 + mov r6, sp +1: + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #4 + sub r2, r2, r8, lsl #3 + sub r9, r9, #4 + add r2, r2, #4*4 + + bl inv_dct_4s_x8_neon + + vqrshrn.s32 d16, q8, #2 + vqrshrn.s32 d18, q9, #2 + vqrshrn.s32 d20, q10, #2 + vqrshrn.s32 d22, q11, #2 + vqrshrn.s32 d17, q12, #2 + vqrshrn.s32 d19, q13, #2 + vqrshrn.s32 d21, q14, #2 + vqrshrn.s32 d23, q15, #2 + + transpose_4x8h q8, q9, q10, q11 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #4 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + movrel_local r10, eob_8x32 + sub_sp_align 512 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*4) +.if \i > 0 + cmp r3, r11 + mov r8, #(8 - \i) + blt 1f +.if \i < 6 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #8*4 + bl inv_txfm_horz_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9, lsl #1 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.32 {q0, q1}, [r12, :128]! + + vqrdmulh.s32 d23, d16, d0[1] // t63a + vqrdmulh.s32 d16, d16, d0[0] // t32a + vqrdmulh.s32 d22, d17, d1[0] // t62a + vqrdmulh.s32 d17, d17, d1[1] // t33a + vqrdmulh.s32 d21, d18, d2[1] // t61a + vqrdmulh.s32 d18, d18, d2[0] // t34a + vqrdmulh.s32 d20, d19, d3[0] // t60a + vqrdmulh.s32 d19, d19, d3[1] // t35a + + vld1.32 {q0}, [r12, :128]! + + vqadd.s32 d24, d16, d17 // t32 + vqsub.s32 d25, d16, d17 // t33 + vqsub.s32 d26, d19, d18 // t34 + vqadd.s32 d27, d19, d18 // t35 + vqadd.s32 d28, d20, d21 // t60 + vqsub.s32 d29, d20, d21 // t61 + vqsub.s32 d30, d23, d22 // t62 + vqadd.s32 d31, d23, d22 // t63 + +.irp r, q12, q13, q14, q15 + vmin.s32 \r, \r, q5 +.endr +.irp r, q12, q13, q14, q15 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a + vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a + vneg.s32 d4, d4 // t34a + vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a + vrshr.s32 d26, d4, #12 // t34a + vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a + vrshr.s32 d29, d6, #12 // t61a + vrshr.s32 d25, d7, #12 // t33a + vrshr.s32 d30, d4, #12 // t62a + + vqadd.s32 d16, d24, d27 // t32a + vqsub.s32 d19, d24, d27 // t35a + vqadd.s32 d17, d25, d26 // t33 + vqsub.s32 d18, d25, d26 // t34 + vqsub.s32 d20, d31, d28 // t60a + vqadd.s32 d23, d31, d28 // t63a + vqsub.s32 d21, d30, d29 // t61 + vqadd.s32 d22, d30, d29 // t62 + +.irp r, q8, q9, q10, q11 + vmin.s32 \r, \r, q5 +.endr +.irp r, q8, q9, q10, q11 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a + vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a + vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60 + vrshr.s32 d21, d4, #12 // t61a + vrshr.s32 d18, d6, #12 // t34a + vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 + vrshr.s32 d20, d7, #12 // t60 + vrshr.s32 d19, d4, #12 // t35 + + vst1.32 {d16, d17, d18, d19}, [r6, :128]! + vst1.32 {d20, d21, d22, d23}, [r6, :128]! 
+ + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.32 {q0}, [r12, :128] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #4*2*0] // t32a + vldr d17, [r9, #4*2*8] // t39a + vldr d18, [r9, #4*2*0] // t63a + vldr d19, [r6, #4*2*8] // t56a + vldr d20, [r6, #4*2*16] // t40a + vldr d21, [r9, #4*2*24] // t47a + vldr d22, [r9, #4*2*16] // t55a + vldr d23, [r6, #4*2*24] // t48a + + vqadd.s32 d24, d16, d17 // t32 + vqsub.s32 d25, d16, d17 // t39 + vqadd.s32 d26, d18, d19 // t63 + vqsub.s32 d27, d18, d19 // t56 + vqsub.s32 d28, d21, d20 // t40 + vqadd.s32 d29, d21, d20 // t47 + vqadd.s32 d30, d23, d22 // t48 + vqsub.s32 d31, d23, d22 // t55 + +.irp r, q12, q13, q14, q15 + vmin.s32 \r, \r, q5 +.endr +.irp r, q12, q13, q14, q15 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a + vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a + vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a + vrshr.s32 d25, d4, #12 // t56a + vrshr.s32 d27, d6, #12 // t39a + vneg.s32 d7, d7 // t40a + vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a + vrshr.s32 d31, d7, #12 // t40a + vrshr.s32 d28, d4, #12 // t55a + + vqadd.s32 d16, d24, d29 // t32a + vqsub.s32 d19, d24, d29 // t47a + vqadd.s32 d17, d27, d31 // t39 + vqsub.s32 d18, d27, d31 // t40 + vqsub.s32 d20, d26, d30 // t48a + vqadd.s32 d23, d26, d30 // t63a + vqsub.s32 d21, d25, d28 // t55 + vqadd.s32 d22, d25, d28 // t56 + +.irp r, q8, q9, q10, q11 + vmin.s32 \r, \r, q5 +.endr +.irp r, q8, q9, q10, q11 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a + vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a + vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47 + vrshr.s32 d18, d4, #12 // t40a + vrshr.s32 d21, d6, #12 // t55a + vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 + vrshr.s32 d19, d7, #12 // t47 + vrshr.s32 d20, d4, #12 // t48 + + vstr d16, [r6, #4*2*0] // t32a + vstr d17, [r9, #4*2*0] // t39 + vstr d18, [r6, #4*2*8] // t40a + vstr d19, [r9, #4*2*8] // t47 + vstr d20, [r6, #4*2*16] // t48 + vstr d21, [r9, #4*2*16] // t55a + vstr d22, [r6, #4*2*24] // t56 + vstr d23, [r9, #4*2*24] // t63a + + add r6, r6, #4*2 + sub r9, r9, #4*2 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.32 {\i}, [\src, :64] + vst1.32 {\zero}, [\src, :64], \strd +.else + vld1.32 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.32 {q8, q9}, [\dst, :128]! + vst1.32 {q10, q11}, [\dst, :128]! + vst1.32 {q12, q13}, [\dst, :128]! + vst1.32 {q14, q15}, [\dst, :128]! 
+.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i32 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i32 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + mov_const \gpr, \val + vdup.32 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.32 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_2s_x64_neon + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_2s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in d9 and d8, + // but here we want to use full q registers for clipping. + vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmin.s32 \r, \r, q3 +.endr +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmax.s32 \r, \r, q2 +.endr + + store16 r6 + + movdup_if d0, r12, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_2s_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + mov r9, #-8 + + vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.macro store_addsub r0, r1, r2, r3 + vld1.32 {d2}, [r6, :64]! + vld1.32 {d3}, [r6, :64]! + vqadd.s32 d6, d2, \r0 + vqsub.s32 \r0, d2, \r0 + vld1.32 {d4}, [r6, :64]! + vqadd.s32 d7, d3, \r1 + vqsub.s32 \r1, d3, \r1 + vmin.s32 d6, d6, d1 + vmin.s32 \r0, \r0, d1 + vld1.32 {d5}, [r6, :64]! + vqadd.s32 d2, d4, \r2 + sub r6, r6, #8*4 + vmax.s32 d6, d6, d0 + vmax.s32 \r0, \r0, d0 + vqsub.s32 \r2, d4, \r2 + vmin.s32 d7, d7, d1 + vmin.s32 \r1, \r1, d1 + vst1.32 {d6}, [r6, :64]! + vst1.32 {\r0}, [r10, :64], r9 + vmin.s32 d2, d2, d1 + vmin.s32 \r2, \r2, d1 + vmax.s32 d7, d7, d0 + vmax.s32 \r1, \r1, d0 + vqadd.s32 d3, d5, \r3 + vqsub.s32 \r3, d5, \r3 + vmax.s32 d2, d2, d0 + vmax.s32 \r2, \r2, d0 + vmin.s32 d3, d3, d1 + vmin.s32 \r3, \r3, d1 + vst1.32 {d7}, [r6, :64]! + vst1.32 {\r1}, [r10, :64], r9 + vmax.s32 d3, d3, d0 + vmax.s32 \r3, \r3, d0 + vst1.32 {d2}, [r6, :64]! + vst1.32 {\r2}, [r10, :64], r9 + vst1.32 {d3}, [r6, :64]! 
+ vst1.32 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.32 {d16}, [r7, :64] // in1 (offset 0) + vld1.32 {d17}, [r9, :64] // in31 (offset 15) + vld1.32 {d18}, [r10, :64] // in17 (offset 8) + vld1.32 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.32 {d16}, [r10, :64] // in7 (offset 3) + vld1.32 {d17}, [r11, :64] // in25 (offset 12) + vld1.32 {d18}, [r9, :64] // in23 (offset 11) + vld1.32 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.32 d16, [r10, :64] // in5 (offset 2) + vld1.32 d17, [r11, :64] // in27 (offset 13) + vld1.32 d18, [r9, :64] // in21 (offset 10) + vld1.32 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.32 d16, [r10, :64] // in3 (offset 1) + vld1.32 d17, [r11, :64] // in29 (offset 14) + vld1.32 d18, [r9, :64] // in19 (offset 9) + vld1.32 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x2_neon + vdup.32 q4, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.32 {d16, d17, d18, d19}, [r7, :128]! + vld1.32 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.32 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.32 {d24, d25, d26, d27}, [r8, :128], r11 + vtrn.32 d16, d17 + vtrn.32 d18, d19 + vtrn.32 d20, d21 + vtrn.32 d22, d23 + vtrn.32 d31, d30 + vtrn.32 d29, d28 + vtrn.32 d27, d26 + vtrn.32 d25, d24 + +.macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7 + vqsub.s32 d7, \src0, \src1 + vqsub.s32 d6, \src2, \src3 + vqsub.s32 d5, \src4, \src5 + vqsub.s32 d4, \src6, \src7 + vqadd.s32 d0, \src0, \src1 + vqadd.s32 d1, \src2, \src3 + vqadd.s32 d2, \src4, \src5 + vqadd.s32 d3, \src6, \src7 + vrshl.s32 q3, q3, q4 + vrshl.s32 q2, q2, q4 + vrshl.s32 q0, q0, q4 + vrshl.s32 q1, q1, q4 + vqmovn.s32 d7, q3 + vqmovn.s32 d6, q2 + vqmovn.s32 d0, q0 + vqmovn.s32 d1, q1 + vrev32.16 q3, q3 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q3}, [r9, :128], r10 +.endm + store_addsub d16, d31, d18, d29, d20, d27, d22, d25 + store_addsub d17, d30, d19, d28, d21, d26, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #1 + sub r9, r9, r10, lsl #1 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! + vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + vld1.16 {d0}, [r6, :64], r1 + vld1.16 {d1}, [r9, :64], r10 + vqadd.s16 d4, \src0, \src1 + vld1.16 {d2}, [r6, :64] + vqsub.s16 d5, \src0, \src1 + vld1.16 {d3}, [r9, :64] + vqadd.s16 d6, \src2, \src3 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vqadd.s16 q2, q2, q0 + vqadd.s16 q3, q3, q1 + vmax.s16 q2, q2, q6 + vmax.s16 q3, q3, q6 + vmin.s16 q2, q2, q7 + vmin.s16 q3, q3, q7 + vst1.16 {d4}, [r6, :64], r1 + vst1.16 {d5}, [r9, :64], r10 + vst1.16 {d6}, [r6, :64], r1 + vst1.16 {d7}, [r9, :64], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_dct_clear_2s_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_dct_clear_scale_2s_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i*2) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_scale_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + bl inv_txfm_dct_clear_2s_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 8 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: + movrel r5, X(inv_dct_4h_x16_neon) +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i*2) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_2s_x16_neon +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_16x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/loopfilter.S b/third_party/dav1d/src/arm/32/loopfilter.S new file mode 100644 index 0000000000..97b960534f --- /dev/null +++ b/third_party/dav1d/src/arm/32/loopfilter.S @@ -0,0 +1,868 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_8_wd\wd\()_neon + vabd.u8 d0, d22, d23 // abs(p1 - p0) + vabd.u8 d1, d25, d24 // abs(q1 - q0) + vabd.u8 d2, d23, d24 // abs(p0 - q0) + vabd.u8 d3, d22, d25 // abs(p1 - q1) +.if \wd >= 6 + vabd.u8 d4, d21, d22 // abs(p2 - p1) + vabd.u8 d5, d26, d25 // abs(q2 - q1) +.endif +.if \wd >= 8 + vabd.u8 d6, d20, d21 // abs(p3 - p2) + vabd.u8 d7, d27, d26 // abs(q3 - q3) +.endif +.if \wd >= 6 + vmax.u8 d4, d4, d5 +.endif + vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2 +.if \wd >= 8 + vmax.u8 d6, d6, d7 +.endif + vshr.u8 d3, d3, #1 +.if \wd >= 8 + vmax.u8 d4, d4, d6 +.endif +.if \wd >= 6 + vand d4, d4, d14 +.endif + vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) + vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + vmax.u8 d4, d0, d4 + vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + vand d1, d1, d2 // fm + vand d1, d1, d13 // fm && wd >= 4 +.if \wd >= 6 + vand d14, d14, d1 // fm && wd > 4 +.endif +.if \wd >= 16 + vand d15, d15, d1 // fm && wd == 16 +.endif + + vmov r10, r11, d1 + orrs r10, r10, r11 + beq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + vmov.i8 d10, #1 + vabd.u8 d2, d21, d23 // abs(p2 - p0) + vabd.u8 d3, d22, d23 // abs(p1 - p0) + vabd.u8 d4, d25, d24 // abs(q1 - q0) + vabd.u8 d5, d26, d24 // abs(q2 - q0) +.if \wd >= 8 + vabd.u8 d6, d20, d23 // abs(p3 - p0) + vabd.u8 d7, d27, d24 // abs(q3 - q0) +.endif + vmax.u8 d2, d2, d3 + vmax.u8 d4, d4, d5 +.if \wd >= 8 + vmax.u8 d6, d6, d7 +.endif + vmax.u8 d2, d2, d4 +.if \wd >= 8 + vmax.u8 d2, d2, d6 +.endif + +.if \wd == 16 + vabd.u8 d3, d17, d23 // abs(p6 - p0) + vabd.u8 d4, d18, d23 // abs(p5 - p0) + vabd.u8 d5, d19, d23 // abs(p4 - p0) +.endif + vcge.u8 d2, d10, d2 // flat8in +.if \wd == 16 + vabd.u8 d6, d28, d24 // abs(q4 - q0) + vabd.u8 d7, d29, d24 // abs(q5 - q0) + vabd.u8 d8, d30, d24 // abs(q6 - q0) +.endif + vand d14, d2, d14 // flat8in && fm && wd > 4 + vbic d1, d1, d14 // fm && wd >= 4 && !flat8in +.if \wd == 16 + vmax.u8 d3, d3, d4 + vmax.u8 d5, d5, d6 +.endif + vmov r10, r11, d1 +.if \wd == 16 + vmax.u8 d7, d7, d8 + vmax.u8 d3, d3, d5 + vmax.u8 d3, d3, d7 + vcge.u8 d3, d10, d3 // flat8out +.endif + orrs r10, r10, r11 +.if \wd == 16 + vand d15, d15, d3 // flat8out && fm && wd == 16 + vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 + vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out +.endif + beq 1f // skip wd == 4 case +.endif + + vsubl.u8 q1, d22, d25 // p1 - q1 + vcgt.u8 d0, d0, d12 // hev + vqmovn.s16 d2, q1 + vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) + vbic d0, d1, d0 // (fm && wd >= 4 && !hev) + vsubl.u8 q1, d24, d23 + vmov.i16 q3, #3 + vmul.i16 q1, q1, q3 + vmov.i8 d6, #4 + vaddw.s8 q1, q1, d4 + vmov.i8 d7, #3 + vqmovn.s16 d2, q1 // f + vqadd.s8 d4, d6, d2 // imin(f + 4, 127) + vqadd.s8 d5, d7, d2 // imin(f + 3, 127) + vshr.s8 d4, d4, #3 // f1 + vshr.s8 d5, d5, #3 // f2 + vmovl.u8 q1, d23 // p0 + vmovl.u8 q3, d24 // q0 + vaddw.s8 q1, q1, d5 + vsubw.s8 q3, q3, d4 + vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1 + vqmovun.s16 d2, q1 // out p0 + vqmovun.s16 d6, q3 // out q0 + vbit d23, d2, d1 // if (fm && wd >= 4) + vmovl.u8 q1, d22 // p1 + vbit d24, d6, d1 // if (fm && wd >= 4) + vmovl.u8 q3, d25 // q1 + vaddw.s8 q1, q1, d4 + vsubw.s8 q3, q3, d4 + vqmovun.s16 d2, q1 // out p1 + vqmovun.s16 d6, q3 // out q1 + vbit d22, 
d2, d0 // if (fm && wd >= 4 && !hev) + vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 2f // skip if there's no flat8in + + vaddl.u8 q0, d21, d21 // p2 * 2 + vaddl.u8 q1, d21, d22 // p2 + p1 + vaddl.u8 q2, d22, d23 // p1 + p0 + vaddl.u8 q3, d23, d24 // p0 + q0 + vadd.i16 q4, q0, q1 + vadd.i16 q5, q2, q3 + vaddl.u8 q6, d24, d25 // q0 + q1 + vadd.i16 q4, q4, q5 + vsub.i16 q6, q6, q0 + vaddl.u8 q5, d25, d26 // q1 + q2 + vrshrn.i16 d0, q4, #3 // out p1 + + vadd.i16 q4, q4, q6 + vsub.i16 q5, q5, q1 + vaddl.u8 q6, d26, d26 // q2 + q2 + vrshrn.i16 d1, q4, #3 // out p0 + + vadd.i16 q4, q4, q5 + vsub.i16 q6, q6, q2 + vrshrn.i16 d2, q4, #3 // out q0 + + vbit d22, d0, d14 // p1 if (flat8in) + vadd.i16 q4, q4, q6 + vbit d23, d1, d14 // p0 if (flat8in) + vrshrn.i16 d3, q4, #3 // out q1 + vbit d24, d2, d14 // q0 if (flat8in) + vbit d25, d3, d14 // q1 if (flat8in) +.elseif \wd >= 8 + vmov r10, r11, d14 + orrs r10, r10, r11 +.if \wd == 8 + beq 8f // skip if there's no flat8in +.else + beq 2f // skip if there's no flat8in +.endif + + vaddl.u8 q0, d20, d21 // p3 + p2 + vaddl.u8 q1, d22, d25 // p1 + q1 + vaddl.u8 q2, d20, d22 // p3 + p1 + vaddl.u8 q3, d23, d26 // p0 + q2 + vadd.i16 q4, q0, q0 // 2 * (p3 + p2) + vaddw.u8 q4, q4, d23 // + p0 + vaddw.u8 q4, q4, d24 // + q0 + vadd.i16 q4, q4, q2 // + p3 + p1 + vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2 + vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1 + vrshrn.i16 d10, q4, #3 // out p2 + + vadd.i16 q4, q4, q1 + vaddl.u8 q0, d20, d23 // p3 + p0 + vaddl.u8 q1, d24, d27 // q0 + q3 + vrshrn.i16 d11, q4, #3 // out p1 + + vadd.i16 q4, q4, q3 + vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0 + vaddl.u8 q2, d21, d24 // p2 + q0 + vaddl.u8 q3, d25, d27 // q1 + q3 + vrshrn.i16 d12, q4, #3 // out p0 + + vadd.i16 q4, q4, q1 + vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0 + vaddl.u8 q0, d22, d25 // p1 + q1 + vaddl.u8 q1, d26, d27 // q2 + q3 + vrshrn.i16 d13, q4, #3 // out q0 + + vadd.i16 q4, q4, q3 + vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1 + vrshrn.i16 d0, q4, #3 // out q1 + + vadd.i16 q4, q4, q1 + + vbit d21, d10, d14 + vbit d22, d11, d14 + vbit d23, d12, d14 + vrshrn.i16 d1, q4, #3 // out q2 + vbit d24, d13, d14 + vbit d25, d0, d14 + vbit d26, d1, d14 +.endif +2: +.if \wd == 16 + vmov r10, r11, d15 + orrs r10, r10, r11 + bne 1f // check if flat8out is needed + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + vaddl.u8 q1, d17, d17 // p6 + p6 + vaddl.u8 q2, d17, d18 // p6 + p5 + vaddl.u8 q3, d17, d19 // p6 + p4 + vaddl.u8 q4, d17, d20 // p6 + p3 + vadd.i16 q6, q1, q2 + vadd.i16 q5, q3, q4 + vaddl.u8 q3, d17, d21 // p6 + p2 + vadd.i16 q6, q6, q5 + vaddl.u8 q4, d17, d22 // p6 + p1 + vaddl.u8 q5, d18, d23 // p5 + p0 + vadd.i16 q3, q3, q4 + vaddl.u8 q4, d19, d24 // p4 + q0 + vadd.i16 q6, q6, q3 + vadd.i16 q5, q5, q4 + vaddl.u8 q3, d20, d25 // p3 + q1 + vadd.i16 q6, q6, q5 + vsub.i16 q3, q3, q1 + vaddl.u8 q1, d21, d26 // p2 + q2 + vrshrn.i16 d0, q6, #4 // out p5 + vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1) + vsub.i16 q1, q1, q2 + vaddl.u8 q2, d22, d27 // p1 + q3 + vaddl.u8 q3, d17, d19 // p6 + p4 + vrshrn.i16 d1, q6, #4 // out p4 + vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2) + vsub.i16 q2, q2, q3 + vaddl.u8 q3, d23, d28 // p0 + q4 + vaddl.u8 q4, d17, d20 // p6 + p3 + vrshrn.i16 d2, q6, #4 // out p3 + vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3) + vsub.i16 q3, q3, q4 + vaddl.u8 q4, d24, d29 // q0 + q5 + 
vaddl.u8 q2, d17, d21 // p6 + p2 + vrshrn.i16 d3, q6, #4 // out p2 + vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4) + vsub.i16 q4, q4, q2 + vaddl.u8 q3, d25, d30 // q1 + q6 + vaddl.u8 q5, d17, d22 // p6 + p1 + vrshrn.i16 d4, q6, #4 // out p1 + vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5) + vsub.i16 q3, q3, q5 + vaddl.u8 q4, d26, d30 // q2 + q6 + vbif d0, d18, d15 // out p5 + vaddl.u8 q5, d18, d23 // p5 + p0 + vrshrn.i16 d5, q6, #4 // out p0 + vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6) + vsub.i16 q4, q4, q5 + vaddl.u8 q5, d27, d30 // q3 + q6 + vbif d1, d19, d15 // out p4 + vaddl.u8 q9, d19, d24 // p4 + q0 + vrshrn.i16 d6, q6, #4 // out q0 + vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6) + vsub.i16 q5, q5, q9 + vaddl.u8 q4, d28, d30 // q4 + q6 + vbif d2, d20, d15 // out p3 + vaddl.u8 q9, d20, d25 // p3 + q1 + vrshrn.i16 d7, q6, #4 // out q1 + vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6) + vsub.i16 q9, q4, q9 + vaddl.u8 q5, d29, d30 // q5 + q6 + vbif d3, d21, d15 // out p2 + vaddl.u8 q10, d21, d26 // p2 + q2 + vrshrn.i16 d8, q6, #4 // out q2 + vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6) + vsub.i16 q5, q5, q10 + vaddl.u8 q9, d30, d30 // q6 + q6 + vbif d4, d22, d15 // out p1 + vaddl.u8 q10, d22, d27 // p1 + q3 + vrshrn.i16 d9, q6, #4 // out q3 + vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6) + vsub.i16 q9, q9, q10 + vbif d5, d23, d15 // out p0 + vrshrn.i16 d10, q6, #4 // out q4 + vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6) + vrshrn.i16 d11, q6, #4 // out q5 + vbif d6, d24, d15 // out q0 + vbif d7, d25, d15 // out q1 + vbif d8, d26, d15 // out q2 + vbif d9, d27, d15 // out q3 + vbif d10, d28, d15 // out q4 + vbif d11, d29, d15 // out q5 +.endif + + bx lr +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + bx r8 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + bx r9 +.endif +9: + // Return directly without writing back any pixels + bx r12 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_8_wd16 + adr r8, 7f + CONFIG_THUMB + adr r9, 8f + CONFIG_THUMB + bl lpf_8_wd16_neon +.endm + +.macro lpf_8_wd8 + adr r9, 8f + CONFIG_THUMB + bl lpf_8_wd8_neon +.endm + +.macro lpf_8_wd6 + bl lpf_8_wd6_neon +.endm + +.macro lpf_8_wd4 + bl lpf_8_wd4_neon +.endm + +function lpf_v_4_8_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + + lpf_8_wd4 + + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_4_8_neon + mov r12, lr + sub r10, r0, #2 + add r0, r10, r1, lsl #2 + vld1.32 {d22[0]}, [r10], r1 + vld1.32 {d22[1]}, [r0], r1 + vld1.32 {d23[0]}, [r10], r1 + vld1.32 {d23[1]}, [r0], r1 + vld1.32 {d24[0]}, [r10], r1 + vld1.32 {d24[1]}, [r0], r1 + vld1.32 {d25[0]}, [r10], r1 + vld1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + + transpose_4x8b q11, q12, d22, d23, d24, d25 + + lpf_8_wd4 + + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, 
#2 + bx r12 +endfunc + +function lpf_v_6_8_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vld1.8 {d21}, [r10, :64], r1 // p2 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d25}, [r0, :64], r1 // q1 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + + lpf_8_wd6 + + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_6_8_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #2 + vld1.8 {d20}, [r10], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d21}, [r10], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d22}, [r10], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d23}, [r10], r1 + vld1.8 {d27}, [r0], r1 + add r0, r0, #4 + + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + + lpf_8_wd6 + + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +function lpf_v_8_8_neon + mov r12, lr + sub r10, r0, r1, lsl #2 + vld1.8 {d20}, [r10, :64], r1 // p3 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d21}, [r10, :64], r1 // p2 + vld1.8 {d25}, [r0, :64], r1 // q1 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d26}, [r0, :64], r1 // q2 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d27}, [r0, :64], r1 // q3 + sub r0, r0, r1, lsl #2 + + lpf_8_wd8 + + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vst1.8 {d21}, [r10, :64], r1 // p2 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d25}, [r0, :64], r1 // q1 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_8_8_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #2 + vld1.8 {d20}, [r10], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d21}, [r10], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d22}, [r10], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d23}, [r10], r1 + vld1.8 {d27}, [r0], r1 + add r0, r0, #4 + + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + + lpf_8_wd8 + + sub r10, r0, r1, lsl #3 + sub r10, r10, #4 + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + add r0, r10, r1, lsl #2 + + vst1.8 {d20}, [r10], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d21}, [r10], r1 + vst1.8 {d25}, [r0], r1 + vst1.8 {d22}, [r10], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d23}, [r10], r1 + vst1.8 {d27}, [r0], r1 + add r0, r0, #4 + bx r12 +8: + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +function lpf_v_16_8_neon + mov r12, lr + + sub r10, 
r0, r1, lsl #3 + add r10, r10, r1 + vld1.8 {d17}, [r10, :64], r1 // p6 + vld1.8 {d24}, [r0, :64], r1 // q0 + vld1.8 {d18}, [r10, :64], r1 // p5 + vld1.8 {d25}, [r0, :64], r1 // q1 + vld1.8 {d19}, [r10, :64], r1 // p4 + vld1.8 {d26}, [r0, :64], r1 // q2 + vld1.8 {d20}, [r10, :64], r1 // p3 + vld1.8 {d27}, [r0, :64], r1 // q3 + vld1.8 {d21}, [r10, :64], r1 // p2 + vld1.8 {d28}, [r0, :64], r1 // q4 + vld1.8 {d22}, [r10, :64], r1 // p1 + vld1.8 {d29}, [r0, :64], r1 // q5 + vld1.8 {d23}, [r10, :64], r1 // p0 + vld1.8 {d30}, [r0, :64], r1 // q6 + sub r0, r0, r1, lsl #3 + add r0, r0, r1 + + lpf_8_wd16 + + sub r10, r0, r1, lsl #2 + sub r10, r10, r1, lsl #1 + vst1.8 {d0}, [r10, :64], r1 // p5 + vst1.8 {d6}, [r0, :64], r1 // q0 + vst1.8 {d1}, [r10, :64], r1 // p4 + vst1.8 {d7}, [r0, :64], r1 // q1 + vst1.8 {d2}, [r10, :64], r1 // p3 + vst1.8 {d8}, [r0, :64], r1 // q2 + vst1.8 {d3}, [r10, :64], r1 // p2 + vst1.8 {d9}, [r0, :64], r1 // q3 + vst1.8 {d4}, [r10, :64], r1 // p1 + vst1.8 {d10}, [r0, :64], r1 // q4 + vst1.8 {d5}, [r10, :64], r1 // p0 + vst1.8 {d11}, [r0, :64], r1 // q5 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + bx r12 +7: + sub r10, r0, r1 + sub r10, r10, r1, lsl #1 + vst1.8 {d21}, [r10, :64], r1 // p2 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d25}, [r0, :64], r1 // q1 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.8 {d22}, [r10, :64], r1 // p1 + vst1.8 {d24}, [r0, :64], r1 // q0 + vst1.8 {d23}, [r10, :64], r1 // p0 + vst1.8 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_16_8_neon + mov r12, lr + sub r10, r0, #8 + vld1.8 {d16}, [r10, :64], r1 + vld1.8 {d24}, [r0, :64], r1 + vld1.8 {d17}, [r10, :64], r1 + vld1.8 {d25}, [r0, :64], r1 + vld1.8 {d18}, [r10, :64], r1 + vld1.8 {d26}, [r0, :64], r1 + vld1.8 {d19}, [r10, :64], r1 + vld1.8 {d27}, [r0, :64], r1 + vld1.8 {d20}, [r10, :64], r1 + vld1.8 {d28}, [r0, :64], r1 + vld1.8 {d21}, [r10, :64], r1 + vld1.8 {d29}, [r0, :64], r1 + vld1.8 {d22}, [r10, :64], r1 + vld1.8 {d30}, [r0, :64], r1 + vld1.8 {d23}, [r10, :64], r1 + vld1.8 {d31}, [r0, :64], r1 + + transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31 + + lpf_8_wd16 + + sub r0, r0, r1, lsl #3 + sub r10, r0, #8 + + transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5 + transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31 + + vst1.8 {d16}, [r10, :64], r1 + vst1.8 {d6}, [r0, :64], r1 + vst1.8 {d17}, [r10, :64], r1 + vst1.8 {d7}, [r0, :64], r1 + vst1.8 {d0}, [r10, :64], r1 + vst1.8 {d8}, [r0, :64], r1 + vst1.8 {d1}, [r10, :64], r1 + vst1.8 {d9}, [r0, :64], r1 + vst1.8 {d2}, [r10, :64], r1 + vst1.8 {d10}, [r0, :64], r1 + vst1.8 {d3}, [r10, :64], r1 + vst1.8 {d11}, [r0, :64], r1 + vst1.8 {d4}, [r10, :64], r1 + vst1.8 {d30}, [r0, :64], r1 + vst1.8 {d5}, [r10, :64], r1 + vst1.8 {d31}, [r0, :64], r1 + bx r12 + +7: + sub r10, r0, r1, lsl #3 + sub r10, r10, #4 + transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 + add r0, r10, r1, lsl #2 + + vst1.8 {d20}, [r10], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d21}, [r10], r1 + vst1.8 {d25}, [r0], r1 + vst1.8 {d22}, [r10], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d23}, [r10], r1 + vst1.8 {d27}, [r0], r1 + add r0, r0, #4 + bx r12 +8: + sub r10, r0, r1, lsl #3 + sub r10, r10, #2 + transpose_4x8b q11, q12, d22, d23, d24, d25 + 
add r0, r10, r1, lsl #2 + + vst1.32 {d22[0]}, [r10], r1 + vst1.32 {d22[1]}, [r0], r1 + vst1.32 {d23[0]}, [r10], r1 + vst1.32 {d23[1]}, [r0], r1 + vst1.32 {d24[0]}, [r10], r1 + vst1.32 {d24[1]}, [r0], r1 + vst1.32 {d25[0]}, [r10], r1 + vst1.32 {d25[1]}, [r0], r1 + add r0, r0, #2 + bx r12 +endfunc + +// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [r2] // vmask[0], vmask[1] +.ifc \type, y + ldr r2, [r2, #8] // vmask[2] +.endif + add r5, r5, #128 // Move to sharp part of lut +.ifc \type, y + orr r7, r7, r2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub r4, r3, r4, lsl #2 +.else + sub r3, r3, #4 + lsl r4, r4, #2 +.endif + orr r6, r6, r7 // vmask[0] |= vmask[1] + +1: + tst r6, #0x03 +.ifc \dir, v + vld1.8 {d0}, [r4]! + vld1.8 {d1}, [r3]! +.else + vld2.32 {d0[0], d1[0]}, [r3], r4 + vld2.32 {d0[1], d1[1]}, [r3], r4 +.endif + beq 7f // if (!(vm & bits)) continue; + + vld1.8 {d5[]}, [r5] // sharp[0] + add r5, r5, #8 + vmov.i32 d2, #0xff + vdup.32 d13, r6 // vmask[0] + + vand d0, d0, d2 // Keep only lowest byte in each 32 bit word + vand d1, d1, d2 + vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0] + vmov.i8 d4, #1 + vld1.8 {d6[]}, [r5] // sharp[1] + sub r5, r5, #8 + vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0] + vtst.32 d2, d1, d2 // L != 0 + vmul.i32 d1, d1, d4 // L +.ifc \type, y + vdup.32 d15, r2 // vmask[2] +.endif + vdup.32 d14, r7 // vmask[1] + vmov r10, r11, d2 + orrs r10, r10, r11 + beq 7f // if (!L) continue; + vneg.s8 d5, d5 // -sharp[0] + movrel_local r10, word_12 + vshr.u8 d12, d1, #4 // H + vld1.32 {d16}, [r10, :64] + vshl.s8 d3, d1, d5 // L >> sharp[0] +.ifc \type, y + vtst.32 d15, d15, d16 // if (vmask[2] & bits) +.endif + vmov.i8 d7, #2 + vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1]) + vadd.i8 d0, d1, d7 // L + 2 + vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I + vadd.u8 d0, d0, d0 // 2*(L + 2) + vtst.32 d14, d14, d16 // if (vmask[1] & bits) + vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E + vtst.32 d13, d13, d16 // if (vmask[0] & bits) + vand d13, d13, d2 // vmask[0] &= L != 0 + +.ifc \type, y + tst r2, #0x03 + beq 2f + // wd16 + bl lpf_\dir\()_16_8_neon + b 8f +2: +.endif + tst r7, #0x03 + beq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_8_neon +.else + // wd6 + bl lpf_\dir\()_6_8_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_8_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment r0. + // If the whole function is skipped, increment it here instead. + add r0, r0, r1, lsl #3 +.else +7: +.endif +8: + lsrs r6, r6, #2 // vmask[0] >>= 2 + lsr r7, r7, #2 // vmask[1] >>= 2 +.ifc \type, y + lsr r2, r2, #2 // vmask[2] >>= 2 +.endif +.ifc \dir, v + add r0, r0, #8 +.else + // For dir h, r0 is returned incremented +.endif + bne 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_12, align=4 + .word 1, 2 +endconst diff --git a/third_party/dav1d/src/arm/32/loopfilter16.S b/third_party/dav1d/src/arm/32/loopfilter16.S new file mode 100644 index 0000000000..d7daf21f1a --- /dev/null +++ b/third_party/dav1d/src/arm/32/loopfilter16.S @@ -0,0 +1,859 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_4_wd\wd\()_neon + vabd.u16 d0, d22, d23 // abs(p1 - p0) + vabd.u16 d1, d25, d24 // abs(q1 - q0) + vabd.u16 d2, d23, d24 // abs(p0 - q0) + vabd.u16 d3, d22, d25 // abs(p1 - q1) +.if \wd >= 6 + vabd.u16 d4, d21, d22 // abs(p2 - p1) + vabd.u16 d5, d26, d25 // abs(q2 - q1) +.endif +.if \wd >= 8 + vabd.u16 d6, d20, d21 // abs(p3 - p2) + vabd.u16 d7, d27, d26 // abs(q3 - q3) +.endif +.if \wd >= 6 + vmax.u16 d4, d4, d5 +.endif + vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2 +.if \wd >= 8 + vmax.u16 d6, d6, d7 +.endif + vshr.u16 d3, d3, #1 +.if \wd >= 8 + vmax.u16 d4, d4, d6 +.endif + vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) + vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + vmax.u16 d4, d0, d4 + vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) 
<= I +.else + vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + vand d1, d1, d2 // fm && wd >= 4 (implicit) +.if \wd >= 6 + vmov d14, d1 // fm && wd > 4 (implicit) +.endif +.if \wd >= 16 + vmov d15, d1 // fm && wd == 16 (implicit) +.endif + + vmov r10, r11, d1 + orrs r10, r10, r11 + beq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + vmov.i16 d10, #1 + vabd.u16 d2, d21, d23 // abs(p2 - p0) + vabd.u16 d3, d22, d23 // abs(p1 - p0) + vabd.u16 d4, d25, d24 // abs(q1 - q0) + vabd.u16 d5, d26, d24 // abs(q2 - q0) + vdup.16 d9, r9 // bitdepth_min_8 +.if \wd >= 8 + vabd.u16 d6, d20, d23 // abs(p3 - p0) + vabd.u16 d7, d27, d24 // abs(q3 - q0) +.endif + vmax.u16 d2, d2, d3 + vmax.u16 d4, d4, d5 +.if \wd >= 8 + vmax.u16 d6, d6, d7 +.endif + vmax.u16 d2, d2, d4 + vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8 +.if \wd >= 8 + vmax.u16 d2, d2, d6 +.endif + +.if \wd == 16 + vabd.u16 d3, d17, d23 // abs(p6 - p0) + vabd.u16 d4, d18, d23 // abs(p5 - p0) + vabd.u16 d5, d19, d23 // abs(p4 - p0) +.endif + vcge.u16 d2, d10, d2 // flat8in +.if \wd == 16 + vabd.u16 d6, d28, d24 // abs(q4 - q0) + vabd.u16 d7, d29, d24 // abs(q5 - q0) + vabd.u16 d8, d30, d24 // abs(q6 - q0) +.endif + vand d14, d2, d14 // flat8in && fm && wd > 4 + vbic d1, d1, d14 // fm && wd >= 4 && !flat8in +.if \wd == 16 + vmax.u16 d3, d3, d4 + vmax.u16 d5, d5, d6 +.endif + vmov r10, r11, d1 +.if \wd == 16 + vmax.u16 d7, d7, d8 + vmax.u16 d3, d3, d5 + vmax.u16 d3, d3, d7 + vcge.u16 d3, d10, d3 // flat8out +.endif + orrs r10, r10, r11 +.if \wd == 16 + vand d15, d15, d3 // flat8out && fm && wd == 16 + vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 + vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out +.endif + beq 1f // skip wd == 4 case +.endif + + vdup.16 d3, r8 // bitdepth_max + vsub.u16 d2, d22, d25 // p1 - q1 + vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1 + vcgt.u16 d0, d0, d12 // hev + vmvn d9, d3 // - 128 * (1 << bitdepth_min_8) + vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1) + vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1) + vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) + vsub.u16 d2, d24, d23 + vmov.i16 d6, #3 + vbic d0, d1, d0 // (fm && wd >= 4 && !hev) + vmul.i16 d2, d2, d6 + vmov.i16 d7, #4 + vadd.i16 d2, d2, d4 + vmin.s16 d2, d2, d3 // f = iclip_diff() + vmax.s16 d2, d2, d9 // f = iclip_diff() + vqadd.s16 d4, d7, d2 // f + 4 + vqadd.s16 d5, d6, d2 // f + 3 + vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1) + vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1) + vshr.s16 d4, d4, #3 // f1 + vshr.s16 d5, d5, #3 // f2 + vmov.i16 d9, #0 + vdup.16 d3, r8 // bitdepth_max + vqadd.s16 d2, d23, d5 // p0 + f2 + vqsub.s16 d6, d24, d4 // q0 - f1 + vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1 + vmin.s16 d2, d2, d3 // out p0 = iclip_pixel() + vmin.s16 d6, d6, d3 // out q0 = iclip_pixel() + vmax.s16 d2, d2, d9 // out p0 = iclip_pixel() + vmax.s16 d6, d6, d9 // out q0 = iclip_pixel() + vbit d23, d2, d1 // if (fm && wd >= 4) + vbit d24, d6, d1 // if (fm && wd >= 4) + vqadd.s16 d2, d22, d4 // p1 + f + vqsub.s16 d6, d25, d4 // q1 - f + vmin.s16 d2, d2, d3 // out p1 = iclip_pixel() + vmin.s16 d6, d6, d3 // out q1 = iclip_pixel() + vmax.s16 d2, d2, d9 // out p1 = iclip_pixel() + vmax.s16 d6, d6, d9 // out q1 = iclip_pixel() + vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) + vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 2f // skip if there's no flat8in + + 
vadd.i16 d0, d21, d21 // p2 * 2 + vadd.i16 d2, d21, d22 // p2 + p1 + vadd.i16 d4, d22, d23 // p1 + p0 + vadd.i16 d6, d23, d24 // p0 + q0 + vadd.i16 d8, d0, d2 + vadd.i16 d10, d4, d6 + vadd.i16 d12, d24, d25 // q0 + q1 + vadd.i16 d8, d8, d10 + vsub.i16 d12, d12, d0 + vadd.i16 d10, d25, d26 // q1 + q2 + vrshr.u16 d0, d8, #3 // out p1 + + vadd.i16 d8, d8, d12 + vsub.i16 d10, d10, d2 + vadd.i16 d12, d26, d26 // q2 + q2 + vrshr.u16 d1, d8, #3 // out p0 + + vadd.i16 d8, d8, d10 + vsub.i16 d12, d12, d4 + vrshr.u16 d2, d8, #3 // out q0 + + vbit d22, d0, d14 // p1 if (flat8in) + vadd.i16 d8, d8, d12 + vbit d23, d1, d14 // p0 if (flat8in) + vrshr.u16 d3, d8, #3 // out q1 + vbit d24, d2, d14 // q0 if (flat8in) + vbit d25, d3, d14 // q1 if (flat8in) +.elseif \wd >= 8 + vmov r10, r11, d14 + orrs r10, r10, r11 +.if \wd == 8 + beq 8f // skip if there's no flat8in +.else + beq 2f // skip if there's no flat8in +.endif + + vadd.i16 d0, d20, d21 // p3 + p2 + vadd.i16 d2, d22, d25 // p1 + q1 + vadd.i16 d4, d20, d22 // p3 + p1 + vadd.i16 d6, d23, d26 // p0 + q2 + vadd.i16 d8, d0, d0 // 2 * (p3 + p2) + vadd.i16 d9, d23, d24 // p0 + q0 + vadd.i16 d8, d8, d4 // + p3 + p1 + vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2 + vadd.i16 d8, d8, d9 // + p0 + q0 + vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1 + vrshr.u16 d10, d8, #3 // out p2 + + vadd.i16 d8, d8, d2 + vadd.i16 d0, d20, d23 // p3 + p0 + vadd.i16 d2, d24, d27 // q0 + q3 + vrshr.u16 d11, d8, #3 // out p1 + + vadd.i16 d8, d8, d6 + vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0 + vadd.i16 d4, d21, d24 // p2 + q0 + vadd.i16 d6, d25, d27 // q1 + q3 + vrshr.u16 d12, d8, #3 // out p0 + + vadd.i16 d8, d8, d2 + vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0 + vadd.i16 d0, d22, d25 // p1 + q1 + vadd.i16 d2, d26, d27 // q2 + q3 + vrshr.u16 d13, d8, #3 // out q0 + + vadd.i16 d8, d8, d6 + vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1 + vrshr.u16 d0, d8, #3 // out q1 + + vadd.i16 d8, d8, d2 + + vbit d21, d10, d14 + vbit d22, d11, d14 + vbit d23, d12, d14 + vrshr.u16 d1, d8, #3 // out q2 + vbit d24, d13, d14 + vbit d25, d0, d14 + vbit d26, d1, d14 +.endif +2: +.if \wd == 16 + vmov r10, r11, d15 + orrs r10, r10, r11 + bne 1f // check if flat8out is needed + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + vadd.i16 d2, d17, d17 // p6 + p6 + vadd.i16 d4, d17, d18 // p6 + p5 + vadd.i16 d6, d17, d19 // p6 + p4 + vadd.i16 d8, d17, d20 // p6 + p3 + vadd.i16 d12, d2, d4 + vadd.i16 d10, d6, d8 + vadd.i16 d6, d17, d21 // p6 + p2 + vadd.i16 d12, d12, d10 + vadd.i16 d8, d17, d22 // p6 + p1 + vadd.i16 d10, d18, d23 // p5 + p0 + vadd.i16 d6, d6, d8 + vadd.i16 d8, d19, d24 // p4 + q0 + vadd.i16 d12, d12, d6 + vadd.i16 d10, d10, d8 + vadd.i16 d6, d20, d25 // p3 + q1 + vadd.i16 d12, d12, d10 + vsub.i16 d6, d6, d2 + vadd.i16 d2, d21, d26 // p2 + q2 + vrshr.u16 d0, d12, #4 // out p5 + vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1) + vsub.i16 d2, d2, d4 + vadd.i16 d4, d22, d27 // p1 + q3 + vadd.i16 d6, d17, d19 // p6 + p4 + vrshr.u16 d1, d12, #4 // out p4 + vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2) + vsub.i16 d4, d4, d6 + vadd.i16 d6, d23, d28 // p0 + q4 + vadd.i16 d8, d17, d20 // p6 + p3 + vrshr.u16 d2, d12, #4 // out p3 + vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3) + vsub.i16 d6, d6, d8 + vadd.i16 d8, d24, d29 // q0 + q5 + vadd.i16 d4, d17, d21 // p6 + p2 + vrshr.u16 d3, d12, #4 // out p2 + vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4) + vsub.i16 d8, d8, d4 + vadd.i16 d6, d25, d30 
// q1 + q6 + vadd.i16 d10, d17, d22 // p6 + p1 + vrshr.u16 d4, d12, #4 // out p1 + vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5) + vsub.i16 d6, d6, d10 + vadd.i16 d8, d26, d30 // q2 + q6 + vbif d0, d18, d15 // out p5 + vadd.i16 d10, d18, d23 // p5 + p0 + vrshr.u16 d5, d12, #4 // out p0 + vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6) + vsub.i16 d8, d8, d10 + vadd.i16 d10, d27, d30 // q3 + q6 + vbif d1, d19, d15 // out p4 + vadd.i16 d18, d19, d24 // p4 + q0 + vrshr.u16 d6, d12, #4 // out q0 + vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6) + vsub.i16 d10, d10, d18 + vadd.i16 d8, d28, d30 // q4 + q6 + vbif d2, d20, d15 // out p3 + vadd.i16 d18, d20, d25 // p3 + q1 + vrshr.u16 d7, d12, #4 // out q1 + vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6) + vsub.i16 d18, d8, d18 + vadd.i16 d10, d29, d30 // q5 + q6 + vbif d3, d21, d15 // out p2 + vadd.i16 d20, d21, d26 // p2 + q2 + vrshr.u16 d8, d12, #4 // out q2 + vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6) + vsub.i16 d10, d10, d20 + vadd.i16 d18, d30, d30 // q6 + q6 + vbif d4, d22, d15 // out p1 + vadd.i16 d20, d22, d27 // p1 + q3 + vrshr.u16 d9, d12, #4 // out q3 + vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6) + vsub.i16 d18, d18, d20 + vbif d5, d23, d15 // out p0 + vrshr.u16 d10, d12, #4 // out q4 + vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6) + vrshr.u16 d11, d12, #4 // out q5 + vbif d6, d24, d15 // out q0 + vbif d7, d25, d15 // out q1 + vbif d8, d26, d15 // out q2 + vbif d9, d27, d15 // out q3 + vbif d10, d28, d15 // out q4 + vbif d11, d29, d15 // out q5 +.endif + + bx lr +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + bx r6 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + bx r7 +.endif +9: + // Return directly without writing back any pixels + bx r12 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_4_wd16 + adr r6, 7f + CONFIG_THUMB + adr r7, 8f + CONFIG_THUMB + bl lpf_4_wd16_neon +.endm + +.macro lpf_4_wd8 + adr r7, 8f + CONFIG_THUMB + bl lpf_4_wd8_neon +.endm + +.macro lpf_4_wd6 + bl lpf_4_wd6_neon +.endm + +.macro lpf_4_wd4 + bl lpf_4_wd4_neon +.endm + +function lpf_v_4_4_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + + lpf_4_wd4 + + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_4_4_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #1 + vld1.16 {d22}, [r10], r1 + vld1.16 {d24}, [r0], r1 + vld1.16 {d23}, [r10], r1 + vld1.16 {d25}, [r0], r1 + add r0, r0, #4 + + transpose_4x4h q11, q12, d22, d23, d24, d25 + + lpf_4_wd4 + + sub r10, r0, r1, lsl #2 + sub r10, r10, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_6_4_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + + 
lpf_4_wd6 + + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_6_4_neon + mov r12, lr + sub r10, r0, #8 + vld1.16 {d20}, [r10, :64], r1 + vld1.16 {d24}, [r0, :64], r1 + vld1.16 {d21}, [r10, :64], r1 + vld1.16 {d25}, [r0, :64], r1 + vld1.16 {d22}, [r10, :64], r1 + vld1.16 {d26}, [r0, :64], r1 + vld1.16 {d23}, [r10, :64], r1 + vld1.16 {d27}, [r0, :64], r1 + + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + lpf_4_wd6 + + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_8_4_neon + mov r12, lr + sub r10, r0, r1, lsl #2 + vld1.16 {d20}, [r10, :64], r1 // p3 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d26}, [r0, :64], r1 // q2 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d27}, [r0, :64], r1 // q3 + sub r0, r0, r1, lsl #2 + + lpf_4_wd8 + + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vst1.16 {d21}, [r10, :64], r1 // p2 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d25}, [r0, :64], r1 // q1 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_8_4_neon + mov r12, lr + sub r10, r0, #8 + vld1.16 {d20}, [r10, :64], r1 + vld1.16 {d24}, [r0, :64], r1 + vld1.16 {d21}, [r10, :64], r1 + vld1.16 {d25}, [r0, :64], r1 + vld1.16 {d22}, [r10, :64], r1 + vld1.16 {d26}, [r0, :64], r1 + vld1.16 {d23}, [r10, :64], r1 + vld1.16 {d27}, [r0, :64], r1 + + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + lpf_4_wd8 + + sub r0, r0, r1, lsl #2 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + sub r10, r0, #8 + + vst1.16 {d20}, [r10, :64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r10, :64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r10, :64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r10, :64], r1 + vst1.16 {d27}, [r0, :64], r1 + bx r12 +8: + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_16_4_neon + mov r12, lr + + sub r10, r0, r1, lsl #3 + add r10, r10, r1 + vld1.16 {d17}, [r10, :64], r1 // p6 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d18}, [r10, :64], r1 // p5 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d19}, [r10, :64], r1 // p4 + vld1.16 {d26}, [r0, :64], r1 // q2 + vld1.16 {d20}, [r10, :64], r1 // p3 + vld1.16 {d27}, [r0, :64], r1 // q3 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d28}, [r0, :64], r1 // q4 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d29}, [r0, :64], r1 // q5 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d30}, [r0, :64], r1 // 
q6 + sub r0, r0, r1, lsl #3 + add r0, r0, r1 + + lpf_4_wd16 + + sub r10, r0, r1, lsl #2 + sub r10, r10, r1, lsl #1 + vst1.16 {d0}, [r10, :64], r1 // p5 + vst1.16 {d6}, [r0, :64], r1 // q0 + vst1.16 {d1}, [r10, :64], r1 // p4 + vst1.16 {d7}, [r0, :64], r1 // q1 + vst1.16 {d2}, [r10, :64], r1 // p3 + vst1.16 {d8}, [r0, :64], r1 // q2 + vst1.16 {d3}, [r10, :64], r1 // p2 + vst1.16 {d9}, [r0, :64], r1 // q3 + vst1.16 {d4}, [r10, :64], r1 // p1 + vst1.16 {d10}, [r0, :64], r1 // q4 + vst1.16 {d5}, [r10, :64], r1 // p0 + vst1.16 {d11}, [r0, :64], r1 // q5 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + bx r12 +7: + sub r10, r0, r1 + sub r10, r10, r1, lsl #1 + vst1.16 {d21}, [r10, :64], r1 // p2 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d25}, [r0, :64], r1 // q1 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_16_4_neon + mov r12, lr + sub r10, r0, #16 + sub r0, r0, #8 + vld1.16 {d16}, [r10, :64], r1 + vld1.16 {d20}, [r0, :64], r1 + vld1.16 {d17}, [r10, :64], r1 + vld1.16 {d21}, [r0, :64], r1 + vld1.16 {d18}, [r10, :64], r1 + vld1.16 {d22}, [r0, :64], r1 + vld1.16 {d19}, [r10, :64], r1 + vld1.16 {d23}, [r0, :64], r1 + sub r10, r10, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r10, r10, #16 + add r0, r0, #16 + vld1.16 {d24}, [r10, :64], r1 + vld1.16 {d28}, [r0, :64], r1 + vld1.16 {d25}, [r10, :64], r1 + vld1.16 {d29}, [r0, :64], r1 + vld1.16 {d26}, [r10, :64], r1 + vld1.16 {d30}, [r0, :64], r1 + vld1.16 {d27}, [r10, :64], r1 + vld1.16 {d31}, [r0, :64], r1 + sub r0, r0, #8 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + + lpf_4_wd16 + + sub r0, r0, r1, lsl #2 + transpose_4x4h q8, q0, d16, d17, d0, d1 + transpose_4x4h q1, q2, d2, d3, d4, d5 + transpose_4x4h q3, q4, d6, d7, d8, d9 + transpose_4x4h q5, q15, d10, d11, d30, d31 + sub r10, r0, #16 + sub r0, r0, #8 + + vst1.16 {d16}, [r10, :64], r1 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d17}, [r10, :64], r1 + vst1.16 {d3}, [r0, :64], r1 + vst1.16 {d0}, [r10, :64], r1 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d1}, [r10, :64], r1 + vst1.16 {d5}, [r0, :64], r1 + sub r10, r10, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r10, r10, #16 + add r0, r0, #16 + vst1.16 {d6}, [r10, :64], r1 + vst1.16 {d10}, [r0, :64], r1 + vst1.16 {d7}, [r10, :64], r1 + vst1.16 {d11}, [r0, :64], r1 + vst1.16 {d8}, [r10, :64], r1 + vst1.16 {d30}, [r0, :64], r1 + vst1.16 {d9}, [r10, :64], r1 + vst1.16 {d31}, [r0, :64], r1 + sub r0, r0, #8 + + bx r12 + +7: + sub r0, r0, r1, lsl #2 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + sub r10, r0, #8 + + vst1.16 {d20}, [r10, :64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r10, :64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r10, :64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r10, :64], r1 + vst1.16 {d27}, [r0, :64], r1 + bx r12 +8: + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 
+endfunc + +// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w, +// const int bitdepth_max) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded + sub sp, sp, #8 + clz r9, r8 + rsb r9, r9, #24 // bitdepth_min_8 + ldrd r6, r7, [r2] // vmask[0], vmask[1] +.ifc \type, y + ldr r2, [r2, #8] // vmask[2] +.endif + add r5, r5, #128 // Move to sharp part of lut +.ifc \type, y + orr r7, r7, r2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub r4, r3, r4, lsl #2 +.else + sub r3, r3, #4 + lsl r4, r4, #2 +.endif + orr r6, r6, r7 // vmask[0] |= vmask[1] + +1: + tst r6, #0x01 + strd r6, r7, [sp] +.ifc \dir, v + ldrb r10, [r4], #4 + ldrb r11, [r3], #4 +.else + ldrb r10, [r3] + ldrb r11, [r3, #4] + add r3, r3, r4 +.endif + beq 7f // if (!(vm & bits)) continue; + + orrs r12, r10, r11 + vdup.16 d31, r9 // bitdepth_min_8 + beq 7f // if (!(l[0][0] | l[offset][0])) continue; + cmp r11, #0 // Check for nonzero values in l[0][0] + ldrb r6, [r5], #8 // sharp[0] + it eq + moveq r11, r10 // if (!l[0][0]) L = l[offset][0] + ldrb r12, [r5] // sharp[1] + lsr r6, r11, r6 // L >> sharp[0] + sub r5, r5, #8 + cmp r12, r6 + lsr r10, r11, #4 // H + add r11, r11, #2 // L + 2 + it lt + movlt r6, r12 // imin(L >> sharp[0], sharp[1]) + add r11, r11, r11 // 2*(L + 2) + cmp r6, #1 + lsl r10, r10, r9 // H << bitdepth_min_8 + it lt + movlt r6, #1 // imax(imin(), 1) = limit = I + vdup.16 d12, r10 // H << bitdepth_min_8 + add r11, r11, r6 // 2*(L + 2) + limit = E + lsl r6, r6, r9 // I << bitdepth_min_8 + lsl r11, r11, r9 // E << bitdepth_min_8 + vdup.16 d11, r6 // I << bitdepth_min_8 + vdup.16 d10, r11 // E << bitdepth_min_8 + +.ifc \type, y + tst r2, #0x01 + beq 2f + // wd16 + bl lpf_\dir\()_16_4_neon + b 8f +2: +.endif + tst r7, #0x01 + beq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_4_neon +.else + // wd6 + bl lpf_\dir\()_6_4_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_4_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment r0. + // If the whole function is skipped, increment it here instead. + add r0, r0, r1, lsl #2 +.else +7: +.endif +8: + ldrd r6, r7, [sp] +.ifc \type, y + lsr r2, r2, #1 // vmask[2] >>= 1 +.endif +.ifc \dir, v + add r0, r0, #8 +.else + // For dir h, r0 is returned incremented +.endif + lsrs r6, r6, #1 // vmask[0] >>= 1 + lsr r7, r7, #1 // vmask[1] >>= 1 + bne 1b + + add sp, sp, #8 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv diff --git a/third_party/dav1d/src/arm/32/looprestoration.S b/third_party/dav1d/src/arm/32/looprestoration.S new file mode 100644 index 0000000000..be5c658d6d --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration.S @@ -0,0 +1,791 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[8], intptr_t w, +// int h, enum LrEdgeFlags edges); +function wiener_filter_h_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + mov r8, r5 + vld1.16 {q0}, [r4, :128] + movw r9, #(1 << 14) - (1 << 2) + vdup.16 q14, r9 + vmov.s16 q15, #2048 + // Calculate mid_stride + add r10, r5, #7 + bic r10, r10, #7 + lsl r10, r10, #1 + + // Set up pointers for reading/writing alternate rows + add r12, r0, r10 + lsl r10, r10, #1 + add lr, r2, r3 + lsl r3, r3, #1 + + // Subtract the aligned width from mid_stride + add r11, r5, #7 + bic r11, r11, #7 + sub r10, r10, r11, lsl #1 + + // Subtract the number of pixels read from the source stride + add r11, r11, #8 + sub r3, r3, r11 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r1, #0 + bne 0f + // left == NULL + sub r2, r2, #3 + sub lr, lr, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r3, r3, #3 + + +1: // Loop vertically + vld1.8 {q2}, [r2]! + vld1.8 {q9}, [lr]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r1, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.32 {d3[1]}, [r1]! + // Move r2/lr back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub r2, r2, #3 + sub lr, lr, #3 + vld1.32 {d17[1]}, [r1]! + vext.8 q2, q1, q2, #13 + vext.8 q9, q8, q9, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost byte + // and shift q2 to have 3x the first byte at the front. + vdup.8 q1, d4[0] + vdup.8 q8, d18[0] + // Move r2 back to account for the last 3 bytes we loaded before, + // which we shifted out. 
+ sub r2, r2, #3 + sub lr, lr, #3 + vext.8 q2, q1, q2, #13 + vext.8 q9, q8, q9, #13 + +2: + vmovl.u8 q1, d4 + vmovl.u8 q2, d5 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub r9, r5, #14 + ldrb r11, [r2, r9] + ldrb r9, [lr, r9] + // Fill q12/q13 with the right padding pixel + vdup.16 q12, r11 + vdup.16 q13, r9 +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel_local r4, right_ext_mask, -6 + sub r4, r4, r5, lsl #1 + vld1.8 {q10, q11}, [r4] + + vbit q1, q12, q10 + vbit q2, q12, q11 + vbit q8, q13, q10 + vbit q9, q13, q11 + +4: // Loop horizontally + vext.8 q11, q1, q2, #4 + vext.8 q5, q1, q2, #8 + vext.8 q10, q1, q2, #2 + vext.8 q6, q1, q2, #10 + vext.8 q7, q1, q2, #12 + vext.8 q4, q1, q2, #6 + vadd.i16 q5, q5, q11 + vadd.i16 q6, q6, q10 + vadd.i16 q7, q7, q1 + vmul.s16 q3, q4, d0[3] + vmla.s16 q3, q5, d1[0] + vmla.s16 q3, q6, d1[1] + vmla.s16 q3, q7, d1[2] + + vext.8 q4, q8, q9, #4 + vext.8 q6, q8, q9, #8 + vext.8 q11, q8, q9, #2 + vext.8 q7, q8, q9, #10 + vadd.i16 q6, q6, q4 + vext.8 q4, q8, q9, #12 + vext.8 q5, q8, q9, #6 + vadd.i16 q7, q7, q11 + vadd.i16 q4, q4, q8 + vmul.s16 q10, q5, d0[3] + vmla.s16 q10, q6, d1[0] + vmla.s16 q10, q7, d1[1] + vmla.s16 q10, q4, d1[2] + + vext.8 q1, q1, q2, #6 + vext.8 q8, q8, q9, #6 + vshl.s16 q1, q1, #7 + vshl.s16 q8, q8, #7 + vsub.s16 q1, q1, q14 + vsub.s16 q8, q8, q14 + vqadd.s16 q3, q3, q1 + vqadd.s16 q10, q10, q8 + vshr.s16 q3, q3, #3 + vshr.s16 q10, q10, #3 + vadd.s16 q3, q3, q15 + vadd.s16 q10, q10, q15 + subs r5, r5, #8 + vst1.16 {q3}, [r0, :128]! + vst1.16 {q10}, [r12, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q1, q2 + vmov q8, q9 + vld1.8 {d4}, [r2]! + vld1.8 {d18}, [lr]! + vmovl.u8 q2, d4 + vmovl.u8 q9, d18 + bne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r10 + add r12, r12, r10 + add r2, r2, r3 + add lr, lr, r3 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[8], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride); +function wiener_filter_v_8bpc_neon, export=1 + push {r4-r7,lr} + vpush {q4-q6} + ldrd r4, r5, [sp, #68] + ldrd r6, r7, [sp, #76] + mov lr, r4 + vld1.16 {q0}, [r5, :128] + + // Calculate the number of rows to move back when looping vertically + mov r12, r4 + tst r6, #4 // LR_HAVE_TOP + beq 0f + sub r2, r2, r7, lsl #1 + add r12, r12, #2 +0: + tst r6, #8 // LR_HAVE_BOTTOM + beq 1f + add r12, r12, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into q8-q11 and pad properly. 
+ tst r6, #4 // LR_HAVE_TOP + vld1.16 {q8}, [r2, :128], r7 + beq 2f + // LR_HAVE_TOP + vld1.16 {q10}, [r2, :128], r7 + vmov q9, q8 + vld1.16 {q11}, [r2, :128], r7 + b 3f +2: // !LR_HAVE_TOP + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + +3: + cmp r4, #4 + blt 5f + // Start filtering normally; fill in q12-q14 with unique rows. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vld1.16 {q14}, [r2, :128], r7 + +4: +.macro filter compare + subs r4, r4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + vadd.i16 q4, q10, q12 + vadd.i16 q5, q9, q13 + vadd.i16 q6, q8, q14 + vmull.s16 q2, d22, d0[3] + vmlal.s16 q2, d8, d1[0] + vmlal.s16 q2, d10, d1[1] + vmlal.s16 q2, d12, d1[2] + vmull.s16 q3, d23, d0[3] + vmlal.s16 q3, d9, d1[0] + vmlal.s16 q3, d11, d1[1] + vmlal.s16 q3, d13, d1[2] + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vqmovun.s16 d4, q2 + vst1.8 {d4}, [r0, :64], r1 +.if \compare + cmp r4, #4 +.else + ble 9f +.endif + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 + vmov q11, q12 + vmov q12, q13 + vmov q13, q14 +.endm + filter 1 + blt 7f + vld1.16 {q14}, [r2, :128], r7 + b 4b + +5: // Less than 4 rows in total; not all of q12-q13 are filled yet. + tst r6, #8 // LR_HAVE_BOTTOM + beq 6f + // LR_HAVE_BOTTOM + cmp r4, #2 + // We load at least 2 rows in all cases. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + bgt 53f // 3 rows in total + beq 52f // 2 rows in total +51: // 1 row in total, q11 already loaded, load edge into q12-q14. + vmov q13, q12 + b 8f +52: // 2 rows in total, q11 already loaded, load q12 with content data + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vmov q15, q14 + b 8f +53: + // 3 rows in total, q11 already loaded, load q12 and q13 with content + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp r4, #2 + bgt 63f // 3 rows in total + beq 62f // 2 rows in total +61: // 1 row in total, q11 already loaded, pad that into q12-q14. + vmov q12, q11 + vmov q13, q11 + vmov q14, q11 + b 8f +62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. + vld1.16 {q12}, [r2, :128], r7 + vmov q13, q12 + vmov q14, q12 + vmov q15, q12 + b 8f +63: + // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + b 8f + +7: + // All registers up to q13 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst r6, #8 // LR_HAVE_BOTTOM + beq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + +8: // At this point, all registers up to q14-15,q1 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + vmov q14, q15 + vmov q15, q1 + b 8b + +9: // End of one vertical slice. + subs r3, r3, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. 
+ mls r0, r1, lr, r0 + mls r2, r7, r12, r2 + add r0, r0, #8 + add r2, r2, #16 + mov r4, lr + b 1b + +0: + vpop {q4-q6} + pop {r4-r7,pc} +.purgem filter +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add lr, r5, #7 + bic lr, lr, #7 + sub r9, r9, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Subtract the number of pixels read from the input from the stride + add lr, lr, #8 + sub r4, r4, lr + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #2 + sub r12, r12, #2 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #2 + + +1: // Loop vertically + vld1.8 {q0}, [r3]! + vld1.8 {q4}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.32 {d3[]}, [r2]! + // Move r3/r12 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub r3, r3, #2 + sub r12, r12, #2 + vld1.32 {d11[]}, [r2]! + vext.8 q0, q1, q0, #14 + vext.8 q4, q5, q4, #14 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost byte + // and shift q0 to have 2x the first byte at the front. + vdup.8 q1, d0[0] + vdup.8 q5, d8[0] + // Move r3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub r3, r3, #2 + sub r12, r12, #2 + vext.8 q0, q1, q0, #14 + vext.8 q4, q5, q4, #14 + +2: + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 2 + 1) + ldrb r11, [r3, lr] + ldrb lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.8 q14, r11 + vdup.8 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #10 + bge 4f // If w >= 10, all used input pixels are valid + + // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in q0/4.b[w] onwards + movrel_local lr, right_ext_mask + sub lr, lr, r5 + vld1.8 {q13}, [lr] + + vbit q0, q14, q13 + vbit q4, q15, q13 + + // Update the precalculated squares + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + +4: // Loop horizontally + vext.8 d16, d0, d1, #1 + vext.8 d17, d0, d1, #2 + vext.8 d18, d8, d9, #1 + vext.8 d19, d8, d9, #2 + vaddl.u8 q3, d0, d16 + vaddw.u8 q3, q3, d17 + vaddl.u8 q7, d8, d18 + vaddw.u8 q7, q7, d19 + + vext.8 q8, q1, q2, #2 + vext.8 q9, q1, q2, #4 + vext.8 q10, q5, q6, #2 + vext.8 q11, q5, q6, #4 + + vaddl.u16 q12, d2, d16 + vaddl.u16 q13, d3, d17 + vaddw.u16 q12, q12, d18 + vaddw.u16 q13, q13, d19 + + vaddl.u16 q8, d10, d20 + vaddl.u16 q9, d11, d21 + vaddw.u16 q8, q8, d22 + vaddw.u16 q9, q9, d23 + + subs r5, r5, #8 + vst1.16 {q3}, [r1, :128]! + vst1.16 {q7}, [r11, :128]! + vst1.32 {q12, q13}, [r0, :128]! + vst1.32 {q8, q9}, [r10, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vld1.8 {d6}, [r3]! + vld1.8 {d14}, [r12]! + vmov q1, q2 + vmov q5, q6 + vext.8 q0, q0, q3, #8 + vext.8 q4, q4, q7, #8 + vmull.u8 q2, d6, d6 + vmull.u8 q6, d14, d14 + + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add lr, r5, #7 + bic lr, lr, #7 + sub r9, r9, lr, lsl #1 + add lr, lr, #8 + sub r4, r4, lr + + // Store the width for the vertical loop + mov r8, r5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #3 + sub r12, r12, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #3 + +1: // Loop vertically + vld1.8 {q0}, [r3]! + vld1.8 {q4}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.32 {d3[]}, [r2]! + // Move r3/r12 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub r3, r3, #3 + sub r12, r12, #3 + vld1.32 {d11[]}, [r2]! + vext.8 q0, q1, q0, #13 + vext.8 q4, q5, q4, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost byte + // and shift q0 to have 3x the first byte at the front. + vdup.8 q1, d0[0] + vdup.8 q5, d8[0] + // Move r3 back to account for the last 3 bytes we loaded before, + // which we shifted out. 
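+ // (The vext #13 below rotates those 3 bytes in from q1/q5, so q0/q4
+ // effectively start 3 pixels to the left of the nominal position.)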
+ sub r3, r3, #3 + sub r12, r12, #3 + vext.8 q0, q1, q0, #13 + vext.8 q4, q5, q4, #13 + +2: + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 3 + 1) + ldrb r11, [r3, lr] + ldrb lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.8 q14, r11 + vdup.8 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel_local lr, right_ext_mask, -1 + sub lr, lr, r5 + vld1.8 {q13}, [lr] + + vbit q0, q14, q13 + vbit q4, q15, q13 + + // Update the precalculated squares + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + +4: // Loop horizontally + vext.8 d16, d0, d1, #1 + vext.8 d17, d0, d1, #2 + vext.8 d18, d0, d1, #3 + vext.8 d19, d0, d1, #4 + vext.8 d20, d8, d9, #1 + vext.8 d21, d8, d9, #2 + vext.8 d22, d8, d9, #3 + vext.8 d23, d8, d9, #4 + vaddl.u8 q3, d0, d16 + vaddl.u8 q12, d17, d18 + vaddl.u8 q7, d8, d20 + vaddl.u8 q13, d21, d22 + vaddw.u8 q3, q3, d19 + vaddw.u8 q7, q7, d23 + vadd.u16 q3, q3, q12 + vadd.u16 q7, q7, q13 + + vext.8 q8, q1, q2, #2 + vext.8 q9, q1, q2, #4 + vext.8 q10, q1, q2, #6 + vext.8 q11, q1, q2, #8 + vaddl.u16 q12, d2, d16 + vaddl.u16 q13, d3, d17 + vaddl.u16 q8, d18, d20 + vaddl.u16 q9, d19, d21 + vaddw.u16 q12, q12, d22 + vaddw.u16 q13, q13, d23 + vadd.i32 q12, q12, q8 + vadd.i32 q13, q13, q9 + vext.8 q8, q5, q6, #2 + vext.8 q9, q5, q6, #4 + vext.8 q10, q5, q6, #6 + vext.8 q11, q5, q6, #8 + vaddl.u16 q1, d10, d16 + vaddl.u16 q5, d11, d17 + vaddl.u16 q8, d18, d20 + vaddl.u16 q9, d19, d21 + vaddw.u16 q1, q1, d22 + vaddw.u16 q5, q5, d23 + vadd.i32 q10, q1, q8 + vadd.i32 q11, q5, q9 + + subs r5, r5, #8 + vst1.16 {q3}, [r1, :128]! + vst1.16 {q7}, [r11, :128]! + vst1.32 {q12, q13}, [r0, :128]! + vst1.32 {q10, q11}, [r10, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vld1.8 {d6}, [r3]! + vld1.8 {d14}, [r12]! + vmov q1, q2 + vmov q5, q6 + vext.8 q0, q0, q3, #8 + vext.8 q4, q4, q7, #8 + vmull.u8 q2, d6, d6 + vmull.u8 q6, d14, d14 + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +sgr_funcs 8 diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S new file mode 100644 index 0000000000..d699617a87 --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration16.S @@ -0,0 +1,801 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + ldr r8, [sp, #116] // bitdepth_max + vld1.16 {q0}, [r4, :128] + clz r8, r8 + vmov.i32 q14, #1 + sub r9, r8, #38 // -(bitdepth + 6) + sub r8, r8, #25 // -round_bits_h + neg r9, r9 // bitdepth + 6 + vdup.32 q1, r9 + vdup.32 q13, r8 // -round_bits_h + vmov.i16 q15, #8192 + vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6) + mov r8, r5 + // Calculate mid_stride + add r10, r5, #7 + bic r10, r10, #7 + lsl r10, r10, #1 + + // Set up pointers for reading/writing alternate rows + add r12, r0, r10 + lsl r10, r10, #1 + add lr, r2, r3 + lsl r3, r3, #1 + + // Subtract the aligned width from mid_stride + add r11, r5, #7 + bic r11, r11, #7 + sub r10, r10, r11, lsl #1 + + // Subtract the number of pixels read from the source stride + add r11, r11, #8 + sub r3, r3, r11, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r1, #0 + bne 0f + // left == NULL + sub r2, r2, #6 + sub lr, lr, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r3, r3, #6 + + +1: // Loop vertically + vld1.16 {q2, q3}, [r2]! + vld1.16 {q4, q5}, [lr]! 
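+ // Two rows are filtered per iteration: q2-q3 hold 16 pixels of the first
+ // row and q4-q5 the row below it; the code below prepends a 3-pixel left
+ // apron, taken either from the left[] buffer or by replicating the first
+ // pixel of the row.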
+ + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r1, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d3}, [r1]! + // Move r2/lr back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r2, r2, #6 + sub lr, lr, #6 + vld1.16 {d13}, [r1]! + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost pixel + // and shift q2/q3 to have 3x the first pixel at the front. + vdup.16 q1, d4[0] + vdup.16 q6, d8[0] + // Move r2 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub r2, r2, #6 + sub lr, lr, #6 + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub r9, r5, #14 + lsl r9, r9, #1 + ldrh r11, [r2, r9] + ldrh r9, [lr, r9] + // Fill q11/q12 with the right padding pixel + vdup.16 q11, r11 + vdup.16 q12, r9 +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel_local r4, right_ext_mask, -6 + sub r4, r4, r5, lsl #1 + vld1.8 {q9, q10}, [r4] + + vbit q2, q11, q9 + vbit q3, q11, q10 + vbit q4, q12, q9 + vbit q5, q12, q10 + +4: // Loop horizontally + vext.8 q7, q2, q3, #4 + vext.8 q8, q2, q3, #8 + vext.8 q6, q2, q3, #2 + vext.8 q9, q2, q3, #10 + vadd.i16 q8, q8, q7 + vadd.i16 q9, q9, q6 + vext.8 q6, q2, q3, #12 + vext.8 q7, q2, q3, #6 + vadd.i16 q2, q2, q6 + vmull.s16 q6, d14, d0[3] + vmlal.s16 q6, d16, d1[0] + vmlal.s16 q6, d18, d1[1] + vmlal.s16 q6, d4, d1[2] + vmull.s16 q7, d15, d0[3] + vmlal.s16 q7, d17, d1[0] + vmlal.s16 q7, d19, d1[1] + vmlal.s16 q7, d5, d1[2] + + vext.8 q8, q4, q5, #4 + vext.8 q10, q4, q5, #8 + vext.8 q9, q4, q5, #2 + vext.8 q2, q4, q5, #10 + vadd.i16 q10, q10, q8 + vadd.i16 q2, q2, q9 + vext.8 q8, q4, q5, #12 + vext.8 q9, q4, q5, #6 + vadd.i16 q4, q4, q8 + vmull.s16 q8, d18, d0[3] + vmlal.s16 q8, d20, d1[0] + vmlal.s16 q8, d4, d1[1] + vmlal.s16 q8, d8, d1[2] + vmull.s16 q9, d19, d0[3] + vmlal.s16 q9, d21, d1[0] + vmlal.s16 q9, d5, d1[1] + vmlal.s16 q9, d9, d1[2] + + vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q14 + vadd.i32 q8, q8, q14 + vadd.i32 q9, q9, q14 + vrshl.s32 q6, q6, q13 + vrshl.s32 q7, q7, q13 + vrshl.s32 q8, q8, q13 + vrshl.s32 q9, q9, q13 + vqmovun.s32 d12, q6 + vqmovun.s32 d13, q7 + vqmovun.s32 d14, q8 + vqmovun.s32 d15, q9 + vmin.u16 q6, q6, q10 + vmin.u16 q7, q7, q10 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + subs r5, r5, #8 + vst1.16 {q6}, [r0, :128]! + vst1.16 {q7}, [r12, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q2, q3 + vmov q4, q5 + vld1.16 {q3}, [r2]! + vld1.16 {q5}, [lr]! + bne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
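+ // Each int16_t stored above is, roughly, the 7-tap horizontal sum plus a
+ // 1 << (bitdepth + 6) bias, rounding-shifted right by round_bits_h,
+ // clipped to [0, 0x7fff] and offset by -8192; this is the intermediate
+ // format that wiener_filter_v_16bpc consumes.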
+ +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r10 + add r12, r12, r10 + add r2, r2, r3 + add lr, lr, r3 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride, const int bitdepth_max); +function wiener_filter_v_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q4-q5} + ldrd r4, r5, [sp, #52] + ldrd r6, r7, [sp, #60] + ldr lr, [sp, #68] // bitdepth_max + vld1.16 {q0}, [r5, :128] + vdup.16 q5, lr + clz lr, lr + sub lr, lr, #11 // round_bits_v + vdup.32 q4, lr + mov lr, r4 + vneg.s32 q4, q4 // -round_bits_v + + // Calculate the number of rows to move back when looping vertically + mov r12, r4 + tst r6, #4 // LR_HAVE_TOP + beq 0f + sub r2, r2, r7, lsl #1 + add r12, r12, #2 +0: + tst r6, #8 // LR_HAVE_BOTTOM + beq 1f + add r12, r12, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into q8-q11 and pad properly. + tst r6, #4 // LR_HAVE_TOP + vld1.16 {q8}, [r2, :128], r7 + beq 2f + // LR_HAVE_TOP + vld1.16 {q10}, [r2, :128], r7 + vmov q9, q8 + vld1.16 {q11}, [r2, :128], r7 + b 3f +2: // !LR_HAVE_TOP + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + +3: + cmp r4, #4 + blt 5f + // Start filtering normally; fill in q12-q14 with unique rows. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vld1.16 {q14}, [r2, :128], r7 + +4: +.macro filter compare + subs r4, r4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d20, d0[2] + vmlal.s16 q2, d22, d0[3] + vmlal.s16 q2, d24, d1[0] + vmlal.s16 q2, d26, d1[1] + vmlal.s16 q2, d28, d1[2] + vmull.s16 q3, d17, d0[0] + vmlal.s16 q3, d19, d0[1] + vmlal.s16 q3, d21, d0[2] + vmlal.s16 q3, d23, d0[3] + vmlal.s16 q3, d25, d1[0] + vmlal.s16 q3, d27, d1[1] + vmlal.s16 q3, d29, d1[2] + vrshl.s32 q2, q2, q4 // round_bits_v + vrshl.s32 q3, q3, q4 + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q5 // bitdepth_max + vst1.16 {q2}, [r0, :128], r1 +.if \compare + cmp r4, #4 +.else + ble 9f +.endif + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 + vmov q11, q12 + vmov q12, q13 + vmov q13, q14 +.endm + filter 1 + blt 7f + vld1.16 {q14}, [r2, :128], r7 + b 4b + +5: // Less than 4 rows in total; not all of q12-q13 are filled yet. + tst r6, #8 // LR_HAVE_BOTTOM + beq 6f + // LR_HAVE_BOTTOM + cmp r4, #2 + // We load at least 2 rows in all cases. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + bgt 53f // 3 rows in total + beq 52f // 2 rows in total +51: // 1 row in total, q11 already loaded, load edge into q12-q14. + vmov q13, q12 + b 8f +52: // 2 rows in total, q11 already loaded, load q12 with content data + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vmov q15, q14 + b 8f +53: + // 3 rows in total, q11 already loaded, load q12 and q13 with content + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp r4, #2 + bgt 63f // 3 rows in total + beq 62f // 2 rows in total +61: // 1 row in total, q11 already loaded, pad that into q12-q14. + vmov q12, q11 + vmov q13, q11 + vmov q14, q11 + b 8f +62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. 
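+ // (The vmov copies below replicate the last available row downwards, so
+ // the 7-row sliding window in the filter macro needs no further
+ // special-casing.)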
+ vld1.16 {q12}, [r2, :128], r7 + vmov q13, q12 + vmov q14, q12 + vmov q15, q12 + b 8f +63: + // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + b 8f + +7: + // All registers up to q13 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst r6, #8 // LR_HAVE_BOTTOM + beq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + +8: // At this point, all registers up to q14-q15,q1 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + vmov q14, q15 + vmov q15, q1 + b 8b + +9: // End of one vertical slice. + subs r3, r3, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + mls r0, r1, lr, r0 + mls r2, r7, r12, r2 + add r0, r0, #16 + add r2, r2, #16 + mov r4, lr + b 1b + +0: + vpop {q4-q5} + pop {r4-r7,pc} +.purgem filter +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add lr, r5, #7 + bic lr, lr, #7 + sub r9, r9, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Subtract the number of pixels read from the input from the stride + add lr, lr, #8 + sub r4, r4, lr, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #4 + sub r12, r12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #4 + + +1: // Loop vertically + vld1.16 {q0, q1}, [r3]! + vld1.16 {q4, q5}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d5}, [r2]! + // Move r3/r12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub r3, r3, #4 + sub r12, r12, #4 + vld1.16 {d13}, [r2]! + vext.8 q1, q0, q1, #12 + vext.8 q0, q2, q0, #12 + vext.8 q5, q4, q5, #12 + vext.8 q4, q6, q4, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill q2 with the leftmost pixel + // and shift q0 to have 2x the first byte at the front. + vdup.16 q2, d0[0] + vdup.16 q6, d8[0] + // Move r3 back to account for the last 2 pixels we loaded before, + // which we shifted out. 
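+ // (At 16 bpc those 2 pixels are 4 bytes, hence the #4 adjustments and
+ // the vext #12 rotation below.)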
+ sub r3, r3, #4 + sub r12, r12, #4 + vext.8 q1, q0, q1, #12 + vext.8 q0, q2, q0, #12 + vext.8 q5, q4, q5, #12 + vext.8 q4, q6, q4, #12 + +2: + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 2 + 1) + lsl lr, lr, #1 + ldrh r11, [r3, lr] + ldrh lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.16 q14, r11 + vdup.16 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #10 + bge 4f // If w >= 10, all used input pixels are valid + + // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. + + // Insert padding in q0/1.h[w] onwards + movrel_local lr, right_ext_mask + sub lr, lr, r5, lsl #1 + vld1.8 {q12, q13}, [lr] + + vbit q0, q14, q12 + vbit q1, q14, q13 + vbit q4, q15, q12 + vbit q5, q15, q13 + +4: // Loop horizontally + vext.8 q8, q0, q1, #2 + vext.8 q10, q4, q5, #2 + vext.8 q9, q0, q1, #4 + vext.8 q11, q4, q5, #4 + vadd.i16 q2, q0, q8 + vadd.i16 q3, q4, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 + + vmull.u16 q6, d0, d0 + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d18, d18 + vmull.u16 q12, d8, d8 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d22, d22 + vmull.u16 q7, d1, d1 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmull.u16 q13, d9, d9 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 + subs r5, r5, #8 + vst1.16 {q2}, [r1, :128]! + vst1.16 {q3}, [r11, :128]! + vst1.32 {q6, q7}, [r0, :128]! + vst1.32 {q12, q13}, [r10, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q0, q1 + vmov q4, q5 + vld1.16 {q1}, [r3]! + vld1.16 {q5}, [r12]! + + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add lr, r5, #7 + bic lr, lr, #7 + sub r9, r9, lr, lsl #1 + add lr, lr, #8 + sub r4, r4, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #6 + sub r12, r12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. 
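+ // (3 pixels at 16 bpc are 6 bytes, hence the +6 below.)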
+ add r4, r4, #6 + +1: // Loop vertically + vld1.16 {q0, q1}, [r3]! + vld1.16 {q4, q5}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d5}, [r2]! + // Move r3/r12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r3, r3, #6 + sub r12, r12, #6 + vld1.16 {d13}, [r2]! + vext.8 q1, q0, q1, #10 + vext.8 q0, q2, q0, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q2 with the leftmost pixel + // and shift q0 to have 3x the first pixel at the front. + vdup.16 q2, d0[0] + vdup.16 q6, d8[0] + // Move r3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub r3, r3, #6 + sub r12, r12, #6 + vext.8 q1, q0, q1, #10 + vext.8 q0, q2, q0, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 3 + 1) + lsl lr, lr, #1 + ldrh r11, [r3, lr] + ldrh lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.16 q14, r11 + vdup.16 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel_local lr, right_ext_mask, -2 + sub lr, lr, r5, lsl #1 + vld1.8 {q12, q13}, [lr] + + vbit q0, q14, q12 + vbit q1, q14, q13 + vbit q4, q15, q12 + vbit q5, q15, q13 + +4: // Loop horizontally + vext.8 q8, q0, q1, #2 + vext.8 q10, q4, q5, #2 + vext.8 q9, q0, q1, #4 + vext.8 q11, q4, q5, #4 + vadd.i16 q2, q0, q8 + vadd.i16 q3, q4, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 + + vmull.u16 q6, d0, d0 + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d18, d18 + vmull.u16 q12, d8, d8 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d22, d22 + vmull.u16 q7, d1, d1 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmull.u16 q13, d9, d9 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 + + vext.8 q8, q0, q1, #6 + vext.8 q10, q4, q5, #6 + vext.8 q9, q0, q1, #8 + vext.8 q11, q4, q5, #8 + vadd.i16 q2, q2, q8 + vadd.i16 q3, q3, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 + + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d1, d1 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d9, d9 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 + + subs r5, r5, #8 + vst1.16 {q2}, [r1, :128]! + vst1.16 {q3}, [r11, :128]! + vst1.32 {q6, q7}, [r0, :128]! + vst1.32 {q12, q13}, [r10, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q0, q1 + vmov q4, q5 + vld1.16 {q1}, [r3]! + vld1.16 {q5}, [r12]! + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
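+ // Each iteration above emits, for two rows at a time, the plain 5-pixel
+ // horizontal sums (sum) and the matching 5-pixel sums of squares
+ // (sumsq); sgr_box5_v then stacks five such rows vertically.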
+ +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +sgr_funcs 16 diff --git a/third_party/dav1d/src/arm/32/looprestoration_common.S b/third_party/dav1d/src/arm/32/looprestoration_common.S new file mode 100644 index 0000000000..b080bb5115 --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration_common.S @@ -0,0 +1,453 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #2 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 1f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Sum all h+2 lines with the main loop + add lr, lr, #2 +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q8-q13 and q0-q2 taking top + // padding into consideration. 
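+ // Each output row below is the sum of three consecutive input rows: two
+ // q registers of 32-bit sumsq plus one q register of 16-bit sum per row,
+ // with missing rows above/below the frame substituted by the nearest
+ // loaded row when the corresponding edge flag is unset.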
+ tst r4, #4 // LR_HAVE_TOP + vld1.32 {q8, q9}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q1}, [r6, :128], r8 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q10, q8 + vmov q11, q9 + vmov q1, q0 + vmov q12, q8 + vmov q13, q9 + vmov q2, q0 + +3: + subs r3, r3, #1 +.macro add3 + vadd.i32 q8, q8, q10 + vadd.i32 q9, q9, q11 + vadd.i16 q0, q0, q1 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i16 q0, q0, q2 + vst1.32 {q8, q9}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + vmov q10, q12 + vmov q11, q13 + vmov q1, q2 + ble 4f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3b + +4: + tst r4, #8 // LR_HAVE_BOTTOM + bne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + add3 + +5: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + pop {r4-r9,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + push {r4-r9,lr} + vpush {q5-q7} + ldr r4, [sp, #76] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #8 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 0f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Handle h+2 lines with the main loop + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub r3, r3, #1 // Handle h-1 lines with the main loop +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q6-q15 and q0-q3,q5 taking top + // padding into consideration. 
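+ // Same idea with a 5-row window: add5 sums five rows of sumsq/sum, and
+ // since the 5x5 box sums are only needed on every second row, each
+ // iteration consumes two input rows, writes one output row and skips
+ // the next.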
+ tst r4, #4 // LR_HAVE_TOP + vld1.32 {q6, q7}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vmov q10, q6 + vmov q11, q7 + vmov q2, q0 + vmov q12, q6 + vmov q13, q7 + vmov q3, q0 + +3: + cmp r3, #0 + beq 4f + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + +3: + // Start of vertical loop + subs r3, r3, #2 +.macro add5 + vadd.i32 q6, q6, q8 + vadd.i32 q7, q7, q9 + vadd.i16 q0, q0, q1 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i16 q0, q0, q2 + vadd.i32 q6, q6, q12 + vadd.i32 q7, q7, q13 + vadd.i16 q0, q0, q3 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q15 + vadd.i16 q0, q0, q5 + vst1.32 {q6, q7}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add5 +.macro shift2 + vmov q6, q10 + vmov q7, q11 + vmov q0, q2 + vmov q8, q12 + vmov q9, q13 + vmov q1, q3 + vmov q10, q14 + vmov q11, q15 + vmov q2, q5 +.endm + shift2 + add r0, r0, r7 + add r1, r1, r8 + ble 5f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + add5 + b 6f + +5: + tst r4, #8 // LR_HAVE_BOTTOM + bne 6f + // !LR_HAVE_BOTTOM + cmp r3, #0 + bne 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + // Pad the past-edge row from the last content row. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // r3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + vmov q12, q10 + vmov q13, q11 + vmov q3, q2 + vmov q14, q10 + vmov q15, q11 + vmov q5, q2 + add5 + add r0, r0, r7 + add r1, r1, r8 + b 6f + +6: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + vpop {q5-q7} + pop {r4-r9,pc} +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +function sgr_calc_ab1_neon, export=1 + push {r4-r7,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #84] + add r3, r3, #2 // h += 2 + clz r6, r5 + vmov.i32 q15, #9 // n + movw r5, #455 + mov lr, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + push {r4-r7,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #84] + add r3, r3, #3 // h += 3 + clz r6, r5 + asr r3, r3, #1 // h /= 2 + vmov.i32 q15, #25 // n + mov r5, #164 + mov lr, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + movrel r12, X(sgr_x_by_x) + sub r6, r6, #24 // -bitdepth_min_8 + vld1.8 {q8, q9}, [r12, :128]! 
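+ // In C terms, the loop below roughly computes, per element:
+ // p = imax(a*n - b*b, 0); z = (p*strength + (1 << 19)) >> 20, clamped to
+ // 255 by the saturating narrows; x = dav1d_sgr_x_by_x[z], looked up with
+ // vtbl/vtbx on the first 48 (pre-biased) table entries plus the vcgt
+ // corrections for the flat tail; then a = (x*b*one_by_x + (1 << 11)) >> 12
+ // and b = 256 - x.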
+ add r7, r6, r6 // -2*bitdepth_min_8 + vmov.i8 q11, #5 + vmov.i8 d10, #55 // idx of last 5 + vld1.8 {q10}, [r12, :128] + vmov.i8 d11, #72 // idx of last 4 + vmov.i8 d12, #101 // idx of last 3 + vmov.i8 d13, #169 // idx of last 2 + vmov.i8 d14, #254 // idx of last 1 + vmov.i8 d15, #32 // elements consumed in first vtbl + add r2, r2, #2 // w += 2 + add r12, r2, #7 + bic r12, r12, #7 // aligned w + sub r12, lr, r12 // increment between rows + vdup.32 q12, r4 + sub r0, r0, #(4*(SUM_STRIDE)) + sub r1, r1, #(2*(SUM_STRIDE)) + mov r4, r2 // backup of w + vsub.i8 q8, q8, q11 + vsub.i8 q9, q9, q11 + vsub.i8 q10, q10, q11 +1: + vld1.32 {q0, q1}, [r0, :128] // a + vld1.16 {q2}, [r1, :128] // b + vdup.32 q13, r7 // -2*bitdepth_min_8 + vdup.16 q14, r6 // -bitdepth_min_8 + subs r2, r2, #8 + vrshl.s32 q0, q0, q13 + vrshl.s32 q1, q1, q13 + vrshl.s16 q4, q2, q14 + vmul.i32 q0, q0, q15 // a * n + vmul.i32 q1, q1, q15 // a * n + vmull.u16 q3, d8, d8 // b * b + vmull.u16 q4, d9, d9 // b * b + vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) + vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) + vmul.i32 q0, q0, q12 // p * s + vmul.i32 q1, q1, q12 // p * s + vqshrn.u32 d0, q0, #16 + vqshrn.u32 d1, q1, #16 + vqrshrn.u16 d0, q0, #4 // imin(z, 255) + + vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 + vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 + vtbl.8 d1, {q8, q9}, d0 + vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 + vsub.i8 d9, d0, d15 // indices for vtbx + vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 + vadd.i8 d2, d2, d3 + vtbx.8 d1, {q10}, d9 + vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 + vadd.i8 d6, d6, d7 + vadd.i8 d8, d8, d22 + vadd.i8 d2, d2, d6 + vadd.i8 d1, d1, d8 + vadd.i8 d1, d1, d2 + vmovl.u8 q0, d1 // x + + vmov.i16 q13, #256 + vdup.32 q14, r5 // one_by_x + + vmull.u16 q1, d0, d4 // x * BB[i] + vmull.u16 q2, d1, d5 // x * BB[i] + vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x + vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x + vrshr.s32 q1, q1, #12 // AA[i] + vrshr.s32 q2, q2, #12 // AA[i] + vsub.i16 q0, q13, q0 // 256 - x + + vst1.32 {q1, q2}, [r0, :128]! + vst1.16 {q0}, [r1, :128]! + bgt 1b + + subs r3, r3, #1 + ble 0f + add r0, r0, r12, lsl #2 + add r1, r1, r12, lsl #1 + mov r2, r4 + b 1b +0: + vpop {q4-q7} + pop {r4-r7,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/looprestoration_tmpl.S b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S new file mode 100644 index 0000000000..8a9940bb3a --- /dev/null +++ b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S @@ -0,0 +1,600 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" + +#define FILTER_OUT_STRIDE 384 + +.macro sgr_funcs bpc +// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_\bpc\()bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + sub r7, r3, #(4*SUM_STRIDE) + add r8, r3, #(4*SUM_STRIDE) + sub r9, r4, #(2*SUM_STRIDE) + add r10, r4, #(2*SUM_STRIDE) + mov r11, #SUM_STRIDE + mov r12, #FILTER_OUT_STRIDE + add lr, r5, #3 + bic lr, lr, #3 // Aligned width +.if \bpc == 8 + sub r2, r2, lr +.else + sub r2, r2, lr, lsl #1 +.endif + sub r12, r12, lr + sub r11, r11, lr + sub r11, r11, #4 // We read 4 extra elements from both a and b + mov lr, r5 + vmov.i16 q14, #3 + vmov.i32 q15, #3 +1: + vld1.16 {q0}, [r9, :128]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r10, :128]! + vld1.32 {q8, q9}, [r7, :128]! + vld1.32 {q10, q11}, [r3, :128]! + vld1.32 {q12, q13}, [r8, :128]! + +2: + subs r5, r5, #4 + vext.8 d6, d0, d1, #2 // -stride + vext.8 d7, d2, d3, #2 // 0 + vext.8 d8, d4, d5, #2 // +stride + vext.8 d9, d0, d1, #4 // +1-stride + vext.8 d10, d2, d3, #4 // +1 + vext.8 d11, d4, d5, #4 // +1+stride + vadd.i16 d2, d2, d6 // -1, -stride + vadd.i16 d7, d7, d8 // 0, +stride + vadd.i16 d0, d0, d9 // -1-stride, +1-stride + vadd.i16 d2, d2, d7 + vadd.i16 d4, d4, d11 // -1+stride, +1+stride + vadd.i16 d2, d2, d10 // +1 + vadd.i16 d0, d0, d4 + + vext.8 q3, q8, q9, #4 // -stride + vshl.i16 d2, d2, #2 + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q10, q11, #4 // 0 + vext.8 q6, q10, q11, #8 // +1 + vmla.i16 d2, d0, d28 // * 3 -> a + vadd.i32 q3, q3, q10 // -stride, -1 + vadd.i32 q8, q8, q4 // -1-stride, +1-stride + vadd.i32 q5, q5, q6 // 0, +1 + vadd.i32 q8, q8, q12 // -1+stride + vadd.i32 q3, q3, q5 + vext.8 q7, q12, q13, #4 // +stride + vext.8 q10, q12, q13, #8 // +1+stride +.if \bpc == 8 + vld1.32 {d24[0]}, [r1, :32]! // src +.else + vld1.16 {d24}, [r1, :64]! // src +.endif + vadd.i32 q3, q3, q7 // +stride + vadd.i32 q8, q8, q10 // +1+stride + vshl.i32 q3, q3, #2 + vmla.i32 q3, q8, q15 // * 3 -> b +.if \bpc == 8 + vmovl.u8 q12, d24 // src +.endif + vmov d0, d1 + vmlal.u16 q3, d2, d24 // b + a * src + vmov d2, d3 + vrshrn.i32 d6, q3, #9 + vmov d4, d5 + vst1.16 {d6}, [r0]! + + ble 3f + vmov q8, q9 + vmov q10, q11 + vmov q12, q13 + vld1.16 {d1}, [r9, :64]! + vld1.16 {d3}, [r4, :64]! + vld1.16 {d5}, [r10, :64]! + vld1.32 {q9}, [r7, :128]! + vld1.32 {q11}, [r3, :128]! + vld1.32 {q13}, [r8, :128]! 
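+ // (In effect, the 2: loop above applies a 3x3 weighting to both planes,
+ // centre and edge neighbours weighted 4 and corners weighted 3: the
+ // 16-bit plane becomes the per-pixel gain, the 32-bit plane the offset,
+ // and the output is (gain*src + offset + (1 << 8)) >> 9.)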
+ b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r12, lsl #1 + add r1, r1, r2 + add r3, r3, r11, lsl #2 + add r7, r7, r11, lsl #2 + add r8, r8, r11, lsl #2 + add r4, r4, r11, lsl #1 + add r9, r9, r11, lsl #1 + add r10, r10, r11, lsl #1 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_\bpc\()bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + add r7, r3, #(4*(SUM_STRIDE)) + sub r3, r3, #(4*(SUM_STRIDE)) + add r8, r4, #(2*(SUM_STRIDE)) + sub r4, r4, #(2*(SUM_STRIDE)) + mov r9, #(2*SUM_STRIDE) + mov r10, #FILTER_OUT_STRIDE + add r11, r5, #7 + bic r11, r11, #7 // Aligned width +.if \bpc == 8 + sub r2, r2, r11 +.else + sub r2, r2, r11, lsl #1 +.endif + sub r10, r10, r11 + sub r9, r9, r11 + sub r9, r9, #4 // We read 4 extra elements from a + sub r12, r9, #4 // We read 8 extra elements from b + mov lr, r5 + +1: + vld1.16 {q0, q1}, [r4, :128]! + vld1.16 {q2, q3}, [r8, :128]! + vld1.32 {q8, q9}, [r3, :128]! + vld1.32 {q11, q12}, [r7, :128]! + vld1.32 {q10}, [r3, :128]! + vld1.32 {q13}, [r7, :128]! + +2: + vmov.i16 q14, #5 + vmov.i16 q15, #6 + subs r5, r5, #8 + vext.8 q4, q0, q1, #4 // +1-stride + vext.8 q5, q2, q3, #4 // +1+stride + vext.8 q6, q0, q1, #2 // -stride + vext.8 q7, q2, q3, #2 // +stride + vadd.i16 q0, q0, q4 // -1-stride, +1-stride + vadd.i16 q5, q2, q5 // -1+stride, +1+stride + vadd.i16 q2, q6, q7 // -stride, +stride + vadd.i16 q0, q0, q5 + + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q9, q10, #8 + vext.8 q6, q11, q12, #8 // +1+stride + vext.8 q7, q12, q13, #8 + vmul.i16 q0, q0, q14 // * 5 + vmla.i16 q0, q2, q15 // * 6 + vadd.i32 q4, q4, q8 // -1-stride, +1-stride + vadd.i32 q5, q5, q9 + vadd.i32 q6, q6, q11 // -1+stride, +1+stride + vadd.i32 q7, q7, q12 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q7 + vext.8 q6, q8, q9, #4 // -stride + vext.8 q7, q9, q10, #4 + vext.8 q8, q11, q12, #4 // +stride + vext.8 q11, q12, q13, #4 + +.if \bpc == 8 + vld1.8 {d4}, [r1, :64]! +.else + vld1.8 {q2}, [r1, :128]! +.endif + + vmov.i32 q14, #5 + vmov.i32 q15, #6 + + vadd.i32 q6, q6, q8 // -stride, +stride + vadd.i32 q7, q7, q11 + vmul.i32 q4, q4, q14 // * 5 + vmla.i32 q4, q6, q15 // * 6 + vmul.i32 q5, q5, q14 // * 5 + vmla.i32 q5, q7, q15 // * 6 + +.if \bpc == 8 + vmovl.u8 q2, d4 +.endif + vmlal.u16 q4, d0, d4 // b + a * src + vmlal.u16 q5, d1, d5 // b + a * src + vmov q0, q1 + vrshrn.i32 d8, q4, #9 + vrshrn.i32 d9, q5, #9 + vmov q2, q3 + vst1.16 {q4}, [r0, :128]! + + ble 3f + vmov q8, q10 + vmov q11, q13 + vld1.16 {q1}, [r4, :128]! + vld1.16 {q3}, [r8, :128]! + vld1.32 {q9, q10}, [r3, :128]! + vld1.32 {q12, q13}, [r7, :128]! + b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r10, lsl #1 + add r1, r1, r2 + add r3, r3, r9, lsl #2 + add r7, r7, r9, lsl #2 + add r4, r4, r12, lsl #1 + add r8, r8, r12, lsl #1 + + vld1.32 {q8, q9}, [r3, :128]! + vld1.16 {q0, q1}, [r4, :128]! + vld1.32 {q10}, [r3, :128]! + + vmov.i16 q12, #5 + vmov.i16 q13, #6 + +4: + subs r5, r5, #8 + vext.8 q3, q0, q1, #4 // +1 + vext.8 q2, q0, q1, #2 // 0 + vadd.i16 q0, q0, q3 // -1, +1 + + vext.8 q4, q8, q9, #4 // 0 + vext.8 q5, q9, q10, #4 + vext.8 q6, q8, q9, #8 // +1 + vext.8 q7, q9, q10, #8 + vmul.i16 q2, q2, q13 // * 6 + vmla.i16 q2, q0, q12 // * 5 -> a +.if \bpc == 8 + vld1.8 {d22}, [r1, :64]! +.else + vld1.16 {q11}, [r1, :128]! 
+.endif + vadd.i32 q8, q8, q6 // -1, +1 + vadd.i32 q9, q9, q7 +.if \bpc == 8 + vmovl.u8 q11, d22 +.endif + vmul.i32 q4, q4, q15 // * 6 + vmla.i32 q4, q8, q14 // * 5 -> b + vmul.i32 q5, q5, q15 // * 6 + vmla.i32 q5, q9, q14 // * 5 -> b + + vmlal.u16 q4, d4, d22 // b + a * src + vmlal.u16 q5, d5, d23 + vmov q0, q1 + vrshrn.i32 d8, q4, #8 + vrshrn.i32 d9, q5, #8 + vmov q8, q10 + vst1.16 {q4}, [r0, :128]! + + ble 5f + vld1.16 {q1}, [r4, :128]! + vld1.32 {q9, q10}, [r3, :128]! + b 4b + +5: + subs r6, r6, #1 + ble 0f + mov r5, lr + sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started + sub r4, r4, r11, lsl #1 + add r0, r0, r10, lsl #1 + add r1, r1, r2 + sub r3, r3, #16 + sub r4, r4, #16 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt, const int bitdepth_max); +function sgr_weighted1_\bpc\()bpc_neon, export=1 + push {r4-r9,lr} + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] +.if \bpc == 16 + ldr r8, [sp, #44] +.endif + vdup.16 d31, r7 + cmp r6, #2 +.if \bpc == 16 + vdup.16 q14, r8 +.endif + add r9, r0, r1 + add r12, r2, r3 + add lr, r4, #2*FILTER_OUT_STRIDE + mov r7, #(4*FILTER_OUT_STRIDE) + lsl r1, r1, #1 + lsl r3, r3, #1 + add r8, r5, #7 + bic r8, r8, #7 // Aligned width +.if \bpc == 8 + sub r1, r1, r8 + sub r3, r3, r8 +.else + sub r1, r1, r8, lsl #1 + sub r3, r3, r8, lsl #1 +.endif + sub r7, r7, r8, lsl #1 + mov r8, r5 + blt 2f +1: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r12, :64]! +.else + vld1.16 {q0}, [r2, :128]! + vld1.16 {q8}, [r12, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [lr, :128]! + subs r5, r5, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u +.else + vshl.i16 q0, q0, #4 // u + vshl.i16 q8, q8, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q9, q9, q8 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vshll.u16 q10, d16, #7 // u << 7 + vshll.u16 q11, d17, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v + vmlal.s16 q10, d18, d31 // v + vmlal.s16 q11, d19, d31 // v +.if \bpc == 8 + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vrshrn.i32 d20, q10, #11 + vrshrn.i32 d21, q11, #11 + vqmovun.s16 d4, q2 + vqmovun.s16 d20, q10 + vst1.8 {d4}, [r0, :64]! + vst1.8 {d20}, [r9, :64]! +.else + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vqrshrun.s32 d20, q10, #11 + vqrshrun.s32 d21, q11, #11 + vmin.u16 q2, q2, q14 + vmin.u16 q10, q10, q14 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q10}, [r9, :128]! +.endif + bgt 1b + + sub r6, r6, #2 + cmp r6, #1 + blt 0f + mov r5, r8 + add r0, r0, r1 + add r9, r9, r1 + add r2, r2, r3 + add r12, r12, r3 + add r4, r4, r7 + add lr, lr, r7 + beq 2f + b 1b + +2: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! +.else + vld1.16 {q0}, [r2, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + subs r5, r5, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u +.else + vshl.i16 q0, q0, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v +.if \bpc == 8 + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vqmovun.s16 d2, q2 + vst1.8 {d2}, [r0, :64]! +.else + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vmin.u16 q2, q2, q14 + vst1.16 {q2}, [r0, :128]! 
+.endif + bgt 2b +0: + pop {r4-r9,pc} +endfunc + +// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2], const int bitdepth_max); +function sgr_weighted2_\bpc\()bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.if \bpc == 8 + ldr r8, [sp, #52] +.else + ldrd r8, r9, [sp, #52] +.endif + cmp r7, #2 + add r10, r0, r1 + add r11, r2, r3 + add r12, r4, #2*FILTER_OUT_STRIDE + add lr, r5, #2*FILTER_OUT_STRIDE + vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] +.if \bpc == 16 + vdup.16 q14, r9 +.endif + mov r8, #4*FILTER_OUT_STRIDE + lsl r1, r1, #1 + lsl r3, r3, #1 + add r9, r6, #7 + bic r9, r9, #7 // Aligned width +.if \bpc == 8 + sub r1, r1, r9 + sub r3, r3, r9 +.else + sub r1, r1, r9, lsl #1 + sub r3, r3, r9, lsl #1 +.endif + sub r8, r8, r9, lsl #1 + mov r9, r6 + blt 2f +1: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r11, :64]! +.else + vld1.16 {q0}, [r2, :128]! + vld1.16 {q8}, [r11, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [r12, :128]! + vld1.16 {q2}, [r5, :128]! + vld1.16 {q10}, [lr, :128]! + subs r6, r6, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u +.else + vshl.i16 q0, q0, #4 // u + vshl.i16 q8, q8, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vsub.i16 q9, q9, q8 // t1 - u + vsub.i16 q10, q10, q8 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vshll.u16 q11, d16, #7 // u << 7 + vshll.u16 q8, d17, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) + vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) + vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) + vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) + vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) +.if \bpc == 8 + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vrshrn.i32 d22, q11, #11 + vrshrn.i32 d23, q8, #11 + vqmovun.s16 d6, q3 + vqmovun.s16 d22, q11 + vst1.8 {d6}, [r0, :64]! + vst1.8 {d22}, [r10, :64]! +.else + vqrshrun.s32 d6, q3, #11 + vqrshrun.s32 d7, q0, #11 + vqrshrun.s32 d22, q11, #11 + vqrshrun.s32 d23, q8, #11 + vmin.u16 q3, q3, q14 + vmin.u16 q11, q11, q14 + vst1.16 {q3}, [r0, :128]! + vst1.16 {q11}, [r10, :128]! +.endif + bgt 1b + + subs r7, r7, #2 + cmp r7, #1 + blt 0f + mov r6, r9 + add r0, r0, r1 + add r10, r10, r1 + add r2, r2, r3 + add r11, r11, r3 + add r4, r4, r8 + add r12, r12, r8 + add r5, r5, r8 + add lr, lr, r8 + beq 2f + b 1b + +2: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! +.else + vld1.16 {q0}, [r2, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r5, :128]! + subs r6, r6, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u +.else + vshl.i16 q0, q0, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) +.if \bpc == 8 + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vqmovun.s16 d6, q3 + vst1.8 {d6}, [r0, :64]! +.else + vqrshrun.s32 d6, q3, #11 + vqrshrun.s32 d7, q0, #11 + vmin.u16 q3, q3, q14 + vst1.16 {q3}, [r0, :128]! 
+.endif + bgt 1b +0: + pop {r4-r11,pc} +endfunc +.endm diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S new file mode 100644 index 0000000000..1b60a7bdb3 --- /dev/null +++ b/third_party/dav1d/src/arm/32/mc.S @@ -0,0 +1,3340 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro avg dst0, dst1, t0, t1, t2, t3 + vld1.16 {\t0,\t1}, [r2, :128]! + vld1.16 {\t2,\t3}, [r3, :128]! + vadd.i16 \t0, \t0, \t2 + vadd.i16 \t1, \t1, \t3 + vqrshrun.s16 \dst0, \t0, #5 + vqrshrun.s16 \dst1, \t1, #5 +.endm + +.macro w_avg dst0, dst1, t0, t1, t2, t3 + vld1.16 {\t0,\t1}, [r2, :128]! + vld1.16 {\t2,\t3}, [r3, :128]! + vsub.i16 \t0, \t2, \t0 + vsub.i16 \t1, \t3, \t1 + vqdmulh.s16 \t0, \t0, q15 + vqdmulh.s16 \t1, \t1, q15 + vadd.i16 \t0, \t2, \t0 + vadd.i16 \t1, \t3, \t1 + vqrshrun.s16 \dst0, \t0, #4 + vqrshrun.s16 \dst1, \t1, #4 +.endm + +.macro mask dst0, dst1, t0, t1, t2, t3 + vld1.8 {q14}, [lr, :128]! + vld1.16 {\t0,\t1}, [r2, :128]! + vmul.i8 q14, q14, q15 + vld1.16 {\t2,\t3}, [r3, :128]! 
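+ // q15 holds -2, so q14 is now -2*m per byte; the widening shifts below
+ // turn that into -(m << 9), vqdmulh then gives -((tmp2 - tmp1)*m) >> 6,
+ // and adding tmp2 leaves, in effect, (m*tmp1 + (64 - m)*tmp2) >> 6 ahead
+ // of the final rounding narrow.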
+ vshll.i8 q13, d28, #8 + vshll.i8 q14, d29, #8 + vsub.i16 \t0, \t2, \t0 + vsub.i16 \t1, \t3, \t1 + vqdmulh.s16 \t0, \t0, q13 + vqdmulh.s16 \t1, \t1, q14 + vadd.i16 \t0, \t2, \t0 + vadd.i16 \t1, \t3, \t1 + vqrshrun.s16 \dst0, \t0, #4 + vqrshrun.s16 \dst1, \t1, #4 +.endm + +.macro bidir_fn type +function \type\()_8bpc_neon, export=1 + push {r4-r6,lr} + ldrd r4, r5, [sp, #16] + clz r4, r4 +.ifnc \type, avg + ldr lr, [sp, #24] +.endif +.ifc \type, w_avg + vdup.s16 q15, lr + vneg.s16 q15, q15 + vshl.i16 q15, q15, #11 +.endif +.ifc \type, mask + vmov.i8 q15, #256-2 +.endif + adr r12, L(\type\()_tbl) + sub r4, r4, #24 + ldr r4, [r12, r4, lsl #2] + \type d16, d17, q0, q1, q2, q3 + add r12, r12, r4 + bx r12 + + .align 2 +L(\type\()_tbl): + .word 1280f - L(\type\()_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_tbl) + CONFIG_THUMB + .word 4f - L(\type\()_tbl) + CONFIG_THUMB + +4: + add r6, r0, r1 + lsl r1, r1, #1 + cmp r5, #4 + vst1.32 {d16[0]}, [r0, :32], r1 + vst1.32 {d16[1]}, [r6, :32], r1 + vst1.32 {d17[0]}, [r0, :32], r1 + vst1.32 {d17[1]}, [r6, :32], r1 + beq 0f + \type d18, d19, q0, q1, q2, q3 + cmp r5, #8 + vst1.32 {d18[0]}, [r0, :32], r1 + vst1.32 {d18[1]}, [r6, :32], r1 + vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d19[1]}, [r6, :32], r1 + beq 0f + \type d16, d17, q0, q1, q2, q3 + vst1.32 {d16[0]}, [r0, :32], r1 + vst1.32 {d16[1]}, [r6, :32], r1 + \type d18, d19, q0, q1, q2, q3 + vst1.32 {d17[0]}, [r0, :32], r1 + vst1.32 {d17[1]}, [r6, :32], r1 + vst1.32 {d18[0]}, [r0, :32], r1 + vst1.32 {d18[1]}, [r6, :32], r1 + vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d19[1]}, [r6, :32], r1 + pop {r4-r6,pc} +80: + add r6, r0, r1 + lsl r1, r1, #1 +8: + vst1.8 {d16}, [r0, :64], r1 + \type d18, d19, q0, q1, q2, q3 + vst1.8 {d17}, [r6, :64], r1 + vst1.8 {d18}, [r0, :64], r1 + subs r5, r5, #4 + vst1.8 {d19}, [r6, :64], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 8b +160: + add r6, r0, r1 + lsl r1, r1, #1 +16: + \type d18, d19, q0, q1, q2, q3 + vst1.8 {q8}, [r0, :128], r1 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q9}, [r6, :128], r1 + \type d22, d23, q0, q1, q2, q3 + vst1.8 {q10}, [r0, :128], r1 + subs r5, r5, #4 + vst1.8 {q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 16b +320: + add r6, r0, r1 + lsl r1, r1, #1 +32: + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128], r1 + \type d22, d23, q0, q1, q2, q3 + subs r5, r5, #2 + vst1.8 {q10, q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 32b +640: + add r6, r0, #32 +64: + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + \type d22, d23, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128], r1 + \type d16, d17, q0, q1, q2, q3 + vst1.8 {q10, q11}, [r6, :128], r1 + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128], r1 + \type d22, d23, q0, q1, q2, q3 + subs r5, r5, #2 + vst1.8 {q10, q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 64b +1280: + sub r1, r1, #32 + add r6, r0, #64 +128: + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + \type d22, d23, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r0, :128]! + \type d16, d17, q0, q1, q2, q3 + vst1.8 {q10, q11}, [r0, :128], r1 + \type d18, d19, q0, q1, q2, q3 + \type d20, d21, q0, q1, q2, q3 + vst1.8 {q8, q9}, [r6, :128]! 
+ \type d22, d23, q0, q1, q2, q3 + subs r5, r5, #1 + vst1.8 {q10, q11}, [r6, :128], r1 + ble 0f + \type d16, d17, q0, q1, q2, q3 + b 128b + +0: + pop {r4-r6,pc} +endfunc +.endm + +bidir_fn avg +bidir_fn w_avg +bidir_fn mask + + +.macro w_mask_fn type +function w_mask_\type\()_8bpc_neon, export=1 + push {r4-r9,lr} + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] + clz r8, r4 + adr r9, L(w_mask_\type\()_tbl) + sub r8, r8, #24 + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + movw r12, #6903 + vdup.16 q14, r12 +.if \type == 444 + vmov.i8 q15, #64 +.elseif \type == 422 + vdup.8 d0, r7 // d0[] <- sign + vmov.i8 d30, #129 + vsub.i8 d30, d30, d0 // 129 - sign +.elseif \type == 420 + vdup.16 q0, r7 // d0[] <- sign + vmov.i16 q15, #256 + vsub.i16 q15, q15, q0 // 256 - sign +.endif + add r12, r0, r1 + lsl r1, r1, #1 + bx r9 + + .align 2 +L(w_mask_\type\()_tbl): + .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + +4: + vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once) + vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once) + subs r5, r5, #4 + vsub.i16 q8, q2, q0 // tmp2-tmp1 + vsub.i16 q9, q3, q1 + vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x])) + vabd.s16 q11, q1, q3 + vqsub.u16 q10, q14, q10 // 6903 - abs () + vqsub.u16 q11, q14, q11 + vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8 + vshr.s16 q11, q11, #8 + vshl.s16 q12, q10, #9 // (64-m)<<9 + vshl.s16 q13, q11, #9 + vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15 + vqdmulh.s16 q13, q13, q9 + vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1 + vadd.i16 q13, q13, q1 + vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 +.if \type == 444 + vmovn.u16 d20, q10 // 64 - m + vmovn.u16 d21, q11 + vsub.i8 q10, q15, q10 // m + vst1.8 {d20, d21}, [r6, :128]! +.elseif \type == 422 + vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition) + vpadd.s16 d21, d22, d23 + vmovn.s16 d6, q10 + vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1 + vst1.8 {d6}, [r6, :64]! +.elseif \type == 420 + vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition) + vadd.s16 d21, d22, d23 + vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) + vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d20[0]}, [r6, :32]! +.endif + vst1.32 {d24[0]}, [r0, :32], r1 + vst1.32 {d24[1]}, [r12, :32], r1 + vst1.32 {d25[0]}, [r0, :32], r1 + vst1.32 {d25[1]}, [r12, :32], r1 + bgt 4b + pop {r4-r9,pc} +8: + vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2 + vld1.16 {d4, d5, d6, d7}, [r3, :128]! 
// tmp2y1, tmp2y2 + subs r5, r5, #2 + vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1 + vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2 + vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1) + vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2) + vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) + vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2) + vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 + vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8 + vshl.s16 q12, q10, #9 // (64 - my1) << 9 + vshl.s16 q13, q11, #9 // (64 - my2) << 9 + vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 + vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 + vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 + vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 + vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 +.if \type == 444 + vmovn.u16 d20, q10 // 64 - m + vmovn.u16 d21, q11 + vsub.i8 q10, q15, q10 // m + vst1.8 {d20, d21}, [r6, :128]! +.elseif \type == 422 + vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition) + vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2) + vmovn.s16 d20, q10 + vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1 + vst1.8 {d20}, [r6, :64]! +.elseif \type == 420 + vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) + vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d20[0]}, [r6, :32]! +.endif + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d25}, [r12, :64], r1 + bgt 8b + pop {r4-r9,pc} +1280: +640: +320: +160: + sub r1, r1, r4 +.if \type == 444 + add lr, r6, r4 +.elseif \type == 422 + add lr, r6, r4, lsr #1 +.endif + add r9, r3, r4, lsl #1 + add r7, r2, r4, lsl #1 +161: + mov r8, r4 +16: + vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1 + vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1 + vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2 + subs r8, r8, #16 + vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1 + vsub.i16 q3, q3, q1 + vabs.s16 q10, q2 // abs(tm2y1 - tmp1y1) + vabs.s16 q11, q3 + vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) + vqsub.u16 q11, q14, q11 + vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 + vshr.s16 q11, q11, #8 + vshl.s16 q12, q10, #9 // (64 - my1) << 9 + vshl.s16 q13, q11, #9 + vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 + vqdmulh.s16 q13, q13, q3 + vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 + vadd.i16 q13, q13, q1 + vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2h2 +.if \type == 444 + vmovn.u16 d20, q10 // 64 - my1 + vmovn.u16 d21, q11 + vsub.i8 q10, q15, q10 // my1 + vst1.8 {d20, d21}, [r6, :128]! +.elseif \type == 422 + vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition) + vpadd.s16 d21, d22, d23 + vmovn.s16 d20, q10 + vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1 + vst1.8 {d20}, [r6, :64]! +.endif + vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 + vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2 + vsub.i16 q1, q1, q9 + vst1.16 {d24, d25}, [r0, :128]! 
// store dsty1 + vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2) + vabs.s16 q3, q1 + vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2) + vqsub.u16 q3, q14, q3 + vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8 + vshr.s16 q3, q3, #8 + vshl.s16 q12, q2, #9 // (64 - my2) << 9 + vshl.s16 q13, q3, #9 +.if \type == 444 + vmovn.u16 d4, q2 // 64 - my2 + vmovn.u16 d5, q3 + vsub.i8 q2, q15, q2 // my2 + vst1.8 {d4, d5}, [lr, :128]! +.elseif \type == 422 + vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition) + vpadd.s16 d5, d6, d7 + vmovn.s16 d4, q2 + vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1 + vst1.8 {d4}, [lr, :64]! +.elseif \type == 420 + vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition) + vadd.s16 q11, q11, q3 + vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) + vpadd.s16 d21, d22, d23 + vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.8 {d20}, [r6, :64]! +.endif + vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 + vqdmulh.s16 q13, q13, q1 + vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 + vadd.i16 q13, q13, q9 + vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 + vqrshrun.s16 d25, q13, #4 + vst1.16 {d24, d25}, [r12, :128]! // store dsty2 + bgt 16b + subs r5, r5, #2 + add r2, r2, r4, lsl #1 + add r3, r3, r4, lsl #1 + add r7, r7, r4, lsl #1 + add r9, r9, r4, lsl #1 +.if \type == 444 + add r6, r6, r4 + add lr, lr, r4 +.elseif \type == 422 + add r6, r6, r4, lsr #1 + add lr, lr, r4, lsr #1 +.endif + add r0, r0, r1 + add r12, r12, r1 + bgt 161b + pop {r4-r9,pc} +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + + +function blend_8bpc_neon, export=1 + push {r4-r5,lr} + ldrd r4, r5, [sp, #12] + clz lr, r3 + adr r3, L(blend_tbl) + sub lr, lr, #26 + ldr lr, [r3, lr, lsl #2] + add r3, r3, lr + bx r3 + + .align 2 +L(blend_tbl): + .word 320f - L(blend_tbl) + CONFIG_THUMB + .word 160f - L(blend_tbl) + CONFIG_THUMB + .word 80f - L(blend_tbl) + CONFIG_THUMB + .word 40f - L(blend_tbl) + CONFIG_THUMB + +40: + vmov.i8 d22, #64 + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld1.u8 {d2}, [r5, :64]! + vld1.u8 {d1}, [r2, :64]! + vld1.32 {d0[]}, [r0, :32] + subs r4, r4, #2 + vld1.32 {d0[1]}, [r12, :32] + vsub.i8 d3, d22, d2 + vmull.u8 q8, d1, d2 + vmlal.u8 q8, d0, d3 + vrshrn.i16 d20, q8, #6 + vst1.32 {d20[0]}, [r0, :32], r1 + vst1.32 {d20[1]}, [r12, :32], r1 + bgt 4b + pop {r4-r5,pc} +80: + vmov.i8 d16, #64 + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld1.u8 {q1}, [r5, :128]! + vld1.u8 {q2}, [r2, :128]! + vld1.u8 {d0}, [r0, :64] + vsub.i8 d17, d16, d2 + vld1.u8 {d1}, [r12, :64] + subs r4, r4, #2 + vsub.i8 d18, d16, d3 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d17 + vmull.u8 q10, d3, d5 + vmlal.u8 q10, d1, d18 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q10, #6 + vst1.u8 {d22}, [r0, :64], r1 + vst1.u8 {d23}, [r12, :64], r1 + bgt 8b + pop {r4-r5,pc} +160: + vmov.i8 q12, #64 + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld1.u8 {q1, q2}, [r5, :128]! + vld1.u8 {q8, q9}, [r2, :128]! 
+ vld1.u8 {q0}, [r0, :128] + subs r4, r4, #2 + vsub.i8 q15, q12, q1 + vld1.u8 {q13}, [r12, :128] + vmull.u8 q3, d16, d2 + vmlal.u8 q3, d0, d30 + vmull.u8 q14, d17, d3 + vmlal.u8 q14, d1, d31 + vsub.i8 q15, q12, q2 + vrshrn.i16 d20, q3, #6 + vrshrn.i16 d21, q14, #6 + vmull.u8 q3, d18, d4 + vmlal.u8 q3, d26, d30 + vmull.u8 q14, d19, d5 + vmlal.u8 q14, d27, d31 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q14, #6 + vst1.u8 {q10}, [r0, :128], r1 + vst1.u8 {q11}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +320: + vmov.i8 q10, #64 +32: + vld1.u8 {q2, q3}, [r5, :128]! + vld1.u8 {q8, q9}, [r2, :128]! + vld1.u8 {q0, q1}, [r0, :128] + subs r4, r4, #1 + vsub.i8 q11, q10, q2 + vmull.u8 q15, d16, d4 + vmlal.u8 q15, d0, d22 + vmull.u8 q14, d17, d5 + vmlal.u8 q14, d1, d23 + vsub.i8 q11, q10, q3 + vrshrn.i16 d24, q15, #6 + vrshrn.i16 d25, q14, #6 + vmull.u8 q15, d18, d6 + vmlal.u8 q15, d2, d22 + vmull.u8 q14, d19, d7 + vmlal.u8 q14, d3, d23 + vrshrn.i16 d26, q15, #6 + vrshrn.i16 d27, q14, #6 + vst1.u8 {q12, q13}, [r0, :128], r1 + bgt 32b + pop {r4-r5,pc} +endfunc + +function blend_h_8bpc_neon, export=1 + push {r4-r5,lr} + ldr r4, [sp, #12] + movrel r5, X(obmc_masks) + add r5, r5, r4 + sub r4, r4, r4, lsr #2 + clz lr, r3 + adr r12, L(blend_h_tbl) + sub lr, lr, #24 + ldr lr, [r12, lr, lsl #2] + add r12, r12, lr + bx r12 + + .align 2 +L(blend_h_tbl): + .word 1280f - L(blend_h_tbl) + CONFIG_THUMB + .word 640f - L(blend_h_tbl) + CONFIG_THUMB + .word 320f - L(blend_h_tbl) + CONFIG_THUMB + .word 160f - L(blend_h_tbl) + CONFIG_THUMB + .word 80f - L(blend_h_tbl) + CONFIG_THUMB + .word 40f - L(blend_h_tbl) + CONFIG_THUMB + .word 20f - L(blend_h_tbl) + CONFIG_THUMB + +20: + vmov.i8 d22, #64 + add r12, r0, r1 + lsl r1, r1, #1 +2: + vld1.16 {d2[], d3[]}, [r5, :16]! + vld1.32 {d1[]}, [r2, :32]! + subs r4, r4, #2 + vld1.16 {d0[]}, [r0, :16] + vzip.8 d2, d3 + vsub.i8 d4, d22, d2 + vld1.16 {d0[1]}, [r12, :16] + vmull.u8 q8, d1, d2 + vmlal.u8 q8, d0, d4 + vrshrn.i16 d20, q8, #6 + vst1.16 {d20[0]}, [r0, :16], r1 + vst1.16 {d20[1]}, [r12, :16], r1 + bgt 2b + pop {r4-r5,pc} +40: + vmov.i8 d22, #64 + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld2.u8 {d2[], d3[]}, [r5, :16]! + vld1.u8 {d1}, [r2, :64]! + subs r4, r4, #2 + vext.u8 d2, d2, d3, #4 + vld1.32 {d0[]}, [r0, :32] + vsub.i8 d6, d22, d2 + vld1.32 {d0[1]}, [r12, :32] + vmull.u8 q8, d1, d2 + vmlal.u8 q8, d0, d6 + vrshrn.i16 d20, q8, #6 + vst1.32 {d20[0]}, [r0, :32], r1 + vst1.32 {d20[1]}, [r12, :32], r1 + bgt 4b + pop {r4-r5,pc} +80: + vmov.i8 q8, #64 + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld2.u8 {d2[], d3[]}, [r5, :16]! + vld1.u8 {d4, d5}, [r2, :128]! + vld1.u8 {d0}, [r0, :64] + vsub.i8 q9, q8, q1 + vld1.u8 {d1}, [r12, :64] + subs r4, r4, #2 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d18 + vmull.u8 q10, d3, d5 + vmlal.u8 q10, d1, d19 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q10, #6 + vst1.u8 {d22}, [r0, :64], r1 + vst1.u8 {d23}, [r12, :64], r1 + bgt 8b + pop {r4-r5,pc} +160: + vmov.i8 q12, #64 + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld2.u8 {d28[], d29[]}, [r5, :16]! + vld1.u8 {d2, d3, d4, d5}, [r2, :128]! 
+ vsub.i8 q15, q12, q14 + vld1.u8 {q0}, [r0, :128] + subs r4, r4, #2 + vld1.u8 {q13}, [r12, :128] + vmull.u8 q3, d2, d28 + vmlal.u8 q3, d0, d30 + vmull.u8 q8, d3, d28 + vmlal.u8 q8, d1, d30 + vrshrn.i16 d18, q3, #6 + vrshrn.i16 d19, q8, #6 + vmull.u8 q3, d4, d29 + vmlal.u8 q3, d26, d31 + vmull.u8 q8, d5, d29 + vmlal.u8 q8, d27, d31 + vrshrn.i16 d20, q3, #6 + vrshrn.i16 d21, q8, #6 + vst1.u8 {q9}, [r0, :128], r1 + vst1.u8 {q10}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +320: +640: +1280: + vmov.i8 d20, #64 + sub r1, r1, r3 +321: + vld1.u8 {d6[]}, [r5]! + vsub.i8 d7, d20, d6 + mov r12, r3 +32: + vld1.u8 {q8, q9}, [r2, :128]! + vld1.u8 {q0, q1}, [r0, :128] + vmull.u8 q15, d16, d6 + vmlal.u8 q15, d0, d7 + vmull.u8 q14, d17, d6 + vmlal.u8 q14, d1, d7 + vrshrn.i16 d0, q15, #6 + vrshrn.i16 d1, q14, #6 + vmull.u8 q15, d18, d6 + vmlal.u8 q15, d2, d7 + vmull.u8 q14, d19, d6 + vmlal.u8 q14, d3, d7 + vrshrn.i16 d2, q15, #6 + vrshrn.i16 d3, q14, #6 + subs r12, r12, #32 + vst1.u8 {q0, q1}, [r0, :128]! + bgt 32b + add r0, r0, r1 + subs r4, r4, #1 + bgt 321b + pop {r4-r5,pc} +endfunc + +function blend_v_8bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + movrel lr, X(obmc_masks) + add lr, lr, r3 + clz r12, r3 + adr r3, L(blend_v_tbl) + sub r12, r12, #26 + ldr r12, [r3, r12, lsl #2] + add r3, r3, r12 + bx r3 + + .align 2 +L(blend_v_tbl): + .word 320f - L(blend_v_tbl) + CONFIG_THUMB + .word 160f - L(blend_v_tbl) + CONFIG_THUMB + .word 80f - L(blend_v_tbl) + CONFIG_THUMB + .word 40f - L(blend_v_tbl) + CONFIG_THUMB + .word 20f - L(blend_v_tbl) + CONFIG_THUMB + +20: + vmov.i8 d22, #64 + vld1.8 {d2[]}, [lr] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 d3, d22, d2 +2: + vld1.16 {d1[0]}, [r2, :16]! + vld1.8 {d0[]}, [r0] + subs r4, r4, #2 + vld1.8 {d1[1]}, [r2] + vld1.8 {d0[1]}, [r12] + vmull.u8 q2, d1, d2 + vmlal.u8 q2, d0, d3 + vrshrn.i16 d6, q2, #6 + add r2, r2, #2 + vst1.8 {d6[0]}, [r0], r1 + vst1.8 {d6[1]}, [r12], r1 + bgt 2b + pop {r4,pc} +40: + vmov.i8 d22, #64 + vld1.32 {d4[]}, [lr, :32] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 d5, d22, d4 + sub r1, r1, #2 +4: + vld1.u8 {d2}, [r2, :64]! + vld1.32 {d0[]}, [r0, :32] + vld1.32 {d0[1]}, [r12, :32] + subs r4, r4, #2 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d5 + vrshrn.i16 d20, q3, #6 + vst1.16 {d20[0]}, [r0, :16]! + vst1.16 {d20[2]}, [r12, :16]! + vst1.8 {d20[2]}, [r0], r1 + vst1.8 {d20[6]}, [r12], r1 + bgt 4b + pop {r4,pc} +80: + vmov.i8 d16, #64 + vld1.u8 {d2}, [lr, :64] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 d17, d16, d2 + sub r1, r1, #4 +8: + vld1.u8 {d4, d5}, [r2, :128]! + vld1.u8 {d0}, [r0, :64] + vld1.u8 {d1}, [r12, :64] + subs r4, r4, #2 + vmull.u8 q3, d2, d4 + vmlal.u8 q3, d0, d17 + vmull.u8 q10, d2, d5 + vmlal.u8 q10, d1, d17 + vrshrn.i16 d22, q3, #6 + vrshrn.i16 d23, q10, #6 + vst1.32 {d22[0]}, [r0, :32]! + vst1.32 {d23[0]}, [r12, :32]! + vst1.16 {d22[2]}, [r0, :16], r1 + vst1.16 {d23[2]}, [r12, :16], r1 + bgt 8b + pop {r4,pc} +160: + vmov.i8 q12, #64 + vld1.u8 {q14}, [lr, :128] + add r12, r0, r1 + lsl r1, r1, #1 + vsub.i8 q11, q12, q14 + sub r1, r1, #8 +16: + vld1.u8 {q1, q2}, [r2, :128]! + vld1.u8 {q0}, [r0, :128] + subs r4, r4, #2 + vld1.u8 {q13}, [r12, :128] + vmull.u8 q3, d2, d28 + vmlal.u8 q3, d0, d22 + vmull.u8 q8, d3, d29 + vmlal.u8 q8, d1, d23 + vrshrn.i16 d18, q3, #6 + vrshrn.i16 d19, q8, #6 + vmull.u8 q3, d4, d28 + vmlal.u8 q3, d26, d22 + vmull.u8 q8, d5, d29 + vmlal.u8 q8, d27, d23 + vrshrn.i16 d20, q3, #6 + vrshrn.i16 d21, q8, #6 + vst1.u8 {d18}, [r0, :64]! + vst1.u8 {d20}, [r12, :64]! 
+ vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d21[0]}, [r12, :32], r1 + bgt 16b + pop {r4,pc} +320: + vmov.i8 q10, #64 + vld1.u8 {q2, q3}, [lr, :128] + vsub.i8 q11, q10, q2 + vsub.i8 d24, d20, d6 +32: + vld1.u8 {q8, q9}, [r2, :128]! + vld1.u8 {d0, d1, d2}, [r0, :64] + subs r4, r4, #1 + vmull.u8 q15, d16, d4 + vmlal.u8 q15, d0, d22 + vmull.u8 q14, d17, d5 + vmlal.u8 q14, d1, d23 + vrshrn.i16 d0, q15, #6 + vrshrn.i16 d1, q14, #6 + vmull.u8 q15, d18, d6 + vmlal.u8 q15, d2, d24 + vrshrn.i16 d2, q15, #6 + vst1.u8 {d0, d1, d2}, [r0, :64], r1 + bgt 32b + pop {r4,pc} +endfunc + + +// This has got the same signature as the put_8tap functions, +// assumes that the caller has loaded the h argument into r5, +// and assumes that r8 is set to (clz(w)-24). +function put_neon + adr r9, L(put_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(put_tbl): + .word 1280f - L(put_tbl) + CONFIG_THUMB + .word 640f - L(put_tbl) + CONFIG_THUMB + .word 32f - L(put_tbl) + CONFIG_THUMB + .word 160f - L(put_tbl) + CONFIG_THUMB + .word 8f - L(put_tbl) + CONFIG_THUMB + .word 4f - L(put_tbl) + CONFIG_THUMB + .word 2f - L(put_tbl) + CONFIG_THUMB + +2: + vld1.16 {d0[]}, [r2], r3 + vld1.16 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d1[0]}, [r0, :16], r1 + bgt 2b + pop {r4-r11,pc} +4: + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + bgt 4b + pop {r4-r11,pc} +8: + vld1.8 {d0}, [r2], r3 + vld1.8 {d1}, [r2], r3 + subs r5, r5, #2 + vst1.8 {d0}, [r0, :64], r1 + vst1.8 {d1}, [r0, :64], r1 + bgt 8b + pop {r4-r11,pc} +160: + add r8, r0, r1 + lsl r1, r1, #1 + add r9, r2, r3 + lsl r3, r3, #1 +16: + vld1.8 {q0}, [r2], r3 + vld1.8 {q1}, [r9], r3 + subs r5, r5, #2 + vst1.8 {q0}, [r0, :128], r1 + vst1.8 {q1}, [r8, :128], r1 + bgt 16b + pop {r4-r11,pc} +32: + vld1.8 {q0, q1}, [r2], r3 + subs r5, r5, #1 + vst1.8 {q0, q1}, [r0, :128], r1 + bgt 32b + pop {r4-r11,pc} +640: + sub r1, r1, #32 + sub r3, r3, #32 +64: + vld1.8 {q0, q1}, [r2]! + vst1.8 {q0, q1}, [r0, :128]! + vld1.8 {q2, q3}, [r2], r3 + subs r5, r5, #1 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 64b + pop {r4-r11,pc} +1280: + sub r1, r1, #96 + sub r3, r3, #96 +128: + vld1.8 {q8, q9}, [r2]! + vst1.8 {q8, q9}, [r0, :128]! + vld1.8 {q10, q11}, [r2]! + vst1.8 {q10, q11}, [r0, :128]! + vld1.8 {q12, q13}, [r2]! + vst1.8 {q12, q13}, [r0, :128]! + vld1.8 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.8 {q14, q15}, [r0, :128], r1 + bgt 128b + pop {r4-r11,pc} +endfunc + + +// This has got the same signature as the put_8tap functions, +// assumes that the caller has loaded the h argument into r4, +// and assumes that r8 is set to (clz(w)-24), and r7 to w*2. +function prep_neon + adr r9, L(prep_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(prep_tbl): + .word 1280f - L(prep_tbl) + CONFIG_THUMB + .word 640f - L(prep_tbl) + CONFIG_THUMB + .word 320f - L(prep_tbl) + CONFIG_THUMB + .word 160f - L(prep_tbl) + CONFIG_THUMB + .word 8f - L(prep_tbl) + CONFIG_THUMB + .word 4f - L(prep_tbl) + CONFIG_THUMB + +4: + vld1.32 {d0[]}, [r1], r2 + vld1.32 {d2[]}, [r1], r2 + subs r4, r4, #2 + vshll.u8 q0, d0, #4 + vshll.u8 q1, d2, #4 + vst1.16 {d1, d2}, [r0, :64]! + bgt 4b + pop {r4-r11,pc} +8: + vld1.8 {d0}, [r1], r2 + vld1.8 {d2}, [r1], r2 + subs r4, r4, #2 + vshll.u8 q0, d0, #4 + vshll.u8 q1, d2, #4 + vst1.16 {q0, q1}, [r0, :128]! 
+ bgt 8b + pop {r4-r11,pc} +160: + add r9, r1, r2 + lsl r2, r2, #1 + add r8, r0, r7 + lsl r7, r7, #1 +16: + vld1.8 {q2}, [r1], r2 + vld1.8 {q3}, [r9], r2 + subs r4, r4, #2 + vshll.u8 q0, d4, #4 + vshll.u8 q1, d5, #4 + vshll.u8 q2, d6, #4 + vshll.u8 q3, d7, #4 + vst1.16 {q0, q1}, [r0, :128], r7 + vst1.16 {q2, q3}, [r8, :128], r7 + bgt 16b + pop {r4-r11,pc} +320: + add r8, r0, r3 +32: + vld1.8 {q0, q1}, [r1], r2 + subs r4, r4, #2 + vshll.u8 q8, d0, #4 + vshll.u8 q9, d1, #4 + vld1.8 {q2, q3}, [r1], r2 + vshll.u8 q10, d2, #4 + vshll.u8 q11, d3, #4 + vshll.u8 q12, d4, #4 + vst1.16 {q8, q9}, [r0, :128], r7 + vshll.u8 q13, d5, #4 + vst1.16 {q10, q11}, [r8, :128], r7 + vshll.u8 q14, d6, #4 + vst1.16 {q12, q13}, [r0, :128], r7 + vshll.u8 q15, d7, #4 + vst1.16 {q14, q15}, [r8, :128], r7 + bgt 32b + pop {r4-r11,pc} +640: + sub r2, r2, #32 + add r8, r0, #32 + mov r6, #64 +64: + vld1.8 {q0, q1}, [r1]! + subs r4, r4, #1 + vshll.u8 q8, d0, #4 + vshll.u8 q9, d1, #4 + vld1.8 {q2, q3}, [r1], r2 + vshll.u8 q10, d2, #4 + vshll.u8 q11, d3, #4 + vshll.u8 q12, d4, #4 + vst1.16 {q8, q9}, [r0, :128], r6 + vshll.u8 q13, d5, #4 + vshll.u8 q14, d6, #4 + vst1.16 {q10, q11}, [r8, :128], r6 + vshll.u8 q15, d7, #4 + vst1.16 {q12, q13}, [r0, :128], r6 + vst1.16 {q14, q15}, [r8, :128], r6 + bgt 64b + pop {r4-r11,pc} +1280: + sub r2, r2, #96 + add r8, r0, #32 + mov r6, #64 +128: + vld1.8 {q0, q1}, [r1]! + vld1.8 {q2, q3}, [r1]! + vshll.u8 q10, d0, #4 + vshll.u8 q11, d1, #4 + vshll.u8 q12, d2, #4 + vshll.u8 q13, d3, #4 + vshll.u8 q14, d4, #4 + vshll.u8 q15, d5, #4 + vld1.8 {q8, q9}, [r1]! + vst1.16 {q10, q11}, [r0, :128], r6 + vst1.16 {q12, q13}, [r8, :128], r6 + vshll.u8 q0, d6, #4 + vshll.u8 q1, d7, #4 + vshll.u8 q2, d16, #4 + vshll.u8 q3, d17, #4 + vshll.u8 q8, d18, #4 + vshll.u8 q9, d19, #4 + vld1.8 {q10, q11}, [r1], r2 + vst1.16 {q14, q15}, [r0, :128], r6 + vst1.16 {q0, q1}, [r8, :128], r6 + vshll.u8 q12, d20, #4 + vshll.u8 q13, d21, #4 + vshll.u8 q14, d22, #4 + vshll.u8 q15, d23, #4 + subs r4, r4, #1 + vst1.16 {q2, q3}, [r0, :128], r6 + vst1.16 {q8, q9}, [r8, :128], r6 + vst1.16 {q12, q13}, [r0, :128], r6 + vst1.16 {q14, q15}, [r8, :128], r6 + bgt 128b + pop {r4-r11,pc} +endfunc + + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + vld1.\wd {\d0[]}, [\s0], \strd + vld1.\wd {\d1[]}, [\s1], \strd +.ifnb \d2 + vld1.\wd {\d2[]}, [\s0], \strd + vld1.\wd {\d3[]}, [\s1], \strd +.endif +.ifnb \d4 + vld1.\wd {\d4[]}, [\s0], \strd +.endif +.ifnb \d5 + vld1.\wd {\d5[]}, [\s1], \strd +.endif +.ifnb \d6 + vld1.\wd {\d6[]}, [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + vld1.8 {\d0}, [\s0], \strd + vld1.8 {\d1}, [\s1], \strd +.ifnb \d2 + vld1.8 {\d2}, [\s0], \strd + vld1.8 {\d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.8 {\d4}, [\s0], \strd +.endif +.ifnb \d5 + vld1.8 {\d5}, [\s1], \strd +.endif +.ifnb \d6 + vld1.8 {\d6}, [\s0], \strd +.endif +.endm +.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro interleave_1_16 r0, r1, r2, r3, r4 + vext.8 \r0, \r0, \r1, #6 + vext.8 \r1, \r1, \r2, #6 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #6 + vext.8 \r3, \r3, \r4, #6 +.endif +.endm +.macro interleave_1_32 r0, r1, r2, r3, r4 + vext.8 \r0, \r0, \r1, #4 + vext.8 \r1, \r1, \r2, #4 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #4 + vext.8 \r3, \r3, \r4, #4 +.endif +.endm +.macro vmovl_u8 q0, 
d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6 + vmovl.u8 \q0, \d0 + vmovl.u8 \q1, \d1 +.ifnb \q2 + vmovl.u8 \q2, \d2 + vmovl.u8 \q3, \d3 +.endif +.ifnb \q4 + vmovl.u8 \q4, \d4 +.endif +.ifnb \q5 + vmovl.u8 \q5, \d5 +.endif +.ifnb \q6 + vmovl.u8 \q6, \d6 +.endif +.endm +.macro mul_mla_4 d, s0, s1, s2, s3 + vmul.s16 \d, \s0, d0[0] + vmla.s16 \d, \s1, d0[1] + vmla.s16 \d, \s2, d0[2] + vmla.s16 \d, \s3, d0[3] +.endm +.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] +.endm +.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] + vmul.s16 \d1, \s1, d0[0] + vmla.s16 \d1, \s2, d0[1] + vmla.s16 \d1, \s3, d0[2] + vmla.s16 \d1, \s4, d0[3] + vmla.s16 \d1, \s5, d1[0] + vmla.s16 \d1, \s6, d1[1] + vmla.s16 \d1, \s7, d1[2] + vmla.s16 \d1, \s8, d1[3] +.endm +.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] + vmul.s16 \d1, \s2, d0[0] + vmla.s16 \d1, \s3, d0[1] + vmla.s16 \d1, \s4, d0[2] + vmla.s16 \d1, \s5, d0[3] + vmla.s16 \d1, \s6, d1[0] + vmla.s16 \d1, \s7, d1[1] + vmla.s16 \d1, \s8, d1[2] + vmla.s16 \d1, \s9, d1[3] +.endm +.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3 + vqrshrun.s16 \d0, \q0, #\shift +.ifnb \q1 + vqrshrun.s16 \d1, \q1, #\shift +.endif +.ifnb \q2 + vqrshrun.s16 \d2, \q2, #\shift + vqrshrun.s16 \d3, \q3, #\shift +.endif +.endm +.macro vrshr_s16 shift, r0, r1, r2, r3 + vrshr.s16 \r0, \r0, #\shift +.ifnb \r1 + vrshr.s16 \r1, \r1, #\shift +.endif +.ifnb \r2 + vrshr.s16 \r2, \r2, #\shift + vrshr.s16 \r3, \r3, #\shift +.endif +.endm +.macro st_16 strd, reg, lanes + vst1.16 {\reg[0]}, [r0, :16], \strd + vst1.16 {\reg[1]}, [r8, :16], \strd +.if \lanes > 2 + vst1.16 {\reg[2]}, [r0, :16], \strd + vst1.16 {\reg[3]}, [r8, :16], \strd +.endif +.endm +.macro st_32 strd, r0, r1 + vst1.32 {\r0[0]}, [r0, :32], \strd + vst1.32 {\r0[1]}, [r8, :32], \strd +.ifnb \r1 + vst1.32 {\r1[0]}, [r0, :32], \strd + vst1.32 {\r1[1]}, [r8, :32], \strd +.endif +.endm +.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 + vst1.8 {\r0}, [r0, \align], \strd + vst1.8 {\r1}, [r8, \align], \strd +.ifnb \r2 + vst1.8 {\r2}, [r0, \align], \strd + vst1.8 {\r3}, [r8, \align], \strd +.endif +.ifnb \r4 + vst1.8 {\r4}, [r0, \align], \strd + vst1.8 {\r5}, [r8, \align], \strd + vst1.8 {\r6}, [r0, \align], \strd + vst1.8 {\r7}, [r8, \align], \strd +.endif +.endm +.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3 +.ifc \type, put + vqrshrun_s16 6, \q0, \d0, \q1, \d2 + st_32 \strd, \d0, \d2 +.else + vrshr_s16 2, \q0, \q1 + st_reg \strd, :64, \d0, \d1, \d2, \d3 +.endif +.endm +.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3 +.ifc \type, put + vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + st_reg \strd, :64, \d0, \d1, \d2, \d3 +.else + vrshr_s16 2, \q0, \q1, \q2, \q3 + st_reg \strd, :128,\q0, \q1, \q2, \q3 +.endif +.endm +.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3 +.ifc \type, put + vqrshrun.s16 \d0, 
\q0, #6 + vqrshrun.s16 \d1, \q1, #6 + vqrshrun.s16 \d4, \q2, #6 + vqrshrun.s16 \d5, \q3, #6 + st_reg \strd, :128, \q0, \q2 +.else + vrshr_s16 2, \q0, \q1, \q2, \q3 + vst1.16 {\q0, \q1}, [r0, :128], \strd + vst1.16 {\q2, \q3}, [r8, :128], \strd +.endif +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + movw r8, \type_h + movw r9, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, r10 + mul \my, \my, r10 + add \mx, \mx, r8 // mx, 8tap_h, 4tap_h + add \my, \my, r9 // my, 8tap_v, 4tap_v +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + + clz r8, \w + tst \mx, #(0x7f << 14) + sub r8, r8, #24 + movrel r10, X(mc_subpel_filters), -8 + bne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + bne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx r9, \mx, #7, #7 + and \mx, \mx, #0x7f + it gt + movgt \mx, r9 + tst \my, #(0x7f << 14) + add \mx, r10, \mx, lsl #3 + bne L(\type\()_8tap_hv) + + adr r9, L(\type\()_8tap_h_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_8tap_h_tbl): + .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +2: + vld1.8 {d4}, [\src], \s_strd + vld1.8 {d6}, [\sr2], \s_strd + vmovl.u8 q2, d4 + vmovl.u8 q3, d6 + vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 + subs \h, \h, #2 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vmul.s16 d2, d4, d0[0] + vmla.s16 d2, d5, d0[1] + vmla.s16 d2, d6, d0[2] + vmla.s16 d2, d7, d0[3] + vrshr.s16 d2, d2, #2 + vqrshrun.s16 d2, q1, #4 + vst1.16 {d2[0]}, [\dst, :16], \d_strd + vst1.16 {d2[1]}, [\ds2, :16], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +4: + vld1.8 {d16}, [\src], \s_strd + vld1.8 {d24}, [\sr2], \s_strd + vmovl.u8 q8, d16 + vmovl.u8 q12, d24 + vext.8 d18, d16, d17, #2 + vext.8 d20, d16, d17, #4 + vext.8 d22, d16, d17, #6 + vext.8 d26, d24, d25, #2 + vext.8 d28, d24, d25, #4 + vext.8 d30, d24, d25, #6 + subs \h, \h, #2 + vmul.s16 d4, d16, d0[0] + vmla.s16 d4, d18, d0[1] + vmla.s16 
d4, d20, d0[2] + vmla.s16 d4, d22, d0[3] + vmul.s16 d5, d24, d0[0] + vmla.s16 d5, d26, d0[1] + vmla.s16 d5, d28, d0[2] + vmla.s16 d5, d30, d0[3] + vrshr.s16 q2, q2, #2 +.ifc \type, put + vqrshrun.s16 d4, q2, #4 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd +.endif + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +8: + vld1.8 {q8}, [\src], \s_strd + vld1.8 {q12}, [\sr2], \s_strd + vmovl.u8 q9, d17 + vmovl.u8 q8, d16 + vmovl.u8 q13, d25 + vmovl.u8 q12, d24 + + vmul.s16 q10, q8, d0[0] + vmul.s16 q14, q12, d0[0] +.irpc i, 1234567 + vext.8 q11, q8, q9, #(2*\i) + vext.8 q15, q12, q13, #(2*\i) +.if \i < 4 + vmla.s16 q10, q11, d0[\i] + vmla.s16 q14, q15, d0[\i] +.else + vmla.s16 q10, q11, d1[\i-4] + vmla.s16 q14, q15, d1[\i-4] +.endif +.endr + subs \h, \h, #2 + vrshr.s16 q10, q10, #2 + vrshr.s16 q14, q14, #2 +.ifc \type, put + vqrshrun.s16 d20, q10, #4 + vqrshrun.s16 d28, q14, #4 + vst1.8 {d20}, [\dst, :64], \d_strd + vst1.8 {d28}, [\ds2, :64], \d_strd +.else + vst1.16 {q10}, [\dst, :128], \d_strd + vst1.16 {q14}, [\ds2, :128], \d_strd +.endif + bgt 8b + pop {r4-r11,pc} + +160: +320: +640: +1280: // 16xN, 32xN, ... h + // This could be done without touching q4-q6, by using only + // one temporary for vext in the loop. That's slower on A7 and A53, + // (but surprisingly, marginally faster on A8 and A73). + vpush {q4-q6} + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + sub \s_strd, \s_strd, \w + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w +.endif +161: + vld1.8 {d16, d17, d18}, [\src]! + vld1.8 {d24, d25, d26}, [\sr2]! + mov \mx, \w + vmovl.u8 q10, d18 + vmovl.u8 q9, d17 + vmovl.u8 q8, d16 + vmovl.u8 q14, d26 + vmovl.u8 q13, d25 + vmovl.u8 q12, d24 + +16: + vmul.s16 q1, q8, d0[0] + vmul.s16 q2, q9, d0[0] + vmul.s16 q3, q12, d0[0] + vmul.s16 q4, q13, d0[0] +.irpc i, 1234567 + vext.8 q5, q8, q9, #(2*\i) + vext.8 q6, q9, q10, #(2*\i) + vext.8 q11, q12, q13, #(2*\i) + vext.8 q15, q13, q14, #(2*\i) +.if \i < 4 + vmla.s16 q1, q5, d0[\i] + vmla.s16 q2, q6, d0[\i] + vmla.s16 q3, q11, d0[\i] + vmla.s16 q4, q15, d0[\i] +.else + vmla.s16 q1, q5, d1[\i-4] + vmla.s16 q2, q6, d1[\i-4] + vmla.s16 q3, q11, d1[\i-4] + vmla.s16 q4, q15, d1[\i-4] +.endif +.endr + vrshr.s16 q1, q1, #2 + vrshr.s16 q2, q2, #2 + vrshr.s16 q3, q3, #2 + vrshr.s16 q4, q4, #2 + subs \mx, \mx, #16 +.ifc \type, put + vqrshrun.s16 d2, q1, #4 + vqrshrun.s16 d3, q2, #4 + vqrshrun.s16 d4, q3, #4 + vqrshrun.s16 d5, q4, #4 + vst1.8 {q1}, [\dst, :128]! + vst1.8 {q2}, [\ds2, :128]! +.else + vst1.16 {q1, q2}, [\dst, :128]! + vst1.16 {q3, q4}, [\ds2, :128]! +.endif + ble 9f + + vmov q8, q10 + vmov q12, q14 + vld1.8 {d18, d19}, [\src]! + vld1.8 {d26, d27}, [\sr2]! 
+ vmovl.u8 q10, d19 + vmovl.u8 q9, d18 + vmovl.u8 q14, d27 + vmovl.u8 q13, d26 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + vpop {q4-q6} + pop {r4-r11,pc} + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx r9, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r9 + add \my, r10, \my, lsl #3 + + adr r9, L(\type\()_8tap_v_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_8tap_v_tbl): + .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + bgt 28f + + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + // 2x2 v + load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_16 d1, d2, d3, d4, d5 + bgt 24f + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 + mul_mla_4 d6, d16, d18, d20, d22 + vqrshrun_s16 6, q3, d6 + st_16 \d_strd, d6, 2 + pop {r4-r11,pc} + +24: // 2x4 v + load_16 \sr2, \src, \s_strd, d6, d7 + interleave_1_16 d5, d6, d7 + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6 + vmov d17, d20 + vmov d19, d22 + vmov d21, d24 + vmov d23, d26 + mul_mla_4 q3, q8, q9, q10, q11 + vqrshrun_s16 6, q3, d6 + st_16 \d_strd, d6, 4 + pop {r4-r11,pc} + +28: // 2x6, 2x8, 2x12, 2x16 v + vpush {q4-q7} + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14 + interleave_1_16 d2, d4, d6, d8, d10 + interleave_1_16 d10, d12, d14 + vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12 + vmov d3, d6 + vmov d5, d8 + vmov d7, d10 + vmov d9, d12 +216: + subs \h, \h, #4 + load_16 \sr2, \src, \s_strd, d16, d18, d20, d22 + interleave_1_16 d14, d16, d18, d20, d22 + vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20 + vmov d11, d14 + vmov d13, d16 + vmov d15, d18 + vmov d17, d20 + mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8 + vqrshrun_s16 6, q1, d2 + st_16 \d_strd, d2, 4 + ble 0f + cmp \h, #2 + vmov q1, q5 + vmov q2, q6 + vmov q3, q7 + vmov q4, q8 + vmov q5, q9 + vmov q6, q10 + vmov d14, d22 + beq 26f + b 216b +26: + load_16 \sr2, \src, \s_strd, d16, d18 + interleave_1_16 d14, d16, d18 + vmovl_u8 q7, d14, q8, d16 + vmov d11, d14 + vmov d13, d16 + mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16 + vqrshrun_s16 6, q1, d2 + st_16 \d_strd, d2, 2 +0: + vpop {q4-q7} + pop {r4-r11,pc} +.endif + +40: + bgt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_32 d1, d2, d3, d4, d5 + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 + mul_mla_4 q3, q8, q9, q10, q11 + shift_store_4 \type, \d_strd, q3, d6, d7 + ble 0f + load_32 \sr2, \src, \s_strd, d6, d7 + interleave_1_32 d5, d6, d7 + vmovl_u8 q12, d5, q13, d6 + mul_mla_4 q3, q10, q11, q12, q13 + shift_store_4 \type, 
\d_strd, q3, d6, d7 +0: + pop {r4-r11,pc} + +480: // 4x6, 4x8, 4x12, 4x16 v + vpush {q4} + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 + interleave_1_32 d2, d4, d6 + interleave_1_32 d6, d8, d16, d18, d20 + vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18 + +48: + subs \h, \h, #4 + load_32 \sr2, \src, \s_strd, d22, d24, d26, d28 + interleave_1_32 d20, d22, d24, d26, d28 + vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26 + mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13 + shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5 + ble 0f + load_32 \sr2, \src, \s_strd, d30, d2 + subs \h, \h, #2 + interleave_1_32 d28, d30, d2 + vmovl_u8 q14, d28, q15, d30 + mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15 + shift_store_4 \type, \d_strd, q8, d16, d17 + ble 0f + load_32 \sr2, \src, \s_strd, d4, d6 + subs \h, \h, #2 + interleave_1_32 d2, d4, d6 + vmovl_u8 q1, d2, q2, d4 + mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2 + shift_store_4 \type, \d_strd, q9, d18, d19 + ble 0f + subs \h, \h, #4 + load_32 \sr2, \src, \s_strd, d8, d16, d18, d20 + interleave_1_32 d6, d8, d16, d18, d20 + vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18 + mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9 + shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27 + bgt 48b +0: + vpop {q4} + pop {r4-r11,pc} + +80: + bgt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5 + mul_mla_4 q1, q8, q9, q10, q11 + mul_mla_4 q2, q9, q10, q11, q12 + shift_store_8 \type, \d_strd, q1, d2, q2, d4 + ble 0f + load_reg \sr2, \src, \s_strd, d6, d7 + vmovl_u8 q13, d6, q14, d7 + mul_mla_4 q1, q10, q11, q12, q13 + mul_mla_4 q2, q11, q12, q13, q14 + shift_store_8 \type, \d_strd, q1, d2, q2, d4 +0: + pop {r4-r11,pc} + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... 
+640: +1280: + vpush {q4} + vld1.8 {d0}, [\my, :64] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 + vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20 + +88: + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d22, d24 + vmovl_u8 q11, d22, q12, d24 + mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12 + shift_store_8 \type, \d_strd, q1, d2, q2, d4 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d26, d28 + vmovl_u8 q13, d26, q14, d28 + mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14 + shift_store_8 \type, \d_strd, q3, d6, q4, d8 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d30, d2 + vmovl_u8 q15, d30, q1, d2 + mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1 + shift_store_8 \type, \d_strd, q8, d16, q9, d18 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, d4, d6 + vmovl_u8 q2, d4, q3, d6 + mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3 + shift_store_8 \type, \d_strd, q10, d20, q11, d22 + ble 9f + subs \h, \h, #4 + load_reg \sr2, \src, \s_strd, d8, d16, d18, d20 + vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20 + mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8 + mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10 + shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30 + bgt 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + vpop {q4} + pop {r4-r11,pc} + +160: + bgt 1680b + + // 16x2, 16x4 v + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + cmp \h, #2 + load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15 + vmovl.u8 q1, d22 + vmovl.u8 q2, d24 + vmovl.u8 q3, d26 + vmovl.u8 q8, d28 + vmovl.u8 q9, d30 + vmovl.u8 q11, d23 + vmovl.u8 q12, d25 + vmovl.u8 q13, d27 + vmovl.u8 q14, d29 + vmovl.u8 q15, d31 + mul_mla_4 q1, q1, q2, q3, q8 + mul_mla_4 q10, q2, q3, q8, q9 + mul_mla_4 q2, q11, q12, q13, q14 + mul_mla_4 q11, q12, q13, q14, q15 + shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11 + ble 0f + load_reg \sr2, \src, \s_strd, q10, q11 + vmovl.u8 q1, d20 + vmovl.u8 q10, d21 + vmovl.u8 q12, d22 + vmovl.u8 q11, d23 + mul_mla_4 q2, q3, q8, q9, q1 + mul_mla_4 q3, q13, q14, q15, q10 + mul_mla_4 q13, q8, q9, q1, q12 + mul_mla_4 q14, q14, q15, q10, q11 + shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14 +0: + pop {r4-r11,pc} + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx r9, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r9 + add \my, r10, \my, lsl #3 + + adr r9, L(\type\()_8tap_hv_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_8tap_hv_tbl): + .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_hv_tbl) 
+ CONFIG_THUMB + +20: +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 280f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + + // 2x2, 2x4 hv + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + + vld1.8 {d26}, [\src], \s_strd + vmovl.u8 q13, d26 + vext.8 q14, q13, q13, #2 + vmul.s16 d26, d26, d0 + vmul.s16 d28, d28, d0 + vpadd.s16 d26, d26, d28 + vpadd.s16 d26, d26, d26 + vrshr.s16 d16, d26, #2 + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vmov d17, d26 + vext.8 d16, d16, d26, #4 + +2: + bl L(\type\()_8tap_filter_2) + + vext.8 d18, d17, d26, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d26, d2[3] + + vqrshrn.s32 d4, q2, #\shift_hv + vqmovun.s16 d4, q2 + subs \h, \h, #2 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d26 + b 2b + +280: // 2x8, 2x16, 2x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.8 {d26}, [\src], \s_strd + vmovl.u8 q13, d26 + vext.8 q14, q13, q13, #2 + vmul.s16 d26, d26, d0 + vmul.s16 d28, d28, d0 + vpadd.s16 d26, d26, d28 + vpadd.s16 d26, d26, d26 + vrshr.s16 d16, d26, #2 + + bl L(\type\()_8tap_filter_2) + vext.8 d16, d16, d16, #4 + vmov d17, d26 + vext.8 d16, d16, d26, #4 + bl L(\type\()_8tap_filter_2) + vext.8 d18, d17, d26, #4 + vmov d19, d26 + bl L(\type\()_8tap_filter_2) + vext.8 d20, d19, d26, #4 + vmov d21, d26 + +28: + bl L(\type\()_8tap_filter_2) + vext.8 d22, d21, d26, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d19, d2[3] + vmlal.s16 q2, d20, d3[0] + vmlal.s16 q2, d21, d3[1] + vmlal.s16 q2, d22, d3[2] + vmlal.s16 q2, d26, d3[3] + + vqrshrn.s32 d4, q2, #\shift_hv + vqmovun.s16 d4, q2 + subs \h, \h, #2 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d19 + vmov d18, d20 + vmov d19, d21 + vmov d20, d22 + vmov d21, d26 + b 28b + +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_2): + vld1.8 {d28}, [\sr2], \s_strd + vld1.8 {d30}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vext.8 d31, d30, d30, #1 + vmovl.u8 q13, d28 + vmovl.u8 q14, d29 + vmov d27, d28 + vmovl.u8 q14, d30 + vmovl.u8 q15, d31 + vtrn.32 d26, d28 + vtrn.32 d27, d30 + vmul.s16 d26, d26, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d28, d0[2] + vmla.s16 d26, d30, d0[3] + vrshr.s16 d26, d26, #2 + vext.8 d27, d26, d26, #4 + bx lr +.endif + +40: + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 480f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + // 4x2, 4x4 hv + vld1.8 {d30}, [\src], \s_strd + vmovl.u8 q14, d30 + vext.8 d27, d28, d29, #2 + vext.8 d30, d28, d29, #4 + vext.8 d31, d28, d29, #6 + vmul.s16 d26, d28, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d30, d0[2] + vmla.s16 d26, d31, d0[3] + vrshr.s16 d16, d26, #2 + + bl L(\type\()_8tap_filter_4) + vmov d17, d26 + vmov d18, d27 + +4: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d26, d2[3] + vmull.s16 q3, d17, d2[0] + vmlal.s16 q3, d18, 
d2[1] + vmlal.s16 q3, d26, d2[2] + vmlal.s16 q3, d27, d2[3] + vqrshrn.s32 d4, q2, #\shift_hv + vqrshrn.s32 d6, q3, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d4, q2 + vqmovun.s16 d6, q3 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d6[0]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d6}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d18 + vmov d17, d26 + vmov d18, d27 + b 4b + +480: // 4x8, 4x16, 4x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.8 {d30}, [\src], \s_strd + vmovl.u8 q14, d30 + vext.8 d27, d28, d29, #2 + vext.8 d30, d28, d29, #4 + vext.8 d31, d28, d29, #6 + vmul.s16 d26, d28, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d30, d0[2] + vmla.s16 d26, d31, d0[3] + vrshr.s16 d16, d26, #2 + + bl L(\type\()_8tap_filter_4) + vmov d17, d26 + vmov d18, d27 + bl L(\type\()_8tap_filter_4) + vmov d19, d26 + vmov d20, d27 + bl L(\type\()_8tap_filter_4) + vmov d21, d26 + vmov d22, d27 + +48: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d19, d2[3] + vmlal.s16 q2, d20, d3[0] + vmlal.s16 q2, d21, d3[1] + vmlal.s16 q2, d22, d3[2] + vmlal.s16 q2, d26, d3[3] + vmull.s16 q3, d17, d2[0] + vmlal.s16 q3, d18, d2[1] + vmlal.s16 q3, d19, d2[2] + vmlal.s16 q3, d20, d2[3] + vmlal.s16 q3, d21, d3[0] + vmlal.s16 q3, d22, d3[1] + vmlal.s16 q3, d26, d3[2] + vmlal.s16 q3, d27, d3[3] + vqrshrn.s32 d4, q2, #\shift_hv + vqrshrn.s32 d6, q3, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d4, q2 + vqmovun.s16 d6, q3 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d6[0]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d6}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d18 + vmov d17, d19 + vmov d18, d20 + vmov d19, d21 + vmov d20, d22 + vmov d21, d26 + vmov d22, d27 + b 48b +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_4): + vld1.8 {d30}, [\sr2], \s_strd + vld1.8 {d31}, [\src], \s_strd + vmovl.u8 q14, d30 + vext.8 d27, d28, d29, #2 + vext.8 d30, d28, d29, #4 + vext.8 d1, d28, d29, #6 + vmul.s16 d26, d28, d0[0] + vmla.s16 d26, d27, d0[1] + vmla.s16 d26, d30, d0[2] + vmla.s16 d26, d1, d0[3] + + vmovl.u8 q14, d31 + vext.8 d30, d28, d29, #2 + vext.8 d31, d28, d29, #4 + vext.8 d1, d28, d29, #6 + vmul.s16 d27, d28, d0[0] + vmla.s16 d27, d30, d0[1] + vmla.s16 d27, d31, d0[2] + vmla.s16 d27, d1, d0[3] + vrshr.s16 d26, d26, #2 + vrshr.s16 d27, d27, #2 + bx lr + +80: +160: +320: + bgt 880f + vpush {q4-q7} + add \my, \my, #2 + vld1.8 {d0}, [\mx, :64] + vld1.32 {d2[]}, [\my] + sub \src, \src, #3 + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.8 {q14}, [\src], \s_strd + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + vmul.s16 q10, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d1[\i-4] +.endr + vrshr.s16 q3, q10, #2 + + bl L(\type\()_8tap_filter_8) + vmov q4, q10 + vmov q5, q11 + +8: + bl L(\type\()_8tap_filter_8) + vmull.s16 q12, d6, d2[0] + vmull.s16 q13, d7, d2[0] + vmull.s16 q14, d8, d2[0] + vmull.s16 q15, d9, d2[0] + vmlal.s16 q12, d8, d2[1] + vmlal.s16 
q13, d9, d2[1] + vmlal.s16 q14, d10, d2[1] + vmlal.s16 q15, d11, d2[1] + vmlal.s16 q12, d10, d2[2] + vmlal.s16 q13, d11, d2[2] + vmlal.s16 q14, d20, d2[2] + vmlal.s16 q15, d21, d2[2] + vmlal.s16 q12, d20, d2[3] + vmlal.s16 q13, d21, d2[3] + vmlal.s16 q14, d22, d2[3] + vmlal.s16 q15, d23, d2[3] + vqrshrn.s32 d24, q12, #\shift_hv + vqrshrn.s32 d25, q13, #\shift_hv + vqrshrn.s32 d28, q14, #\shift_hv + vqrshrn.s32 d29, q15, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d24, q12 + vqmovun.s16 d28, q14 + vst1.8 {d24}, [\dst, :64], \d_strd + vst1.8 {d28}, [\ds2, :64], \d_strd +.else + vst1.16 {q12}, [\dst, :128], \d_strd + vst1.16 {q14}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q3, q5 + vmov q4, q10 + vmov q5, q11 + b 8b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] + sub \src, \src, #3 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.8 {q14}, [\src], \s_strd + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + vmul.s16 q10, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d1[\i-4] +.endr + vrshr.s16 q3, q10, #2 + + bl L(\type\()_8tap_filter_8) + vmov q4, q10 + vmov q5, q11 + bl L(\type\()_8tap_filter_8) + vmov q6, q10 + vmov q7, q11 + bl L(\type\()_8tap_filter_8) + vmov q8, q10 + vmov q9, q11 + +88: + bl L(\type\()_8tap_filter_8) + vmull.s16 q12, d6, d2[0] + vmull.s16 q13, d7, d2[0] + vmull.s16 q14, d8, d2[0] + vmull.s16 q15, d9, d2[0] + vmlal.s16 q12, d8, d2[1] + vmlal.s16 q13, d9, d2[1] + vmlal.s16 q14, d10, d2[1] + vmlal.s16 q15, d11, d2[1] + vmlal.s16 q12, d10, d2[2] + vmlal.s16 q13, d11, d2[2] + vmlal.s16 q14, d12, d2[2] + vmlal.s16 q15, d13, d2[2] + vmlal.s16 q12, d12, d2[3] + vmlal.s16 q13, d13, d2[3] + vmlal.s16 q14, d14, d2[3] + vmlal.s16 q15, d15, d2[3] + vmlal.s16 q12, d14, d3[0] + vmlal.s16 q13, d15, d3[0] + vmlal.s16 q14, d16, d3[0] + vmlal.s16 q15, d17, d3[0] + vmlal.s16 q12, d16, d3[1] + vmlal.s16 q13, d17, d3[1] + vmlal.s16 q14, d18, d3[1] + vmlal.s16 q15, d19, d3[1] + vmlal.s16 q12, d18, d3[2] + vmlal.s16 q13, d19, d3[2] + vmlal.s16 q14, d20, d3[2] + vmlal.s16 q15, d21, d3[2] + vmlal.s16 q12, d20, d3[3] + vmlal.s16 q13, d21, d3[3] + vmlal.s16 q14, d22, d3[3] + vmlal.s16 q15, d23, d3[3] + vqrshrn.s32 d24, q12, #\shift_hv + vqrshrn.s32 d25, q13, #\shift_hv + vqrshrn.s32 d28, q14, #\shift_hv + vqrshrn.s32 d29, q15, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + vqmovun.s16 d24, q12 + vqmovun.s16 d28, q14 + vst1.8 {d24}, [\dst, :64], \d_strd + vst1.8 {d28}, [\ds2, :64], \d_strd +.else + vst1.16 {q12}, [\dst, :128], \d_strd + vst1.16 {q14}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q3, q5 + vmov q4, q6 + vmov q5, q7 + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc 
\type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_8): + vld1.8 {q14}, [\sr2], \s_strd + vld1.8 {q15}, [\src], \s_strd + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + vmul.s16 q10, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q10, q14, d1[\i-4] +.endr + vmovl.u8 q12, d30 + vmovl.u8 q13, d31 + vmul.s16 q11, q12, d0[0] +.irpc i, 123 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q11, q14, d0[\i] +.endr +.irpc i, 4567 + vext.8 q14, q12, q13, #(2*\i) + vmla.s16 q11, q14, d1[\i-4] +.endr + vrshr.s16 q10, q10, #2 + vrshr.s16 q11, q11, #2 + bx lr +endfunc + + +function \type\()_bilin_8bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + vdup.8 d1, \mx + vdup.8 d3, \my + rsb r8, \mx, #16 + rsb r9, \my, #16 + vdup.8 d0, r8 + vdup.8 d2, r9 +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + clz r8, \w + cmp \mx, #0 + sub r8, r8, #24 + bne L(\type\()_bilin_h) + cmp \my, #0 + bne L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cmp \my, #0 + bne L(\type\()_bilin_hv) + + adr r9, L(\type\()_bilin_h_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_bilin_h_tbl): + .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + vld1.32 {d4[]}, [\src], \s_strd + vld1.32 {d6[]}, [\sr2], \s_strd + vext.8 d5, d4, d4, #1 + vext.8 d7, d6, d6, #1 + vtrn.16 q2, q3 + subs \h, \h, #2 + vmull.u8 q3, d4, d0 + vmlal.u8 q3, d5, d1 + vqrshrn.u16 d4, q3, #4 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + vld1.8 {d4}, [\src], \s_strd + vld1.8 {d6}, [\sr2], \s_strd + vext.8 d5, d4, d4, #1 + vext.8 d7, d6, d6, #1 + vtrn.32 q2, q3 + subs \h, \h, #2 + vmull.u8 q3, d4, d0 + vmlal.u8 q3, d5, d1 +.ifc \type, put + vqrshrn.u16 d4, q3, #4 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd +.else + vst1.16 {d6}, [\dst, :64], \d_strd + vst1.16 {d7}, [\ds2, :64], \d_strd +.endif + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + vld1.8 {q8}, [\src], \s_strd + vld1.8 {q10}, [\sr2], \s_strd + vext.8 q9, q8, q8, #1 + vext.8 q11, q10, q10, #1 + subs \h, \h, #2 + vmull.u8 q8, d16, d0 + vmull.u8 q10, d20, d0 + vmlal.u8 q8, d18, d1 + vmlal.u8 q10, d22, d1 +.ifc \type, put + vqrshrn.u16 d16, q8, #4 + vqrshrn.u16 d18, q10, #4 + vst1.8 {d16}, [\dst, :64], \d_strd + vst1.8 {d18}, [\ds2, :64], \d_strd +.else + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q10}, [\ds2, :128], \d_strd +.endif + bgt 8b + pop {r4-r11,pc} +160: +320: +640: +1280: // 16xN, 32xN, ... 
h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w +.endif +161: + vld1.8 {d16}, [\src]! + vld1.8 {d22}, [\sr2]! + mov \mx, \w + +16: + vld1.8 {d17,d18}, [\src]! + vld1.8 {d23,d24}, [\sr2]! + vext.8 q10, q8, q9, #1 + vext.8 q13, q11, q12, #1 + vmull.u8 q2, d16, d0 + vmull.u8 q3, d17, d0 + vmull.u8 q14, d22, d0 + vmull.u8 q15, d23, d0 + vmlal.u8 q2, d20, d1 + vmlal.u8 q3, d21, d1 + vmlal.u8 q14, d26, d1 + vmlal.u8 q15, d27, d1 + subs \mx, \mx, #16 +.ifc \type, put + vqrshrn.u16 d4, q2, #4 + vqrshrn.u16 d5, q3, #4 + vqrshrn.u16 d28, q14, #4 + vqrshrn.u16 d29, q15, #4 + vst1.8 {q2}, [\dst, :128]! + vst1.8 {q14}, [\ds2, :128]! +.else + vst1.16 {q2, q3}, [\dst, :128]! + vst1.16 {q14, q15}, [\ds2, :128]! +.endif + ble 9f + + vmov d16, d18 + vmov d22, d24 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + pop {r4-r11,pc} + +L(\type\()_bilin_v): + cmp \h, #4 + adr r9, L(\type\()_bilin_v_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_bilin_v_tbl): + .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + vld1.16 {d16[]}, [\src], \s_strd + bgt 24f +22: + vld1.16 {d17[]}, [\sr2], \s_strd + vld1.16 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #6 + vext.8 d17, d17, d18, #6 + vmull.u8 q2, d16, d2 + vmlal.u8 q2, d17, d3 + vqrshrn.u16 d4, q2, #4 + vst1.16 {d4[0]}, [\dst, :16] + vst1.16 {d4[1]}, [\ds2, :16] + pop {r4-r11,pc} +24: // 2x4, 2x6, 2x8, ... 
v + vld1.16 {d17[]}, [\sr2], \s_strd + vld1.16 {d18[]}, [\src], \s_strd + vld1.16 {d19[]}, [\sr2], \s_strd + vld1.16 {d20[]}, [\src], \s_strd + sub \h, \h, #4 + vext.8 d16, d16, d17, #6 + vext.8 d17, d17, d18, #6 + vext.8 d18, d18, d19, #6 + vext.8 d19, d19, d20, #6 + vtrn.32 d16, d18 + vtrn.32 d17, d19 + vmull.u8 q2, d16, d2 + vmlal.u8 q2, d17, d3 + cmp \h, #2 + vqrshrn.u16 d4, q2, #4 + vst1.16 {d4[0]}, [\dst, :16], \d_strd + vst1.16 {d4[1]}, [\ds2, :16], \d_strd + vst1.16 {d4[2]}, [\dst, :16], \d_strd + vst1.16 {d4[3]}, [\ds2, :16], \d_strd + blt 0f + vmov d16, d20 + beq 22b + b 24b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.32 {d16[]}, [\src], \s_strd +4: + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vmull.u8 q2, d16, d2 + vmlal.u8 q2, d17, d3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d4, q2, #4 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd +.else + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d18 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.8 {d16}, [\src], \s_strd +8: + vld1.8 {d17}, [\sr2], \s_strd + vld1.8 {d18}, [\src], \s_strd + vmull.u8 q2, d16, d2 + vmull.u8 q3, d17, d2 + vmlal.u8 q2, d17, d3 + vmlal.u8 q3, d18, d3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d4, q2, #4 + vqrshrn.u16 d6, q3, #4 + vst1.8 {d4}, [\dst, :64], \d_strd + vst1.8 {d6}, [\ds2, :64], \d_strd +.else + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd +.endif + ble 0f + vmov d16, d18 + b 8b +0: + pop {r4-r11,pc} + +160: // 16xN, 32xN, ... 
+320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.8 {q8}, [\src], \s_strd +2: + vld1.8 {q9}, [\sr2], \s_strd + vld1.8 {q10}, [\src], \s_strd + vmull.u8 q12, d16, d2 + vmull.u8 q13, d17, d2 + vmull.u8 q14, d18, d2 + vmull.u8 q15, d19, d2 + vmlal.u8 q12, d18, d3 + vmlal.u8 q13, d19, d3 + vmlal.u8 q14, d20, d3 + vmlal.u8 q15, d21, d3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d24, q12, #4 + vqrshrn.u16 d25, q13, #4 + vqrshrn.u16 d28, q14, #4 + vqrshrn.u16 d29, q15, #4 + vst1.8 {q12}, [\dst, :128], \d_strd + vst1.8 {q14}, [\ds2, :128], \d_strd +.else + vst1.16 {q12, q13}, [\dst, :128], \d_strd + vst1.16 {q14, q15}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #16 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 +.ifc \type, put + add \dst, \dst, #16 +.else + add \dst, \dst, #32 +.endif + b 1b +0: + pop {r4-r11,pc} + +L(\type\()_bilin_hv): + vmovl.u8 q2, d2 + vmovl.u8 q3, d3 + adr r9, L(\type\()_bilin_hv_tbl) + ldr r8, [r9, r8, lsl #2] + add r9, r9, r8 + bx r9 + + .align 2 +L(\type\()_bilin_hv_tbl): + .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.32 {d28[]}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vmull.u8 q8, d28, d0 + vmlal.u8 q8, d29, d1 + +2: + vld1.32 {d28[]}, [\sr2], \s_strd + vld1.32 {d30[]}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vext.8 d31, d30, d30, #1 + vtrn.16 d28, d30 + vtrn.16 d29, d31 + vmull.u8 q9, d28, d0 + vmlal.u8 q9, d29, d1 + + vtrn.32 d16, d18 + + vmul.u16 d20, d16, d4 + vmla.u16 d20, d19, d6 + vqrshrn.u16 d20, q10, #8 + subs \h, \h, #2 + vst1.16 {d20[0]}, [\dst, :16], \d_strd + vst1.16 {d20[1]}, [\ds2, :16], \d_strd + ble 0f + vtrn.32 d19, d16 + b 2b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.8 {d28}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vmull.u8 q8, d28, d0 + vmlal.u8 q8, d29, d1 + +4: + vld1.8 {d28}, [\sr2], \s_strd + vld1.8 {d30}, [\src], \s_strd + vext.8 d29, d28, d28, #1 + vext.8 d31, d30, d30, #1 + vtrn.32 d28, d30 + vtrn.32 d29, d31 + vmull.u8 q9, d28, d0 + vmlal.u8 q9, d29, d1 + + vmov d17, d18 + + vmul.u16 q10, q8, q2 + vmla.u16 q10, q9, q3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d20, q10, #8 + vst1.32 {d20[0]}, [\dst, :32], \d_strd + vst1.32 {d20[1]}, [\ds2, :32], \d_strd +.else + vrshr.u16 q10, q10, #4 + vst1.16 {d20}, [\dst, :64], \d_strd + vst1.16 {d21}, [\ds2, :64], \d_strd +.endif + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.8 {q12}, [\src], \s_strd + vext.8 q13, q12, q12, #1 + vmull.u8 q8, d24, d0 + vmlal.u8 q8, d26, d1 + +2: + vld1.8 {q12}, [\sr2], \s_strd + vld1.8 {q14}, [\src], \s_strd + vext.8 q13, q12, q12, #1 + vext.8 q15, q14, q14, #1 + vmull.u8 q9, d24, d0 + vmlal.u8 q9, d26, d1 + vmull.u8 q10, d28, d0 + vmlal.u8 q10, d30, d1 + + vmul.u16 q8, q8, q2 + vmla.u16 q8, q9, q3 + vmul.u16 q9, q9, q2 + vmla.u16 q9, q10, q3 + subs \h, \h, #2 +.ifc \type, put + vqrshrn.u16 d16, q8, #8 + vqrshrn.u16 d18, q9, #8 + vst1.8 {d16}, [\dst, :64], \d_strd + vst1.8 {d18}, [\ds2, :64], \d_strd +.else + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q9}, [\ds2, :128], \d_strd +.endif + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 1b +0: + pop {r4-r11,pc} +endfunc +.endm + +filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10 +filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6 + +.macro load_filter_ptr src + asr r12, \src, #10 + add r12, r11, r12, lsl #3 +.endm + +.macro load_filter_coef dst, src, inc + add \src, \src, \inc + vld1.8 {\dst}, [r12, :64] +.endm + +.macro load_filter_row dst, src, inc + load_filter_ptr \src + load_filter_coef \dst, \src, \inc +.endm + +function warp_filter_horz_neon + load_filter_ptr r5 // filter 0 + vld1.16 {q7}, [r2], r3 + vmov.i8 q6, #128 + + load_filter_coef d0, r5, r7 // filter 0 + load_filter_row d1, r5, r7 // filter 1 + load_filter_row d2, r5, r7 // filter 2 + load_filter_ptr r5 // filter 3 + veor q7, q7, q6 // subtract by 128 to allow using vmull + load_filter_coef d3, r5, r7 // filter 3 + vext.8 d12, d14, d15, #1 // filter 1 pixels + vext.8 d13, d14, d15, #2 // filter 2 pixels + load_filter_ptr r5 // filter 4 + vmull.s8 q2, d14, d0 // filter 0 output + vmull.s8 q3, d12, d1 // filter 1 output + load_filter_coef d0, r5, r7 // filter 4 + load_filter_ptr r5 // filter 5 + vext.8 d12, d14, d15, #3 // filter 3 pixels + vmull.s8 q4, d13, d2 // filter 2 output + vext.8 d13, d14, d15, #4 // filter 4 pixels + vpadd.i16 d4, d4, d5 // pixel 0 (4x16) + vpadd.i16 d5, d6, d7 // pixel 1 (4x16) + load_filter_coef d1, r5, r7 // filter 5 + load_filter_ptr r5 // filter 6 + vmull.s8 q5, d12, d3 // filter 3 output + vext.8 d12, d14, d15, #5 // filter 5 pixels + vmull.s8 q3, d13, d0 // filter 4 output + load_filter_coef d0, r5, r7 // filter 6 + vext.8 d13, d14, d15, #6 // filter 6 pixels + load_filter_ptr r5 // filter 7 + vpadd.i16 d8, d8, d9 // pixel 2 (4x16) + vpadd.i16 d9, d10, d11 // pixel 3 (4x16) + vmull.s8 q5, d12, d1 // filter 5 output + load_filter_coef d1, r5, r7 // filter 7 + vext.8 d14, d14, d15, #7 // filter 7 pixels + vpadd.i16 d6, d6, d7 // pixel 4 (4x16) + vpadd.i16 d10, d10, d11 // pixel 5 (4x16) + vmull.s8 q6, d13, d0 // filter 6 output + vmull.s8 q7, d14, d1 // filter 7 output + + sub r5, r5, r7, lsl #3 + + vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16) + vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16) + vpadd.i16 d12, d12, d13 // pixel 6 (4x16) + vpadd.i16 d14, d14, d15 // pixel 7 (4x16) + vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16) + vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16) + vpadd.i16 d4, d4, d5 // pixel 0-3 
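        // Note (based on the per-step comments above): each vmull.s8 produced
        // eight 16-bit partial products for one output pixel, and the vpadd
        // chain folds them 8->4->2->1. At this point d4 holds the horizontally
        // filtered pixels 0-3 and the next vpadd forms pixels 4-7; the caller
        // then rounding-shifts the result right by 3.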
+ vpadd.i16 d5, d6, d10 // pixel 4-7 + + add r5, r5, r8 + + bx lr +endfunc + +// void dav1d_warp_affine_8x8_8bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my) +.macro warp t, shift +function warp_affine_8x8\t\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + ldrd r8, r9, [r4] + sxth r7, r8 + asr r8, r8, #16 + asr r4, r9, #16 + sxth r9, r9 + mov r10, #8 + sub r2, r2, r3, lsl #1 + sub r2, r2, r3 + sub r2, r2, #3 + movrel r11, X(mc_warp_filter), 64*8 +.ifnb \t + lsl r1, r1, #1 +.endif + add r5, r5, #512 + add r6, r6, #512 + + bl warp_filter_horz_neon + vrshr.s16 q8, q2, #3 + bl warp_filter_horz_neon + vrshr.s16 q9, q2, #3 + bl warp_filter_horz_neon + vrshr.s16 q10, q2, #3 + bl warp_filter_horz_neon + vrshr.s16 q11, q2, #3 + bl warp_filter_horz_neon + vrshr.s16 q12, q2, #3 + bl warp_filter_horz_neon + vrshr.s16 q13, q2, #3 + bl warp_filter_horz_neon + vrshr.s16 q14, q2, #3 + +1: + bl warp_filter_horz_neon + vrshr.s16 q15, q2, #3 + + load_filter_row d8, r6, r9 + load_filter_row d9, r6, r9 + load_filter_row d10, r6, r9 + load_filter_row d11, r6, r9 + load_filter_row d12, r6, r9 + load_filter_row d13, r6, r9 + load_filter_row d14, r6, r9 + load_filter_row d15, r6, r9 + transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 + vmovl.s8 q1, d8 + vmovl.s8 q2, d9 + vmovl.s8 q3, d10 + vmovl.s8 q4, d11 + vmovl.s8 q5, d12 + vmovl.s8 q6, d13 + + sub r6, r6, r9, lsl #3 + + // This ordering of vmull/vmlal is highly beneficial for + // Cortex A8/A9/A53 here, but harmful for Cortex A7. + vmull.s16 q0, d16, d2 + vmlal.s16 q0, d18, d4 + vmlal.s16 q0, d20, d6 + vmlal.s16 q0, d22, d8 + vmlal.s16 q0, d24, d10 + vmlal.s16 q0, d26, d12 + vmull.s16 q1, d17, d3 + vmlal.s16 q1, d19, d5 + vmlal.s16 q1, d21, d7 + vmlal.s16 q1, d23, d9 + vmlal.s16 q1, d25, d11 + vmlal.s16 q1, d27, d13 + + vmovl.s8 q2, d14 + vmovl.s8 q3, d15 + + vmlal.s16 q0, d28, d4 + vmlal.s16 q0, d30, d6 + vmlal.s16 q1, d29, d5 + vmlal.s16 q1, d31, d7 + +.ifb \t + vmov.i16 q7, #128 +.else + vmov.i16 q7, #0x800 +.endif + + vmov q8, q9 + vmov q9, q10 + vqrshrn.s32 d0, q0, #\shift + vmov q10, q11 + vqrshrn.s32 d1, q1, #\shift + vmov q11, q12 + vadd.i16 q0, q0, q7 + vmov q12, q13 +.ifb \t + vqmovun.s16 d0, q0 +.endif + vmov q13, q14 + vmov q14, q15 + subs r10, r10, #1 +.ifnb \t + vst1.16 {q0}, [r0, :128], r1 +.else + vst1.8 {d0}, [r0, :64], r1 +.endif + + add r6, r6, r4 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +warp , 11 +warp t, 7 + +// void dav1d_emu_edge_8bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_8bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldrd r8, r9, [sp, #52] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub r12, r3, #1 // ih - 1 + cmp r5, r3 + sub lr, r2, #1 // iw - 1 + it lt + movlt r12, r5 // min(y, ih - 1) + cmp r4, r2 + bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) + it lt + movlt lr, r4 // min(x, iw - 1) + bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) + mla r8, r12, r9, r8 // ref += iclip() * stride + add r8, r8, lr // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add r10, r5, r1 // y + bh + neg r5, 
r5 // -y + sub r10, r10, r3 // y + bh - ih + sub r12, r1, #1 // bh - 1 + cmp r10, r1 + bic r5, r5, r5, asr #31 // max(-y, 0) + it ge + movge r10, r12 // min(y + bh - ih, bh-1) + cmp r5, r1 + bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) + it ge + movge r5, r12 // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add r11, r4, r0 // x + bw + neg r4, r4 // -x + sub r11, r11, r2 // x + bw - iw + sub lr, r0, #1 // bw - 1 + cmp r11, r0 + bic r4, r4, r4, asr #31 // max(-x, 0) + it ge + movge r11, lr // min(x + bw - iw, bw-1) + cmp r4, r0 + bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) + it ge + movge r4, lr // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub r1, r1, r5 // bh - top_ext + mla r6, r5, r7, r6 + sub r2, r0, r4 // bw - left_ext + sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext + sub r2, r2, r11 // center_w = bw - left_ext - right_ext + + mov r0, r6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + vld1.8 {d0[], d1[]}, [r8] + mov r12, r6 // out = dst + mov r3, r4 +1: + subs r3, r3, #16 + vst1.8 {q0}, [r12, :128]! + bgt 1b +.endif + mov lr, r8 + add r12, r6, r4 // out = dst + left_ext + mov r3, r2 +1: + vld1.8 {q0, q1}, [lr]! + subs r3, r3, #32 +.if \need_left + vst1.8 {q0, q1}, [r12]! +.else + vst1.8 {q0, q1}, [r12, :128]! +.endif + bgt 1b +.if \need_right + add r3, r8, r2 // in + center_w + sub r3, r3, #1 // in + center_w - 1 + add r12, r6, r4 // dst + left_ext + vld1.8 {d0[], d1[]}, [r3] + add r12, r12, r2 // out = dst + left_ext + center_w + mov r3, r11 +1: + subs r3, r3, #16 + vst1.8 {q0}, [r12]! + bgt 1b +.endif + + subs r1, r1, #1 // center_h-- + add r6, r6, r7 + add r8, r8, r9 + bgt 0b +.endm + + cmp r4, #0 + beq 2f + // need_left + cmp r11, #0 + beq 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cmp r11, #0 + beq 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + cmp r10, #0 + // Storing the original dst in r0 overwrote bw, recalculate it here + add r2, r2, r4 // center_w + left_ext + add r2, r2, r11 // bw = center_w + left_ext + right_ext + + beq 3f + // need_bottom + sub r8, r6, r7 // ref = dst - stride + mov r4, r2 +1: + vld1.8 {q0, q1}, [r8, :128]! + mov r3, r10 +2: + subs r3, r3, #1 + vst1.8 {q0, q1}, [r6, :128], r7 + bgt 2b + mls r6, r7, r10, r6 // dst -= bottom_ext * stride + subs r4, r4, #32 // bw -= 32 + add r6, r6, #32 // dst += 32 + bgt 1b + +3: + cmp r5, #0 + beq 3f + // need_top + mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride +1: + vld1.8 {q0, q1}, [r0, :128]! + mov r3, r5 +2: + subs r3, r3, #1 + vst1.8 {q0, q1}, [r6, :128], r7 + bgt 2b + mls r6, r7, r5, r6 // dst -= top_ext * stride + subs r2, r2, #32 // bw -= 32 + add r6, r6, #32 // dst += 32 + bgt 1b + +3: + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/mc16.S b/third_party/dav1d/src/arm/32/mc16.S new file mode 100644 index 0000000000..b7d845e219 --- /dev/null +++ b/third_party/dav1d/src/arm/32/mc16.S @@ -0,0 +1,3658 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2020, Martin Storsjo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define PREP_BIAS 8192 + +.macro avg d0, d00, d01, d1, d10, d11 + vld1.16 {q0, q1}, [r2, :128]! + vld1.16 {q2, q3}, [r3, :128]! + vqadd.s16 q0, q0, q2 + vqadd.s16 q1, q1, q3 + vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vshl.s16 \d0, q0, q13 // -(intermediate_bits+1) + vshl.s16 \d1, q1, q13 // -(intermediate_bits+1) +.endm + +.macro w_avg d0, d00, d01, d1, d10, d11 + vld1.16 {q0, q1}, [r2, :128]! + vld1.16 {q2, q3}, [r3, :128]! + // This difference requires a 17 bit range, and all bits are + // significant for the following multiplication. + vsubl.s16 \d0, d4, d0 + vsubl.s16 q0, d5, d1 + vsubl.s16 \d1, d6, d2 + vsubl.s16 q1, d7, d3 + vmul.s32 \d0, \d0, q4 + vmul.s32 q0, q0, q4 + vmul.s32 \d1, \d1, q4 + vmul.s32 q1, q1, q4 + vshr.s32 \d0, \d0, #4 + vshr.s32 q0, q0, #4 + vshr.s32 \d1, \d1, #4 + vshr.s32 q1, q1, #4 + vaddw.s16 \d0, \d0, d4 + vaddw.s16 q0, q0, d5 + vaddw.s16 \d1, \d1, d6 + vaddw.s16 q1, q1, d7 + vmovn.i32 \d00, \d0 + vmovn.i32 \d01, q0 + vmovn.i32 \d10, \d1 + vmovn.i32 \d11, q1 + vrshl.s16 \d0, \d0, q13 // -intermediate_bits + vrshl.s16 \d1, \d1, q13 // -intermediate_bits + vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits + vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits + vmin.s16 \d0, \d0, q15 // bitdepth_max + vmin.s16 \d1, \d1, q15 // bitdepth_max + vmax.s16 \d0, \d0, q14 // 0 + vmax.s16 \d1, \d1, q14 // 0 +.endm + +.macro mask d0, d00, d01, d1, d10, d11 + vld1.8 {q7}, [r6, :128]! + vld1.16 {q0, q1}, [r2, :128]! + vneg.s8 q7, q7 + vld1.16 {q2, q3}, [r3, :128]! 
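        // q0-q1 hold tmp1, q2-q3 hold tmp2 and q7 the negated 8-bit mask. The
        // widen/multiply/shift/vaddw sequence below computes, per pixel, roughly
        //   tmp2 + (((tmp2 - tmp1) * -m) >> 6)
        // i.e. a 64-weighted blend of the two intermediates, which is then
        // rounding-shifted right by intermediate_bits, re-biased by
        // PREP_BIAS >> intermediate_bits and clamped to [0, bitdepth_max].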
+ vmovl.s8 q6, d14 + vmovl.s8 q7, d15 + vmovl.s16 q4, d12 + vmovl.s16 q5, d13 + vmovl.s16 q6, d14 + vmovl.s16 q7, d15 + vsubl.s16 \d0, d4, d0 + vsubl.s16 q0, d5, d1 + vsubl.s16 \d1, d6, d2 + vsubl.s16 q1, d7, d3 + vmul.s32 \d0, \d0, q4 + vmul.s32 q0, q0, q5 + vmul.s32 \d1, \d1, q6 + vmul.s32 q1, q1, q7 + vshr.s32 \d0, \d0, #6 + vshr.s32 q0, q0, #6 + vshr.s32 \d1, \d1, #6 + vshr.s32 q1, q1, #6 + vaddw.s16 \d0, \d0, d4 + vaddw.s16 q0, q0, d5 + vaddw.s16 \d1, \d1, d6 + vaddw.s16 q1, q1, d7 + vmovn.i32 \d00, \d0 + vmovn.i32 \d01, q0 + vmovn.i32 \d10, \d1 + vmovn.i32 \d11, q1 + vrshl.s16 \d0, \d0, q13 // -intermediate_bits + vrshl.s16 \d1, \d1, q13 // -intermediate_bits + vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits + vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits + vmin.s16 \d0, \d0, q15 // bitdepth_max + vmin.s16 \d1, \d1, q15 // bitdepth_max + vmax.s16 \d0, \d0, q14 // 0 + vmax.s16 \d1, \d1, q14 // 0 +.endm + +.macro bidir_fn type, bdmax +function \type\()_16bpc_neon, export=1 + push {r4-r7,lr} + ldrd r4, r5, [sp, #20] + ldr r6, [sp, #28] + clz r4, r4 +.ifnc \type, avg + ldr r7, [sp, #32] + vmov.i16 q14, #0 + vdup.16 q15, r7 // bitdepth_max +.endif +.ifc \type, w_avg + vpush {q4} +.endif +.ifc \type, mask + vpush {q4-q7} +.endif + clz r7, \bdmax + sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18 +.ifc \type, avg + mov lr, #1 + movw r12, #2*PREP_BIAS + lsl lr, lr, r7 // 1 << intermediate_bits + neg r12, r12 // -2*PREP_BIAS + add r7, r7, #1 + sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits + neg r7, r7 // -(intermediate_bits+1) + vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits + vdup.16 q13, r7 // -(intermediate_bits+1) +.else + mov r12, #PREP_BIAS + lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits + neg r7, r7 // -intermediate_bits + vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits + vdup.16 q13, r7 // -intermediate_bits +.endif +.ifc \type, w_avg + vdup.32 q4, r6 + vneg.s32 q4, q4 +.endif + adr r7, L(\type\()_tbl) + sub r4, r4, #24 + \type q8, d16, d17, q9, d18, d19 + ldr r4, [r7, r4, lsl #2] + add r7, r7, r4 + bx r7 + + .align 2 +L(\type\()_tbl): + .word 1280f - L(\type\()_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_tbl) + CONFIG_THUMB + +40: + add r7, r0, r1 + lsl r1, r1, #1 +4: + subs r5, r5, #4 + vst1.16 {d16}, [r0, :64], r1 + vst1.16 {d17}, [r7, :64], r1 + vst1.16 {d18}, [r0, :64], r1 + vst1.16 {d19}, [r7, :64], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 4b +80: + add r7, r0, r1 + lsl r1, r1, #1 +8: + vst1.16 {q8}, [r0, :128], r1 + subs r5, r5, #2 + vst1.16 {q9}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 8b +160: +16: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #2 + vst1.16 {q10, q11}, [r0, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 16b +320: + add r7, r0, #32 +32: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 32b +640: + add r7, r0, #32 + mov r12, #64 + sub r1, r1, #64 +64: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, 
:128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 64b +1280: + add r7, r0, #32 + mov r12, #64 + sub r1, r1, #192 +128: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 128b +0: +.ifc \type, mask + vpop {q4-q7} +.endif +.ifc \type, w_avg + vpop {q4} +.endif + pop {r4-r7,pc} +endfunc +.endm + +bidir_fn avg, r6 +bidir_fn w_avg, r7 +bidir_fn mask, r7 + + +.macro w_mask_fn type +function w_mask_\type\()_16bpc_neon, export=1 + push {r4-r10,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + ldr r8, [sp, #112] + clz r9, r4 + adr lr, L(w_mask_\type\()_tbl) + vdup.16 q15, r8 // bitdepth_max + sub r9, r9, #24 + clz r8, r8 // clz(bitdepth_max) + ldr r9, [lr, r9, lsl #2] + add r9, lr, r9 + sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 + mov r10, #PREP_BIAS*64 + neg r8, r8 // -sh + movw r12, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd + vdup.32 q14, r8 // -sh + vdup.16 q0, r12 +.if \type == 444 + vmov.i8 q1, #64 +.elseif \type == 422 + vdup.8 d4, r7 + vmov.i8 d2, #129 + vsub.i16 d2, d2, d4 +.elseif \type == 420 + vdup.16 q2, r7 + vmov.i16 q1, #0x100 + vsub.i16 q1, q1, q2 +.endif + add r12, r0, r1 + lsl r1, r1, #1 + bx r9 + + .align 2 +L(w_mask_\type\()_tbl): + .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB + +4: + vld1.16 {q2, q3}, [r2, :128]! // tmp1 (four rows at once) + vld1.16 {q4, q5}, [r3, :128]! // tmp2 (four rows at once) + subs r5, r5, #4 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {q6}, [r6, :128]! 
+.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.8 {d12}, [r6, :64]! +.elseif \type == 420 + vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition) + vadd.i16 d13, d14, d15 + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r12, :64], r1 + vst1.16 {d6}, [r0, :64], r1 + vst1.16 {d7}, [r12, :64], r1 + bgt 4b + vpop {q4-q7} + pop {r4-r10,pc} +8: + vld1.16 {q2, q3}, [r2, :128]! // tmp1 + vld1.16 {q4, q5}, [r3, :128]! // tmp2 + subs r5, r5, #2 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {q6}, [r6, :128]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.8 {d12}, [r6, :64]! +.elseif \type == 420 + vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 8b + vpop {q4-q7} + pop {r4-r10,pc} +1280: +640: +320: +160: + sub r1, r1, r4, lsl #1 +.if \type == 444 + add lr, r6, r4 +.elseif \type == 422 + add lr, r6, r4, lsr #1 +.endif + add r7, r2, r4, lsl #1 + add r9, r3, r4, lsl #1 +161: + mov r8, r4 +16: + vld1.16 {q2}, [r2, :128]! // tmp1 + vld1.16 {q4}, [r3, :128]! // tmp2 + vld1.16 {q3}, [r7, :128]! + vld1.16 {q5}, [r9, :128]! 
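        // Per pixel, the blend weight is derived from how much the two
        // intermediates disagree: 64-m = (27615 - abs(tmp1 - tmp2)) >> mask_sh
        // (see the comments below). The blend itself is, roughly,
        //   ((tmp1 << 6) + (tmp2 - tmp1)*(64 - m) + PREP_BIAS*64) rounded >> sh
        // and the weights are stored at full resolution (444), folded 2:1
        // horizontally (422) or folded 2x2 (420) before being written out.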
+ subs r8, r8, #8 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {d12}, [r6, :64]! + vst1.8 {d13}, [lr, :64]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.32 {d12[0]}, [r6, :32]! + vst1.32 {d12[1]}, [lr, :32]! +.elseif \type == 420 + vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + bgt 16b + subs r5, r5, #2 + add r2, r2, r4, lsl #1 + add r3, r3, r4, lsl #1 + add r7, r7, r4, lsl #1 + add r9, r9, r4, lsl #1 +.if \type == 444 + add r6, r6, r4 + add lr, lr, r4 +.elseif \type == 422 + add r6, r6, r4, lsr #1 + add lr, lr, r4, lsr #1 +.endif + add r0, r0, r1 + add r12, r12, r1 + bgt 161b + vpop {q4-q7} + pop {r4-r10,pc} +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + +function blend_16bpc_neon, export=1 + push {r4-r5,lr} + ldrd r4, r5, [sp, #12] + clz lr, r3 + adr r3, L(blend_tbl) + sub lr, lr, #26 + ldr lr, [r3, lr, lsl #2] + add r3, r3, lr + bx r3 + + .align 2 +L(blend_tbl): + .word 320f - L(blend_tbl) + CONFIG_THUMB + .word 160f - L(blend_tbl) + CONFIG_THUMB + .word 80f - L(blend_tbl) + CONFIG_THUMB + .word 40f - L(blend_tbl) + CONFIG_THUMB + +40: + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld1.8 {d4}, [r5, :64]! + vld1.16 {q1}, [r2, :128]! + vld1.16 {d0}, [r0, :64] + vneg.s8 d4, d4 // -m + subs r4, r4, #2 + vld1.16 {d1}, [r12, :64] + vmovl.s8 q2, d4 + vshl.i16 q2, q2, #9 // -m << 9 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r12, :64], r1 + bgt 4b + pop {r4-r5,pc} +80: + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld1.8 {q8}, [r5, :128]! + vld1.16 {q2, q3}, [r2, :128]! 
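        // The mask is negated and scaled by 1 << 9 below so that vqrdmulh
        // yields the rounded product ((a-b)*-m + 32) >> 6 directly; adding it
        // back to the destination gives, in scalar terms, roughly
        //   dst[x] = (dst[x]*(64 - m[x]) + tmp[x]*m[x] + 32) >> 6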
+ vneg.s8 q9, q8 // -m + vld1.16 {q0}, [r0, :128] + vld1.16 {q1}, [r12, :128] + vmovl.s8 q8, d18 + vmovl.s8 q9, d19 + vshl.i16 q8, q8, #9 // -m << 9 + vshl.i16 q9, q9, #9 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + subs r4, r4, #2 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q9 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + bgt 8b + pop {r4-r5,pc} +160: + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld1.8 {q12, q13}, [r5, :128]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #2 + vneg.s8 q14, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vneg.s8 q15, q13 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + vmovl.s8 q15, d31 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 + vshl.i16 q15, q15, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vqrdmulh.s16 q11, q11, q15 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128], r1 + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +320: + add r12, r0, #32 +32: + vld1.8 {q12, q13}, [r5, :128]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #1 + vneg.s8 q14, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vneg.s8 q15, q13 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + vmovl.s8 q15, d31 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 + vshl.i16 q15, q15, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vqrdmulh.s16 q11, q11, q15 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128], r1 + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5,pc} +endfunc + +function blend_h_16bpc_neon, export=1 + push {r4-r5,lr} + ldr r4, [sp, #12] + movrel r5, X(obmc_masks) + add r5, r5, r4 + sub r4, r4, r4, lsr #2 + clz lr, r3 + adr r12, L(blend_h_tbl) + sub lr, lr, #24 + ldr lr, [r12, lr, lsl #2] + add r12, r12, lr + bx r12 + + .align 2 +L(blend_h_tbl): + .word 1280f - L(blend_h_tbl) + CONFIG_THUMB + .word 640f - L(blend_h_tbl) + CONFIG_THUMB + .word 320f - L(blend_h_tbl) + CONFIG_THUMB + .word 160f - L(blend_h_tbl) + CONFIG_THUMB + .word 80f - L(blend_h_tbl) + CONFIG_THUMB + .word 40f - L(blend_h_tbl) + CONFIG_THUMB + .word 20f - L(blend_h_tbl) + CONFIG_THUMB + +20: + add r12, r0, r1 + lsl r1, r1, #1 +2: + vld2.8 {d4[], d5[]}, [r5, :16]! + vld1.16 {d2}, [r2, :64]! + vext.8 d4, d4, d5, #6 + subs r4, r4, #2 + vneg.s8 d4, d4 // -m + vld1.32 {d0[]}, [r0, :32] + vld1.32 {d0[1]}, [r12, :32] + vmovl.s8 q2, d4 + vshl.i16 d4, d4, #9 // -m << 9 + vsub.i16 d2, d0, d2 // a - b + vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 + vadd.i16 d0, d0, d2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[1]}, [r12, :32], r1 + bgt 2b + pop {r4-r5,pc} +40: + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld2.8 {d4[], d5[]}, [r5, :16]! + vld1.16 {q1}, [r2, :128]! 
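        // blend_h weights whole rows: the vld2 above fetched two consecutive
        // obmc_masks[] bytes (one per output row), and only 3/4 of the block
        // height is blended (r4 was set to h - h/4 on entry). blend_v, further
        // down, is the column-wise counterpart with one fixed mask per column.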
+ vext.8 d4, d4, d5, #4 + subs r4, r4, #2 + vneg.s8 d4, d4 // -m + vld1.16 {d0}, [r0, :64] + vld1.16 {d1}, [r12, :64] + vmovl.s8 q2, d4 + vshl.i16 q2, q2, #9 // -m << 9 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r12, :64], r1 + bgt 4b + pop {r4-r5,pc} +80: + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld2.8 {d16[], d17[]}, [r5, :16]! + vld1.16 {q2, q3}, [r2, :128]! + vneg.s8 q9, q8 // -m + vld1.16 {q0}, [r0, :128] + subs r4, r4, #2 + vmovl.s8 q8, d18 + vmovl.s8 q9, d19 + vld1.16 {q1}, [r12, :128] + vshl.i16 q8, q8, #9 // -m << 9 + vshl.i16 q9, q9, #9 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q9 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + bgt 8b + pop {r4-r5,pc} +160: + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld2.8 {d24[], d25[]}, [r5, :16]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #2 + vneg.s8 q13, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vmovl.s8 q12, d26 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q13, d27 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q12 + vqrdmulh.s16 q10, q10, q13 + vqrdmulh.s16 q11, q11, q13 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vadd.i16 q3, q3, q11 + vst1.16 {q0, q1}, [r0, :128], r1 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +1280: +640: +320: + sub r1, r1, r3, lsl #1 +321: + vld1.8 {d24[]}, [r5]! + mov r12, r3 + vneg.s8 d24, d24 // -m + vmovl.s8 q12, d24 + vshl.i16 q12, q12, #9 // -m << 9 +32: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {q0, q1}, [r0, :128]! + subs r12, r12, #32 + vld1.16 {q10, q11}, [r2, :128]! + vld1.16 {q2, q3}, [r0, :128] + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + sub r0, r0, #32 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q12 + vqrdmulh.s16 q10, q10, q12 + vqrdmulh.s16 q11, q11, q12 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r0, :128]! + bgt 32b + subs r4, r4, #1 + add r0, r0, r1 + bgt 321b + pop {r4-r5,pc} +endfunc + +function blend_v_16bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + movrel lr, X(obmc_masks) + add lr, lr, r3 + clz r12, r3 + adr r3, L(blend_v_tbl) + sub r12, r12, #26 + ldr r12, [r3, r12, lsl #2] + add r3, r3, r12 + bx r3 + + .align 2 +L(blend_v_tbl): + .word 320f - L(blend_v_tbl) + CONFIG_THUMB + .word 160f - L(blend_v_tbl) + CONFIG_THUMB + .word 80f - L(blend_v_tbl) + CONFIG_THUMB + .word 40f - L(blend_v_tbl) + CONFIG_THUMB + .word 20f - L(blend_v_tbl) + CONFIG_THUMB + +20: + add r12, r0, r1 + lsl r1, r1, #1 + vld1.8 {d4[]}, [lr] + vneg.s8 d4, d4 // -m + vmovl.s8 q2, d4 + vshl.i16 d4, d4, #9 // -m << 9 +2: + vld1.32 {d2[]}, [r2, :32]! 
+ vld1.16 {d0[]}, [r0, :16] + subs r4, r4, #2 + vld1.16 {d2[1]}, [r2, :16] + vld1.16 {d0[1]}, [r12, :16] + add r2, r2, #4 + vsub.i16 d2, d0, d2 // a - b + vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 + vadd.i16 d0, d0, d2 + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d0[1]}, [r12, :16], r1 + bgt 2b + pop {r4,pc} +40: + vld1.32 {d4[]}, [lr, :32] + add r12, r0, r1 + vneg.s8 d4, d4 // -m + lsl r1, r1, #1 + vmovl.s8 q2, d4 + sub r1, r1, #4 + vshl.i16 q2, q2, #9 // -m << 9 +4: + vld1.16 {q1}, [r2, :128]! + vld1.16 {d0}, [r0, :64] + vld1.16 {d1}, [r12, :64] + subs r4, r4, #2 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.32 {d0[0]}, [r0, :32]! + vst1.32 {d1[0]}, [r12, :32]! + vst1.16 {d0[2]}, [r0, :16], r1 + vst1.16 {d1[2]}, [r12, :16], r1 + bgt 4b + pop {r4,pc} +80: + vld1.8 {d16}, [lr, :64] + add r12, r0, r1 + vneg.s8 d16, d16 // -m + lsl r1, r1, #1 + vmovl.s8 q8, d16 + sub r1, r1, #8 + vshl.i16 q8, q8, #9 // -m << 9 +8: + vld1.16 {q2, q3}, [r2, :128]! + vld1.16 {q0}, [r0, :128] + vld1.16 {q1}, [r12, :128] + subs r4, r4, #2 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q8 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {d0}, [r0, :64]! + vst1.16 {d2}, [r12, :64]! + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d3[0]}, [r12, :32], r1 + bgt 8b + pop {r4,pc} +160: + vld1.8 {q12}, [lr, :128] + add r12, r0, r1 + vneg.s8 q13, q12 // -m + lsl r1, r1, #1 + vmovl.s8 q12, d26 + vmovl.s8 q13, d27 + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 d26, d26, #9 +16: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {d0, d1, d2}, [r0, :64] + subs r4, r4, #2 + vld1.16 {q10, q11}, [r2, :128]! + vsub.i16 q8, q0, q8 // a - b + vld1.16 {d4, d5, d6}, [r12, :64] + vsub.i16 d18, d2, d18 + vsub.i16 q10, q2, q10 + vsub.i16 d22, d6, d22 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 d18, d18, d26 + vqrdmulh.s16 q10, q10, q12 + vqrdmulh.s16 d22, d22, d26 + vadd.i16 q0, q0, q8 + vadd.i16 d2, d2, d18 + vadd.i16 q2, q2, q10 + vst1.16 {d0, d1, d2}, [r0, :64], r1 + vadd.i16 d6, d6, d22 + vst1.16 {d4, d5, d6}, [r12, :64], r1 + bgt 16b + pop {r4,pc} +320: + vld1.8 {d24, d25, d26}, [lr, :64] + vneg.s8 q14, q12 // -m + vneg.s8 d30, d26 + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + sub r1, r1, #32 + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 +32: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {q0, q1}, [r0, :128]! + subs r4, r4, #1 + vld1.16 {q10}, [r2, :128] + vsub.i16 q8, q0, q8 // a - b + vld1.16 {q2}, [r0, :128] + sub r0, r0, #32 + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128]! + add r2, r2, #32 + vst1.16 {q2}, [r0, :128], r1 + bgt 32b + pop {r4,pc} +endfunc + +// This has got the same signature as the put_8tap functions, +// and assumes that r9 is set to (clz(w)-24). 
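// It is the no-subpel fallback, reached when both the horizontal and vertical
// fractional offsets are zero, and amounts to a width-specialized row copy;
// in scalar terms (16 bpc, byte strides), roughly:
//   for (int y = 0; y < h; y++)
//       memcpy((char*)dst + y*dst_stride, (const char*)src + y*src_stride, w*2);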
+function put_neon + adr r10, L(put_tbl) + ldr r9, [r10, r9, lsl #2] + add r10, r10, r9 + bx r10 + + .align 2 +L(put_tbl): + .word 1280f - L(put_tbl) + CONFIG_THUMB + .word 640f - L(put_tbl) + CONFIG_THUMB + .word 320f - L(put_tbl) + CONFIG_THUMB + .word 16f - L(put_tbl) + CONFIG_THUMB + .word 80f - L(put_tbl) + CONFIG_THUMB + .word 4f - L(put_tbl) + CONFIG_THUMB + .word 2f - L(put_tbl) + CONFIG_THUMB + +2: + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 2b + pop {r4-r11,pc} +4: + vld1.16 {d0}, [r2], r3 + vld1.16 {d1}, [r2], r3 + subs r5, r5, #2 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r0, :64], r1 + bgt 4b + pop {r4-r11,pc} +80: + add r8, r0, r1 + lsl r1, r1, #1 + add r9, r2, r3 + lsl r3, r3, #1 +8: + vld1.16 {q0}, [r2], r3 + vld1.16 {q1}, [r9], r3 + subs r5, r5, #2 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r8, :128], r1 + bgt 8b + pop {r4-r11,pc} +16: + vld1.16 {q0, q1}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q0, q1}, [r0, :128], r1 + bgt 16b + pop {r4-r11,pc} +320: + sub r1, r1, #32 + sub r3, r3, #32 +32: + vld1.16 {q0, q1}, [r2]! + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q2, q3}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 32b + pop {r4-r11,pc} +640: + sub r1, r1, #96 + sub r3, r3, #96 +64: + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q14, q15}, [r0, :128], r1 + bgt 64b + pop {r4-r11,pc} +1280: + sub r1, r1, #224 + sub r3, r3, #224 +128: + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2]! + vst1.16 {q14, q15}, [r0, :128]! + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q14, q15}, [r0, :128], r1 + bgt 128b + pop {r4-r11,pc} +endfunc + +// This has got the same signature as the prep_8tap functions, +// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and +// r8 to w*2. +function prep_neon + adr r10, L(prep_tbl) + ldr r9, [r10, r9, lsl #2] + vdup.16 q15, r7 // intermediate_bits + vmov.i16 q14, #PREP_BIAS + add r10, r10, r9 + bx r10 + + .align 2 +L(prep_tbl): + .word 1280f - L(prep_tbl) + CONFIG_THUMB + .word 640f - L(prep_tbl) + CONFIG_THUMB + .word 320f - L(prep_tbl) + CONFIG_THUMB + .word 16f - L(prep_tbl) + CONFIG_THUMB + .word 80f - L(prep_tbl) + CONFIG_THUMB + .word 40f - L(prep_tbl) + CONFIG_THUMB + +40: + add r9, r1, r2 + lsl r2, r2, #1 +4: + vld1.16 {d0}, [r1], r2 + vld1.16 {d1}, [r9], r2 + subs r4, r4, #2 + vshl.s16 q0, q0, q15 + vsub.i16 q0, q0, q14 + vst1.16 {q0}, [r0, :128]! + bgt 4b + pop {r4-r11,pc} +80: + add r9, r1, r2 + lsl r2, r2, #1 +8: + vld1.16 {q0}, [r1], r2 + vld1.16 {q1}, [r9], r2 + subs r4, r4, #2 + vshl.s16 q0, q0, q15 + vshl.s16 q1, q1, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vst1.16 {q0, q1}, [r0, :128]! 
+ bgt 8b + pop {r4-r11,pc} +16: + vld1.16 {q0, q1}, [r1], r2 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1], r2 + subs r4, r4, #2 + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vshl.s16 q3, q3, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q3, q3, q14 + vst1.16 {q2, q3}, [r0, :128]! + bgt 16b + pop {r4-r11,pc} +320: + sub r2, r2, #32 +32: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1], r2 + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vshl.s16 q3, q3, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q3, q3, q14 + vst1.16 {q2, q3}, [r0, :128]! + bgt 32b + pop {r4-r11,pc} +640: + sub r2, r2, #96 +64: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1]! + vshl.s16 q1, q1, q15 + vld1.16 {q8, q9}, [r1]! + vshl.s16 q2, q2, q15 + vld1.16 {q10, q11}, [r1], r2 + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q9, q9, q14 + vst1.16 {q2, q3}, [r0, :128]! + vsub.i16 q10, q10, q14 + vst1.16 {q8, q9}, [r0, :128]! + vsub.i16 q11, q11, q14 + vst1.16 {q10, q11}, [r0, :128]! + bgt 64b + pop {r4-r11,pc} +1280: + sub r2, r2, #224 +128: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1]! + vshl.s16 q1, q1, q15 + vld1.16 {q8, q9}, [r1]! + vshl.s16 q2, q2, q15 + vld1.16 {q10, q11}, [r1]! + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q0, q1}, [r1]! + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vst1.16 {q2, q3}, [r0, :128]! + vld1.16 {q2, q3}, [r1]! + vsub.i16 q11, q11, q14 + vshl.s16 q0, q0, q15 + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q8, q9}, [r1]! + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q10, q11}, [r1], r2 + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q9, q9, q14 + vst1.16 {q2, q3}, [r0, :128]! + vsub.i16 q10, q10, q14 + vst1.16 {q8, q9}, [r0, :128]! + vsub.i16 q11, q11, q14 + vst1.16 {q10, q11}, [r0, :128]! 
+ bgt 128b + pop {r4-r11,pc} +endfunc + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + vld1.\wd {\d0[]}, [\s0], \strd + vld1.\wd {\d1[]}, [\s1], \strd +.ifnb \d2 + vld1.\wd {\d2[]}, [\s0], \strd + vld1.\wd {\d3[]}, [\s1], \strd +.endif +.ifnb \d4 + vld1.\wd {\d4[]}, [\s0], \strd +.endif +.ifnb \d5 + vld1.\wd {\d5[]}, [\s1], \strd +.endif +.ifnb \d6 + vld1.\wd {\d6[]}, [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + vld1.16 {\d0}, [\s0], \strd + vld1.16 {\d1}, [\s1], \strd +.ifnb \d2 + vld1.16 {\d2}, [\s0], \strd + vld1.16 {\d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.16 {\d4}, [\s0], \strd +.endif +.ifnb \d5 + vld1.16 {\d5}, [\s1], \strd +.endif +.ifnb \d6 + vld1.16 {\d6}, [\s0], \strd +.endif +.endm +.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5 + vld1.16 {\d0, \d1}, [\s0], \strd +.ifnb \d2 + vld1.16 {\d2, \d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.16 {\d4, \d5}, [\s0], \strd +.endif +.endm +.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5 + load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5 +.endm +.macro interleave_1_32 r0, r1, r2, r3, r4 + vext.8 \r0, \r0, \r1, #4 + vext.8 \r1, \r1, \r2, #4 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #4 + vext.8 \r3, \r3, \r4, #4 +.endif +.endm +.macro vmin_u16 c, r0, r1, r2, r3 + vmin.u16 \r0, \r0, \c +.ifnb \r1 + vmin.u16 \r1, \r1, \c +.endif +.ifnb \r2 + vmin.u16 \r2, \r2, \c + vmin.u16 \r3, \r3, \c +.endif +.endm +.macro vsub_i16 c, r0, r1, r2, r3 + vsub.i16 \r0, \r0, \c +.ifnb \r1 + vsub.i16 \r1, \r1, \c +.endif +.ifnb \r2 + vsub.i16 \r2, \r2, \c + vsub.i16 \r3, \r3, \c +.endif +.endm +.macro vmull_vmlal_4 d, s0, s1, s2, s3 + vmull.s16 \d, \s0, d0[0] + vmlal.s16 \d, \s1, d0[1] + vmlal.s16 \d, \s2, d0[2] + vmlal.s16 \d, \s3, d0[3] +.endm +.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 + vmull.s16 \d, \s0, d0[0] + vmlal.s16 \d, \s1, d0[1] + vmlal.s16 \d, \s2, d0[2] + vmlal.s16 \d, \s3, d0[3] + vmlal.s16 \d, \s4, d1[0] + vmlal.s16 \d, \s5, d1[1] + vmlal.s16 \d, \s6, d1[2] + vmlal.s16 \d, \s7, d1[3] +.endm +.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3 + vqrshrun.s32 \d0, \q0, #\shift +.ifnb \q1 + vqrshrun.s32 \d1, \q1, #\shift +.endif +.ifnb \q2 + vqrshrun.s32 \d2, \q2, #\shift + vqrshrun.s32 \d3, \q3, #\shift +.endif +.endm +.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3 + vmovn.i32 \d0, \q0 +.ifnb \q1 + vmovn.i32 \d1, \q1 +.endif +.ifnb \q2 + vmovn.i32 \d2, \q2 + vmovn.i32 \d3, \q3 +.endif +.endm +.macro vrshl_s32 shift, r0, r1, r2, r3 + vrshl.s32 \r0, \r0, \shift + vrshl.s32 \r1, \r1, \shift +.ifnb \r2 + vrshl.s32 \r2, \r2, \shift + vrshl.s32 \r3, \r3, \shift +.endif +.endm +.macro vst1_32 strd, r0, r1 + vst1.32 {\r0[0]}, [r0, :32], \strd + vst1.32 {\r0[1]}, [r9, :32], \strd +.ifnb \r1 + vst1.32 {\r1[0]}, [r0, :32], \strd + vst1.32 {\r1[1]}, [r9, :32], \strd +.endif +.endm +.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 + vst1.16 {\r0}, [r0, \align], \strd + vst1.16 {\r1}, [r9, \align], \strd +.ifnb \r2 + vst1.16 {\r2}, [r0, \align], \strd + vst1.16 {\r3}, [r9, \align], \strd +.endif +.ifnb \r4 + vst1.16 {\r4}, [r0, \align], \strd + vst1.16 {\r5}, [r9, \align], \strd + vst1.16 {\r6}, [r0, \align], \strd + vst1.16 {\r7}, [r9, \align], \strd +.endif +.endm +.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3 +.ifc \type, put + vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + vmin_u16 q15, \q0, \q1 
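        // put: saturating narrow with a 6-bit rounding shift, then clamp to
        // bitdepth_max (q15). The prep branch below instead rounding-shifts by
        // 6-intermediate_bits, narrows and subtracts PREP_BIAS.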
+.else + vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits) + vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + vsub_i16 q15, \q0, \q1 // PREP_BIAS +.endif +.endm +.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :64, \d0, \d1, \d2, \d3 +.endm +.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :128, \q0, \q1 +.endm +.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1.16 {\q0, \q1}, [r0, :128], \strd +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + movw r9, \type_h + movw r10, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2 +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, r11 + mul \my, \my, r11 + add \mx, \mx, r9 // mx, 8tap_h, 4tap_h + add \my, \my, r10 // my, 8tap_v, 4tap_v + +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + + vdup.16 q15, \bdmax // bitdepth_max + clz \bdmax, \bdmax + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + tst \mx, #(0x7f << 14) + sub r9, r9, #24 + add lr, \bdmax, #6 // 6 + intermediate_bits + rsb r12, \bdmax, #6 // 6 - intermediate_bits + movrel r11, X(mc_subpel_filters), -8 + bne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + bne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx r10, \mx, #7, #7 + and \mx, \mx, #0x7f + it gt + movgt \mx, r10 + tst \my, #(0x7f << 14) + add \mx, r11, \mx, lsl #3 + bne L(\type\()_8tap_hv) + + adr r10, L(\type\()_8tap_h_tbl) + vdup.32 q14, r12 // 6 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s32 q14, q14 // -(6-intermediate_bits) +.ifc \type, put + vdup.16 q13, \bdmax // intermediate_bits +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q13, q13 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_8tap_h_tbl): + .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +2: + vld1.16 {q2}, [\src], \s_strd + vld1.16 {q3}, [\sr2], \s_strd 
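        // The 2- and 4-wide horizontal paths load only the four coefficients at
        // offset 2 of the selected subpel filter (\mx was advanced by 2 above);
        // each output pixel is a four-tap multiply-accumulate over consecutive
        // source samples, rounding-shifted by 6-intermediate_bits and, for put,
        // by intermediate_bits again before clamping to bitdepth_max.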
+ vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 + subs \h, \h, #2 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vmull.s16 q1, d4, d0[0] + vmlal.s16 q1, d5, d0[1] + vmlal.s16 q1, d6, d0[2] + vmlal.s16 q1, d7, d0[3] + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vqmovun.s32 d2, q1 + vrshl.s16 d2, d2, d26 // -intermediate_bits + vmin.u16 d2, d2, d30 + vst1.32 {d2[0]}, [\dst, :32], \d_strd + vst1.32 {d2[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q11}, [\sr2], \s_strd + vext.8 d18, d16, d17, #2 + vext.8 d19, d16, d17, #4 + vext.8 d20, d16, d17, #6 + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d21, d22, d23, #6 + subs \h, \h, #2 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmlal.s16 q2, d20, d0[3] + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d4, q2 + vmovn.s32 d5, q3 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: +160: +320: +640: +1280: // 8xN, 16xN, 32xN, ... h + vpush {q4-q5} + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #6 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +81: + vld1.16 {q8, q9}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + mov \mx, \w + +8: + vmull.s16 q1, d16, d0[0] + vmull.s16 q2, d17, d0[0] + vmull.s16 q3, d20, d0[0] + vmull.s16 q4, d21, d0[0] +.irpc i, 1234567 + vext.8 q12, q8, q9, #(2*\i) + vext.8 q5, q10, q11, #(2*\i) +.if \i < 4 + vmlal.s16 q1, d24, d0[\i] + vmlal.s16 q2, d25, d0[\i] + vmlal.s16 q3, d10, d0[\i] + vmlal.s16 q4, d11, d0[\i] +.else + vmlal.s16 q1, d24, d1[\i-4] + vmlal.s16 q2, d25, d1[\i-4] + vmlal.s16 q3, d10, d1[\i-4] + vmlal.s16 q4, d11, d1[\i-4] +.endif +.endr + subs \mx, \mx, #8 + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q4, q4, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d2, q1 + vqmovun.s32 d3, q2 + vqmovun.s32 d4, q3 + vqmovun.s32 d5, q4 + vrshl.s16 q1, q1, q13 // -intermediate_bits + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q1, q1, q15 + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d2, q1 + vmovn.s32 d3, q2 + vmovn.s32 d4, q3 + vmovn.s32 d5, q4 + vsub.i16 q1, q1, q13 // PREP_BIAS + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {q1}, [\dst, :128]! + vst1.16 {q2}, [\ds2, :128]! + ble 9f + + vmov q8, q9 + vmov q10, q11 + vld1.16 {q9}, [\src]! + vld1.16 {q11}, [\sr2]! 
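+ // Slide the window along the row: the previously loaded right halves
+ // become the new left halves, so the overlapping taps are not reloaded.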
+ b 8b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 81b + vpop {q4-q5} + pop {r4-r11,pc} + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 + add \my, r11, \my, lsl #3 + +.ifc \type, prep + vdup.32 q14, r12 // 6 - intermediate_bits + vmov.i16 q15, #PREP_BIAS +.endif + adr r10, L(\type\()_8tap_v_tbl) + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vneg.s32 q14, q14 // -(6-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_8tap_v_tbl): + .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + bgt 28f + + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + // 2x2 v + load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_32 d1, d2, d3, d4, d5 + bgt 24f + vmull_vmlal_4 q8, d1, d2, d3, d4 + vqrshrun_s32 6, q8, d16 + vmin_u16 d30, d16 + vst1_32 \d_strd, d16 + pop {r4-r11,pc} + +24: // 2x4 v + load_32 \sr2, \src, \s_strd, d6, d7 + interleave_1_32 d5, d6, d7 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d3, d4, d5, d6 + vqrshrun_s32 6, q8, d16, q9, d17 + vmin_u16 q15, q8 + vst1_32 \d_strd, d16, d17 + pop {r4-r11,pc} + +28: // 2x6, 2x8, 2x12, 2x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 + interleave_1_32 d2, d3, d4, d5, d6 + interleave_1_32 d6, d7, d16 +216: + subs \h, \h, #4 + load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 + interleave_1_32 d16, d17, d18, d19, d20 + vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 + vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 + vqrshrun_s32 6, q13, d26, q1, d27 + vmin_u16 q15, q13 + vst1_32 \d_strd, d26, d27 + ble 0f + cmp \h, #2 + vmov q1, q3 + vmov q2, q8 + vmov q3, q9 + vmov d16, d20 + beq 26f + b 216b +26: + load_32 \sr2, \src, \s_strd, d17, d18 + interleave_1_32 d16, d17, d18 + vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 + vqrshrun_s32 6, q13, d26 + vmin_u16 d30, d26 + vst1_32 \d_strd, d26 +0: + pop {r4-r11,pc} +.endif + +40: + bgt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d2, d3, d4, d5 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 + ble 0f + load_reg \sr2, \src, \s_strd, d6, d7 + vmull_vmlal_4 q8, d3, d4, d5, d6 + vmull_vmlal_4 q9, d4, d5, d6, d7 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 +0: + pop {r4-r11,pc} + +480: // 4x6, 4x8, 4x12, 4x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, 
\s_strd, d16, d17, d18, d19, d20, d21, d22 + +48: + subs \h, \h, #4 + load_reg \sr2, \src, \s_strd, d23, d24, d25, d26 + vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 + vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 + vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25 + vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 + shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 + ble 0f + cmp \h, #2 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov d22, d26 + beq 46f + b 48b +46: + load_reg \sr2, \src, \s_strd, d23, d24 + vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 + vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 + shift_store_4 \type, \d_strd, q1, q2, d2, d3 +0: + pop {r4-r11,pc} + +80: + bgt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9 + vmull_vmlal_4 q10, d2, d4, d6, d16 + vmull_vmlal_4 q11, d3, d5, d7, d17 + vmull_vmlal_4 q12, d4, d6, d16, d18 + vmull_vmlal_4 q13, d5, d7, d17, d19 + shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23 + ble 0f + load_reg \sr2, \src, \s_strd, q10, q11 + vmull_vmlal_4 q1, d6, d16, d18, d20 + vmull_vmlal_4 q2, d7, d17, d19, d21 + vmull_vmlal_4 q12, d16, d18, d20, d22 + vmull_vmlal_4 q13, d17, d19, d21, d23 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5 +0: + pop {r4-r11,pc} + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\my, :64] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 + +88: + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q12, q13 + vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24 + vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25 + vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26 + vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q1, q2 + vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2 + vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3 + vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4 + vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5 + shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9 + ble 9f + vmov q5, q9 + vmov q6, q10 + vmov q7, q11 + vmov q8, q12 + vmov q9, q13 + vmov q10, q1 + vmov q11, q2 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +160: + bgt 1680b + + // 16x2, 16x4 v + vpush {q6-q7} + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + + load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11 +16: + load_16s16 \src, \src, \s_strd, q12, q13 + subs \h, \h, #1 + vmull_vmlal_4 q1, d12, d16, d20, d24 + vmull_vmlal_4 q2, d13, d17, d21, d25 + vmull_vmlal_4 q3, d14, d18, d22, d26 + vmull_vmlal_4 q6, d15, d19, d23, d27 + shift_store_16 \type, 
\d_strd, q1, q2, d2, d3, q3, q6, d4, d5 + ble 0f + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov q11, q13 + b 16b +0: + vpop {q6-q7} + pop {r4-r11,pc} + + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 +4: + add \my, r11, \my, lsl #3 + + adr r10, L(\type\()_8tap_hv_tbl) + neg r12, r12 // -(6-intermediate_bits) + ldr r9, [r10, r9, lsl #2] + vdup.32 q14, r12 // -(6-intermediate_bits) +.ifc \type, put + neg r8, lr // -(6+intermeidate_bits) +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vdup.32 q13, r8 // -(6+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_8tap_hv_tbl): + .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + +20: +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 280f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + + // 2x2, 2x4 hv + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + +2: + bl L(\type\()_8tap_filter_2) + + vext.8 d18, d17, d24, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d24, d2[3] + + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vmin.u16 d4, d4, d30 + subs \h, \h, #2 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d24 + b 2b + +280: // 2x8, 2x16, 2x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d18, d17, d24, #4 + vmov d19, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d20, d19, d24, #4 + vmov d21, d24 + +28: + bl L(\type\()_8tap_filter_2) + vext.8 d22, d21, d24, #4 + vmull.s16 q3, d16, d2[0] + vmlal.s16 q3, d17, d2[1] + vmlal.s16 q3, d18, d2[2] + vmlal.s16 q3, d19, d2[3] + vmlal.s16 q3, d20, d3[0] + vmlal.s16 q3, d21, d3[1] + vmlal.s16 q3, d22, d3[2] + vmlal.s16 q3, d24, d3[3] + + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d6, q3 + vmin.u16 d6, d6, d30 + subs \h, \h, #2 + vst1.32 {d6[0]}, [\dst, :32], \d_strd + vst1.32 {d6[1]}, [\ds2, :32], \d_strd + ble 0f + vmov q8, q9 + vmov q9, q10 + vmov d20, d22 + vmov d21, d24 + b 28b +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_2): + vld1.16 
{q11}, [\sr2], \s_strd + vld1.16 {q12}, [\src], \s_strd + vext.8 d23, d22, d23, #2 + vext.8 d25, d24, d25, #2 + vtrn.32 q11, q12 + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d23, d0[1] + vmlal.s16 q3, d24, d0[2] + vmlal.s16 q3, d25, d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + bx lr +.endif + +40: + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 480f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + // 4x2, 4x4 hv + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d17, q10 + + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +4: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d17, d2[0] + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q2, d19, d2[2] + vmlal.s16 q2, d24, d2[3] + vmull.s16 q3, d18, d2[0] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q3, d24, d2[2] + vmlal.s16 q3, d25, d2[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d17, d19 + vmov q9, q12 + b 4b +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16, 4x32 hv + vpush {d13-d15} + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d13, q10 + + bl L(\type\()_8tap_filter_4) + vmov q7, q12 + bl L(\type\()_8tap_filter_4) + vmov q8, q12 + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +48: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d13, d2[0] + vmlal.s16 q2, d14, d2[1] + vmlal.s16 q2, d15, d2[2] + vmlal.s16 q2, d16, d2[3] + vmlal.s16 q2, d17, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q2, d19, d3[2] + vmlal.s16 q2, d24, d3[3] + vmull.s16 q3, d14, d2[0] + vmlal.s16 q3, d15, d2[1] + vmlal.s16 q3, d16, d2[2] + vmlal.s16 q3, d17, d2[3] + vmlal.s16 q3, d18, d3[0] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q3, d24, d3[2] + vmlal.s16 q3, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d13, d15 + vmov q7, q8 + vmov q8, q9 + vmov q9, q12 + b 48b +0: + vpop {d13-d15} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_4): + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d20, d21, #2 + vext.8 d25, d20, d21, #4 + vext.8 d21, d20, d21, #6 + vmull.s16 q3, d20, d0[0] + vmlal.s16 q3, d24, d0[1] 
+ vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + vmovn.i32 d25, q10 + bx lr + +80: +160: +320: + bgt 880f + add \my, \my, #2 + vld1.8 {d0}, [\mx, :64] + vld1.32 {d2[]}, [\my] + sub \src, \src, #6 + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d16, q2 + vmovn.i32 d17, q3 + + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +8: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d16, d2[0] + vmull.s16 q3, d17, d2[0] + vmull.s16 q13, d18, d2[0] + vmull.s16 q14, d19, d2[0] +.ifc \type, put + vdup.32 q8, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q13, d20, d2[1] + vmlal.s16 q14, d21, d2[1] + vmlal.s16 q2, d20, d2[2] + vmlal.s16 q3, d21, d2[2] + vmlal.s16 q13, d22, d2[2] + vmlal.s16 q14, d23, d2[2] + vmlal.s16 q2, d22, d2[3] + vmlal.s16 q3, d23, d2[3] + vmlal.s16 q13, d24, d2[3] + vmlal.s16 q14, d25, d2[3] +.ifc \type, put + vdup.16 q9, \bdmax // bitdepth_max + vrshl.s32 q2, q2, q8 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q8 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q8 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q8 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q9, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q9 // PREP_BIAS + vsub.i16 q3, q3, q9 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 8b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 164b +0: + pop {r4-r11,pc} + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] + sub \src, \src, #6 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d8, q2 + vmovn.i32 d9, q3 + + bl L(\type\()_8tap_filter_8) + vmov q5, q11 + vmov q6, q12 + bl L(\type\()_8tap_filter_8) + vmov q7, q11 + vmov q8, q12 + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +88: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d8, d2[0] + vmull.s16 q3, d9, d2[0] + vmull.s16 q13, d10, d2[0] + vmull.s16 q14, d11, d2[0] +.ifc \type, put + vdup.32 q4, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d10, d2[1] + vmlal.s16 q3, d11, d2[1] + vmlal.s16 q13, d12, d2[1] + vmlal.s16 q14, d13, d2[1] + vmlal.s16 q2, d12, d2[2] + vmlal.s16 q3, d13, d2[2] + vmlal.s16 q13, d14, d2[2] + vmlal.s16 q14, d15, d2[2] + vmlal.s16 q2, d14, d2[3] + vmlal.s16 q3, d15, d2[3] + vmlal.s16 q13, d16, d2[3] + vmlal.s16 q14, d17, d2[3] + vmlal.s16 q2, d16, d3[0] + vmlal.s16 q3, d17, d3[0] + vmlal.s16 q13, d18, d3[0] + vmlal.s16 q14, d19, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q13, d20, d3[1] + vmlal.s16 q14, d21, d3[1] + vmlal.s16 q2, d20, d3[2] + vmlal.s16 q3, d21, d3[2] + vmlal.s16 q13, d22, d3[2] + vmlal.s16 q14, d23, d3[2] + vmlal.s16 q2, d22, d3[3] + vmlal.s16 q3, d23, d3[3] + vmlal.s16 q13, d24, d3[3] + vmlal.s16 q14, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q4 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q4 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q4 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q4 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q5, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q5 // PREP_BIAS + vsub.i16 q3, q3, q5 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q4, q6 + vmov q5, q7 + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_8): + vld1.16 {q13, q14}, [\sr2], \s_strd + vmull.s16 q2, d26, d0[0] + vmull.s16 q3, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d24, d0[\i] + vmlal.s16 q3, d25, d0[\i] +.else + vmlal.s16 q2, d24, d1[\i - 4] + vmlal.s16 q3, d25, d1[\i - 4] +.endif +.endr + vdup.32 q12, r12 // -(6-intermediate_bits) + vld1.16 {q13, q14}, [\src], \s_strd + vrshl.s32 q2, q2, q12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q12 // -(6-intermediate_bits) + 
vmovn.i32 d4, q2 + vmovn.i32 d5, q3 + + vmull.s16 q3, d26, d0[0] + vmull.s16 q11, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q3, d24, d0[\i] + vmlal.s16 q11, d25, d0[\i] +.else + vmlal.s16 q3, d24, d1[\i - 4] + vmlal.s16 q11, d25, d1[\i - 4] +.endif +.endr + vdup.32 q13, r12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6-intermediate_bits) + vrshl.s32 q11, q11, q13 // -(6-intermediate_bits) + + vmovn.i32 d24, q3 + vmovn.i32 d25, q11 + vmov q11, q2 + bx lr +endfunc + +function \type\()_bilin_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + vdup.16 q1, \mx + vdup.16 q3, \my + rsb r9, \mx, #16 + rsb r10, \my, #16 + vdup.16 q0, r9 + vdup.16 q2, r10 +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + clz \bdmax, \bdmax // bitdepth_max + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + cmp \mx, #0 + sub r9, r9, #24 + rsb r11, \bdmax, #4 // 4 - intermediate_bits + add r12, \bdmax, #4 // 4 + intermediate_bits + bne L(\type\()_bilin_h) + cmp \my, #0 + bne L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cmp \my, #0 + bne L(\type\()_bilin_hv) + + adr r10, L(\type\()_bilin_h_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.16 q14, \bdmax // intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q14, q14 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_bilin_h_tbl): + .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + vld1.16 {d16}, [\src], \s_strd + vld1.16 {d18}, [\sr2], \s_strd + vext.8 d17, d16, d16, #2 + vext.8 d19, d18, d18, #2 + vtrn.32 d16, d18 + vtrn.32 d17, d19 + subs \h, \h, #2 + vmul.i16 d16, d16, d0 + vmla.i16 d16, d17, d2 + vrshl.u16 d16, d16, d30 + vrshl.u16 d16, d16, d28 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q10}, [\sr2], \s_strd + vext.8 q9, q8, q8, #2 + vext.8 q11, q10, q10, #2 + vmov d17, d20 + vmov d19, d22 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vrshl.u16 q8, q8, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 +.else + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + vld1.16 {d16, d17, d18}, [\src], \s_strd + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vext.8 q9, q8, q9, #2 + vext.8 q11, q10, q11, #2 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q11, q1 + vrshl.u16 q8, q8, q15 + vrshl.u16 q10, q10, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 + 
vrshl.u16 q10, q10, q14 +.else + vsub.i16 q8, q8, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q10}, [\ds2, :128], \d_strd + bgt 8b + pop {r4-r11,pc} +160: +320: +640: +1280: // 16xN, 32xN, ... h + vpush {q4-q7} + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +161: + vld1.16 {q4}, [\src]! + vld1.16 {q9}, [\sr2]! + mov \mx, \w + +16: + vld1.16 {q5, q6}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + vext.8 q7, q4, q5, #2 + vext.8 q8, q5, q6, #2 + vext.8 q12, q9, q10, #2 + vext.8 q13, q10, q11, #2 + vmul.i16 q4, q4, q0 + vmla.i16 q4, q7, q1 + vmul.i16 q5, q5, q0 + vmla.i16 q5, q8, q1 + vmul.i16 q9, q9, q0 + vmla.i16 q9, q12, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q4, q4, q15 + vrshl.u16 q5, q5, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + subs \mx, \mx, #16 +.ifc \type, put + vrshl.u16 q4, q4, q14 + vrshl.u16 q5, q5, q14 + vrshl.u16 q9, q9, q14 + vrshl.u16 q10, q10, q14 +.else + vsub.i16 q4, q4, q14 + vsub.i16 q5, q5, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q4, q5}, [\dst, :128]! + vst1.16 {q9, q10}, [\ds2, :128]! + ble 9f + + vmov q4, q6 + vmov q9, q11 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + vpop {q4-q7} + pop {r4-r11,pc} + + +L(\type\()_bilin_v): + cmp \h, #4 + adr r10, L(\type\()_bilin_v_tbl) +.ifc \type, prep + vdup.16 q15, r11 // 4 - intermediate_bits +.endif + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vmov.i16 q14, #PREP_BIAS + vneg.s16 q15, q15 // -(4-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_bilin_v_tbl): + .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + vld1.32 {d16[]}, [\src], \s_strd + bgt 24f +22: + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vmul.i16 d16, d16, d4 + vmla.i16 d16, d17, d6 + vrshr.u16 d16, d16, #4 + vst1.32 {d16[0]}, [\dst, :32] + vst1.32 {d16[1]}, [\ds2, :32] + pop {r4-r11,pc} +24: // 2x4, 2x6, 2x8, ... 
v + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vld1.32 {d19[]}, [\sr2], \s_strd + vld1.32 {d20[]}, [\src], \s_strd + subs \h, \h, #4 + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vext.8 d18, d18, d19, #4 + vext.8 d19, d19, d20, #4 + vswp d17, d18 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + cmp \h, #2 + vrshr.u16 q8, q8, #4 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + vst1.32 {d17[0]}, [\dst, :32], \d_strd + vst1.32 {d17[1]}, [\ds2, :32], \d_strd + blt 0f + vmov d16, d20 + beq 22b + b 24b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {d16}, [\src], \s_strd +4: + vld1.16 {d17}, [\sr2], \s_strd + vld1.16 {d19}, [\src], \s_strd + vmov d18, d17 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 +.else + vrshl.u16 q8, q8, q15 + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {q8}, [\src], \s_strd +8: + vld1.16 {q9}, [\sr2], \s_strd + vld1.16 {q10}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q10, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q9}, [\ds2, :128], \d_strd + ble 0f + vmov q8, q10 + b 8b +0: + pop {r4-r11,pc} + +160: // 16xN, 32xN, ... 
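+ // 16xN and all wider blocks fall through to the shared loop below, which
+ // filters one 16-pixel column two rows at a time and then steps 16 pixels
+ // to the right in the outer loop.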
+320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q8, q9}, [\src], \s_strd +2: + vld1.16 {q10, q11}, [\sr2], \s_strd + vld1.16 {q12, q13}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q10, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q11, q3 + vmul.i16 q10, q10, q2 + vmla.i16 q10, q12, q3 + vmul.i16 q11, q11, q2 + vmla.i16 q11, q13, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 + vrshr.u16 q10, q10, #4 + vrshr.u16 q11, q11, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + vrshl.u16 q11, q11, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vsub.i16 q11, q11, q14 +.endif + vst1.16 {q8, q9}, [\dst, :128], \d_strd + vst1.16 {q10, q11}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q12 + vmov q9, q13 + b 2b +9: + subs \w, \w, #16 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #32 + add \dst, \dst, #32 + b 1b +0: + pop {r4-r11,pc} + +L(\type\()_bilin_hv): + adr r10, L(\type\()_bilin_hv_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.32 q14, r12 // 4 + intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s32 q14, q14 // -(4+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_bilin_hv_tbl): + .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + vext.8 d16, d16, d16, #4 + +2: + vld1.16 {d20}, [\sr2], \s_strd + vld1.16 {d22}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vext.8 d23, d22, d22, #2 + vtrn.32 d20, d22 + vtrn.32 d21, d23 + vmul.i16 d18, d20, d0 + vmla.i16 d18, d21, d2 + vrshl.u16 d18, d18, d30 + + vext.8 d16, d16, d18, #4 + + vmull.u16 q8, d16, d4 + vmlal.u16 q8, d18, d6 + vrshl.u32 q8, q8, q14 + vmovn.i32 d16, q8 + subs \h, \h, #2 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + b 2b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q10}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + +4: + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vext.8 d23, d22, d23, #2 + vswp d21, d22 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vrshl.u16 q9, q9, q15 + + vmull.u16 q10, d16, d4 + vmlal.u16 q10, d18, d6 + vmull.u16 q11, d18, d4 + vmlal.u16 q11, d19, d6 +.ifc \type, put + vrshl.u32 q10, q10, q14 + vrshl.u32 q11, q11, q14 + vmovn.i32 d20, q10 + vmovn.i32 d21, q11 +.else + vrshrn.i32 d20, q10, #4 + vrshrn.i32 d21, 
q11, #4 + vsub.i16 q10, q10, q14 +.endif + subs \h, \h, #2 + vst1.16 {d20}, [\dst, :64], \d_strd + vst1.16 {d21}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN, 16xN, ... hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20, d21, d22}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vmul.i16 q8, q10, q0 + vmla.i16 q8, q11, q1 + vrshl.u16 q8, q8, q15 + +2: + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vld1.16 {d24, d25, d26}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vext.8 q13, q12, q13, #2 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vmul.i16 q10, q12, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + + vmull.u16 q11, d16, d4 + vmlal.u16 q11, d18, d6 + vmull.u16 q12, d17, d4 + vmlal.u16 q12, d19, d6 + vmull.u16 q8, d18, d4 + vmlal.u16 q8, d20, d6 + vmull.u16 q9, d19, d4 + vmlal.u16 q9, d21, d6 +.ifc \type, put + vrshl.u32 q11, q11, q14 + vrshl.u32 q12, q12, q14 + vrshl.u32 q8, q8, q14 + vrshl.u32 q9, q9, q14 + vmovn.i32 d22, q11 + vmovn.i32 d23, q12 + vmovn.i32 d16, q8 + vmovn.i32 d17, q9 +.else + vrshrn.i32 d22, q11, #4 + vrshrn.i32 d23, q12, #4 + vrshrn.i32 d16, q8, #4 + vrshrn.i32 d17, q9, #4 + vsub.i16 q11, q11, q14 + vsub.i16 q8, q8, q14 +.endif + subs \h, \h, #2 + vst1.16 {q11}, [\dst, :128], \d_strd + vst1.16 {q8}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 1b +0: + pop {r4-r11,pc} +endfunc +.endm + +filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 +filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 + +.macro load_filter_ptr src + asr r12, \src, #10 + add r12, r11, r12, lsl #3 +.endm + +.macro load_filter_coef dst, src, inc + add \src, \src, \inc + vld1.8 {\dst}, [r12, :64] +.endm + +.macro load_filter_row dst, src, inc + load_filter_ptr \src + load_filter_coef \dst, \src, \inc +.endm + +function warp_filter_horz_neon + load_filter_ptr r5 // filter 0 + vld1.16 {q6,q7}, [r2], r3 + + load_filter_coef d0, r5, r7 // filter 0 + load_filter_row d2, r5, r7 // filter 1 + vmovl.s8 q0, d0 // filter 0 + vext.8 q3, q6, q7, #2*1 // filter 1 pixels + vmovl.s8 q1, d2 // filter 1 + + vmull.s16 q4, d12, d0 // filter 0 output (0-3) + vmull.s16 q5, d13, d1 // filter 0 output (4-7) + + load_filter_ptr r5 // filter 2 + + vmull.s16 q2, d6, d2 // filter 1 output (0-3) + vmull.s16 q3, d7, d3 // filter 1 output (4-7) + + load_filter_coef d0, r5, r7 // filter 2 + + vpadd.i32 d8, d8, d9 // half pixel 0 (2x32) + vpadd.i32 d9, d10, d11 // half pixel 0 (2x32) + + load_filter_ptr r5 // filter 3 + + vpadd.i32 d4, d4, d5 // half pixel 1 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 1 (2x32) + + vmovl.s8 q0, d0 // filter 2 + vext.8 q3, q6, q7, #2*2 // filter 2 pixels + + vpadd.i32 d8, d8, d9 // pixel 0 (2x32) + vpadd.i32 d9, d4, d5 // pixel 1 (2x32) + + load_filter_coef d2, r5, r7 // filter 3 + + vmull.s16 q2, d6, d0 // filter 2 output (0-3) + vmull.s16 q3, d7, d1 // filter 2 output (4-7) + + load_filter_ptr r5 // filter 4 + + vpadd.i32 d8, d8, d9 // pixel 0,1 + + vpadd.i32 d9, d4, d5 // half pixel 2 (2x32) + vpadd.i32 d10, d6, d7 // half pixel 2 (2x32) + + vmovl.s8 q1, d2 // filter 3 + vext.8 q3, q6, q7, #2*3 // filter 3 pixels + + load_filter_coef d0, r5, r7 
// filter 4 + + vpadd.i32 d9, d9, d10 // pixel 2 (2x32) + + vmull.s16 q2, d6, d2 // filter 3 output (0-3) + vmull.s16 q3, d7, d3 // filter 3 output (4-7) + + vmovl.s8 q0, d0 // filter 4 + load_filter_ptr r5 // filter 5 + + vpadd.i32 d10, d4, d5 // half pixel 3 (2x32) + vpadd.i32 d11, d6, d7 // half pixel 3 (2x32) + + vext.8 q3, q6, q7, #2*4 // filter 4 pixels + load_filter_coef d2, r5, r7 // filter 5 + + vpadd.i32 d10, d10, d11 // pixel 3 (2x32) + + vpadd.i32 d9, d9, d10 // pixel 2,3 + + vmull.s16 q2, d6, d0 // filter 4 output (0-3) + vmull.s16 q3, d7, d1 // filter 4 output (4-7) + + vmovl.s8 q1, d2 // filter 5 + load_filter_ptr r5 // filter 6 + + vpadd.i32 d10, d4, d5 // half pixel 4 (2x32) + vpadd.i32 d11, d6, d7 // half pixel 4 (2x32) + + vext.8 q3, q6, q7, #2*5 // filter 5 pixels + load_filter_coef d0, r5, r7 // filter 6 + + vpadd.i32 d10, d10, d11 // pixel 4 (2x32) + + vmull.s16 q2, d6, d2 // filter 5 output (0-3) + vmull.s16 q3, d7, d3 // filter 5 output (4-7) + + vmovl.s8 q0, d0 // filter 6 + load_filter_ptr r5 // filter 7 + + vpadd.i32 d4, d4, d5 // half pixel 5 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 5 (2x32) + + vext.8 q3, q6, q7, #2*6 // filter 6 pixels + load_filter_coef d2, r5, r7 // filter 7 + + vpadd.i32 d11, d4, d5 // pixel 5 (2x32) + + vmull.s16 q2, d6, d0 // filter 6 output (0-3) + vmull.s16 q3, d7, d1 // filter 6 output (4-7) + + vmovl.s8 q1, d2 // filter 7 + + vpadd.i32 d10, d10, d11 // pixel 4,5 + + vpadd.i32 d4, d4, d5 // half pixel 6 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 6 (2x32) + + vext.8 q3, q6, q7, #2*7 // filter 7 pixels + + vpadd.i32 d11, d4, d5 // pixel 6 (2x32) + + vmull.s16 q2, d6, d2 // filter 7 output (0-3) + vmull.s16 q3, d7, d3 // filter 7 output (4-7) + + vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits) + + vpadd.i32 d4, d4, d5 // half pixel 7 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 7 (2x32) + + sub r5, r5, r7, lsl #3 + + vpadd.i32 d4, d4, d5 // pixel 7 (2x32) + + add r5, r5, r8 + + vpadd.i32 d11, d11, d4 // pixel 6,7 + + vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits) + vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits) + + bx lr +endfunc + +// void dav1d_warp_affine_8x8_16bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my, +// const int bitdepth_max) +.macro warp t +function warp_affine_8x8\t\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + sub sp, sp, #8 + + clz r7, r7 + // intermediate_bits = clz(bitdepth_max) - 18 +.ifb \t + sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 +.endif + sub r7, r7, #25 // -(7 - intermediate_bits) +.ifb \t + neg r8, r8 // -(7 + intermediate_bits) +.endif + str r7, [sp] // spill -(7 - intermediate_bits) on stack +.ifb \t + str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack +.endif + + ldrd r8, r9, [r4] + sxth r7, r8 + asr r8, r8, #16 + asr r4, r9, #16 + sxth r9, r9 + mov r10, #8 + sub r2, r2, r3, lsl #1 + sub r2, r2, r3 + sub r2, r2, #6 + movrel r11, X(mc_warp_filter), 64*8 +.ifnb \t + lsl r1, r1, #1 +.endif + add r5, r5, #512 + add r6, r6, #512 + + bl warp_filter_horz_neon + vmovn.i32 d16, q4 + vmovn.i32 d17, q5 + bl warp_filter_horz_neon + vmovn.i32 d18, q4 + vmovn.i32 d19, q5 + bl warp_filter_horz_neon + vmovn.i32 d20, q4 + vmovn.i32 d21, q5 + bl warp_filter_horz_neon + vmovn.i32 d22, q4 + vmovn.i32 d23, q5 + bl warp_filter_horz_neon + vmovn.i32 d24, q4 + vmovn.i32 d25, q5 + bl warp_filter_horz_neon + 
vmovn.i32 d26, q4 + vmovn.i32 d27, q5 + bl warp_filter_horz_neon + vmovn.i32 d28, q4 + vmovn.i32 d29, q5 + +1: + bl warp_filter_horz_neon + vmovn.i32 d30, q4 + vmovn.i32 d31, q5 + + load_filter_row d8, r6, r9 + load_filter_row d9, r6, r9 + load_filter_row d10, r6, r9 + load_filter_row d11, r6, r9 + load_filter_row d12, r6, r9 + load_filter_row d13, r6, r9 + load_filter_row d14, r6, r9 + load_filter_row d15, r6, r9 + transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 + vmovl.s8 q1, d8 + vmovl.s8 q2, d9 + vmovl.s8 q3, d10 + vmovl.s8 q4, d11 + vmovl.s8 q5, d12 + vmovl.s8 q6, d13 + + sub r6, r6, r9, lsl #3 + + // This ordering of vmull/vmlal is highly beneficial for + // Cortex A8/A9/A53 here, but harmful for Cortex A7. + vmull.s16 q0, d16, d2 + vmlal.s16 q0, d18, d4 + vmlal.s16 q0, d20, d6 + vmlal.s16 q0, d22, d8 + vmlal.s16 q0, d24, d10 + vmlal.s16 q0, d26, d12 + vmull.s16 q1, d17, d3 + vmlal.s16 q1, d19, d5 + vmlal.s16 q1, d21, d7 + vmlal.s16 q1, d23, d9 + vmlal.s16 q1, d25, d11 + vmlal.s16 q1, d27, d13 + + vmovl.s8 q2, d14 + vmovl.s8 q3, d15 + + vmlal.s16 q0, d28, d4 + vmlal.s16 q0, d30, d6 + vmlal.s16 q1, d29, d5 + vmlal.s16 q1, d31, d7 + +.ifb \t + ldr lr, [sp, #4] // -(7 + intermediate_bits) + ldr r12, [sp, #120] // bitdepth_max + vdup.32 q2, lr // -(7 + intermediate_bits) + vdup.16 q3, r12 // bitdepth_max +.endif + + vmov q8, q9 + vmov q9, q10 +.ifb \t + vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits) + vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits) +.else + vrshrn.s32 d0, q0, #7 + vrshrn.s32 d1, q1, #7 + vmov.i16 q3, #PREP_BIAS +.endif + vmov q10, q11 +.ifb \t + vqmovun.s32 d0, q0 + vqmovun.s32 d1, q1 +.else + vsub.i16 q0, q0, q3 // PREP_BIAS +.endif + vmov q11, q12 + vmov q12, q13 +.ifb \t + vmin.u16 q0, q0, q3 // bitdepth_max +.endif + vmov q13, q14 + vmov q14, q15 + subs r10, r10, #1 + vst1.16 {q0}, [r0, :128], r1 + + add r6, r6, r4 + bgt 1b + + add sp, sp, #8 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +warp +warp t + +// void dav1d_emu_edge_16bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldrd r8, r9, [sp, #52] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub r12, r3, #1 // ih - 1 + cmp r5, r3 + sub lr, r2, #1 // iw - 1 + it lt + movlt r12, r5 // min(y, ih - 1) + cmp r4, r2 + bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) + it lt + movlt lr, r4 // min(x, iw - 1) + bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) + mla r8, r12, r9, r8 // ref += iclip() * stride + add r8, r8, lr, lsl #1 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add r10, r5, r1 // y + bh + neg r5, r5 // -y + sub r10, r10, r3 // y + bh - ih + sub r12, r1, #1 // bh - 1 + cmp r10, r1 + bic r5, r5, r5, asr #31 // max(-y, 0) + it ge + movge r10, r12 // min(y + bh - ih, bh-1) + cmp r5, r1 + bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) + it ge + movge r5, r12 // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add r11, r4, r0 // x + bw + neg r4, r4 // -x + sub r11, r11, r2 // x + bw - iw + sub lr, r0, #1 // bw - 1 + cmp r11, r0 + bic r4, r4, r4, asr #31 // max(-x, 0) + it ge + movge r11, lr // min(x + bw - iw, bw-1) + 
cmp r4, r0 + bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) + it ge + movge r4, lr // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub r1, r1, r5 // bh - top_ext + mla r6, r5, r7, r6 + sub r2, r0, r4 // bw - left_ext + sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext + sub r2, r2, r11 // center_w = bw - left_ext - right_ext + + mov r0, r6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + vld1.16 {d0[], d1[]}, [r8] + mov r12, r6 // out = dst + mov r3, r4 + vmov q1, q0 +1: + subs r3, r3, #16 + vst1.16 {q0, q1}, [r12, :128]! + bgt 1b +.endif + mov lr, r8 + add r12, r6, r4, lsl #1 // out = dst + left_ext + mov r3, r2 +1: + vld1.16 {q0, q1}, [lr]! + subs r3, r3, #32 + vld1.16 {q2, q3}, [lr]! +.if \need_left + vst1.16 {q0, q1}, [r12]! + vst1.16 {q2, q3}, [r12]! +.else + vst1.16 {q0, q1}, [r12, :128]! + vst1.16 {q2, q3}, [r12, :128]! +.endif + bgt 1b +.if \need_right + add r3, r8, r2, lsl #1 // in + center_w + sub r3, r3, #2 // in + center_w - 1 + add r12, r6, r4, lsl #1 // dst + left_ext + vld1.16 {d0[], d1[]}, [r3] + add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w + mov r3, r11 + vmov q1, q0 +1: + subs r3, r3, #16 + vst1.16 {q0, q1}, [r12]! + bgt 1b +.endif + + subs r1, r1, #1 // center_h-- + add r6, r6, r7 + add r8, r8, r9 + bgt 0b +.endm + + cmp r4, #0 + beq 2f + // need_left + cmp r11, #0 + beq 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cmp r11, #0 + beq 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + cmp r10, #0 + // Storing the original dst in r0 overwrote bw, recalculate it here + add r2, r2, r4 // center_w + left_ext + add r2, r2, r11 // bw = center_w + left_ext + right_ext + + beq 3f + // need_bottom + sub r8, r6, r7 // ref = dst - stride + mov r4, r2 + sub r12, r7, #32 +1: + vld1.16 {q0, q1}, [r8, :128]! + mov r3, r10 + vld1.16 {q2, q3}, [r8, :128]! +2: + vst1.16 {q0, q1}, [r6, :128]! + subs r3, r3, #1 + vst1.16 {q2, q3}, [r6, :128], r12 + bgt 2b + mls r6, r7, r10, r6 // dst -= bottom_ext * stride + subs r4, r4, #32 // bw -= 32 + add r6, r6, #64 // dst += 32 + bgt 1b + +3: + cmp r5, #0 + beq 3f + // need_top + mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride + sub r12, r7, #32 +1: + vld1.16 {q0, q1}, [r0, :128]! + mov r3, r5 + vld1.16 {q2, q3}, [r0, :128]! +2: + vst1.16 {q0, q1}, [r6, :128]! + subs r3, r3, #1 + vst1.16 {q2, q3}, [r6, :128], r12 + bgt 2b + mls r6, r7, r5, r6 // dst -= top_ext * stride + subs r2, r2, #32 // bw -= 32 + add r6, r6, #64 // dst += 32 + bgt 1b + +3: + pop {r4-r11,pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S new file mode 100644 index 0000000000..b06e109dda --- /dev/null +++ b/third_party/dav1d/src/arm/32/msac.S @@ -0,0 +1,575 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define BUF_POS 0 +#define BUF_END 4 +#define DIF 8 +#define RNG 12 +#define CNT 16 +#define ALLOW_UPDATE_CDF 20 + +const coeffs + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 + .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +endconst + +const bits, align=4 + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 +endconst + +.macro vld1_align_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vld1.16 {\q0}, [\src, :128] +.else + vld1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vld1_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src] +.elseif \n == 8 + vld1.16 {\q0}, [\src] +.else + vld1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vst1_align_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vst1.16 {\q0}, [\src, :128] +.else + vst1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vst1_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src] +.elseif \n == 8 + vst1.16 {\q0}, [\src] +.else + vst1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshr.u16 \d0, \s0, \s3 +.else + vshr.u16 \d1, \s1, \s4 +.if \n == 16 + vshr.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vadd.i16 \d0, \s0, \s3 +.else + vadd.i16 \d1, \s1, \s4 +.if \n == 16 + vadd.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vsub.i16 \d0, \s0, \s3 +.else + vsub.i16 \d1, \s1, \s4 +.if \n == 16 + vsub.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vand \d0, \s0, \s3 +.else + vand \d1, \s1, \s4 +.if \n == 16 + vand \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vcge.u16 \d0, \s0, \s3 +.else + vcge.u16 \d1, \s1, \s4 +.if \n == 16 + vcge.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vrhadd.u16 \d0, \s0, \s3 +.else + vrhadd.u16 \d1, \s1, \s4 +.if \n == 16 + vrhadd.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshl.s16 \d0, \s0, \s3 +.else + vshl.s16 \d1, \s1, \s4 +.if \n == 16 + vshl.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n 
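+// Width-dispatched like the helpers above: a single d-register operation for
+// n == 4, one q-register operation for n == 8, and two for n == 16.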
+.if \n == 4 + vqdmulh.s16 \d0, \s0, \s3 +.else + vqdmulh.s16 \d1, \s1, \s4 +.if \n == 16 + vqdmulh.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, +// size_t n_symbols); + +function msac_decode_symbol_adapt4_neon, export=1 +.macro decode_update n + push {r4-r10,lr} + sub sp, sp, #48 + add r8, r0, #RNG + + vld1_align_n d0, q0, q1, r1, \n // cdf + vld1.16 {d16[]}, [r8, :16] // rng + movrel_local r9, coeffs, 30 + vmov.i16 d30, #0x7f00 // 0x7f00 + sub r9, r9, r2, lsl #1 + vmvn.i16 q14, #0x3f // 0xffc0 + add r8, sp, #14 + vand d22, d16, d30 // rng & 0x7f00 + vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng + vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0 +.if \n > 4 + vmov d23, d22 +.endif + + vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret) + vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r8, r0, #DIF + 2 + + vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) +.if \n == 4 + vmov.i16 d17, #0 +.endif + vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + + add r9, sp, #16 + vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16) + movrel_local r8, bits + vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access + + vmov d21, d20 + vld1_align_n q12, q12, q13, r8, \n +.if \n == 16 + vmov q11, q10 +.endif + + vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v + + vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask +.if \n == 16 + vadd.i16 q10, q10, q11 +.endif + vadd.i16 d20, d20, d21 // Aggregate mask bits + ldr r4, [r0, #ALLOW_UPDATE_CDF] + vpadd.i16 d20, d20, d20 + lsl r10, r2, #1 + vpadd.i16 d20, d20, d20 + vmov.u16 r3, d20[0] + cmp r4, #0 + rbit r3, r3 + clz lr, r3 // ret + + beq L(renorm) + // update_cdf + ldrh r3, [r1, r10] // count = cdf[n_symbols] + vmov.i8 q10, #0xff +.if \n == 16 + mov r4, #-5 +.else + mvn r12, r2 + mov r4, #-4 + cmn r12, #3 // set C if n_symbols <= 2 +.endif + vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768 +.if \n == 16 + sub r4, r4, r3, lsr #4 // -((count >> 4) + 5) +.else + lsr r12, r3, #4 // count >> 4 + sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4) +.endif + vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i]) +.if \n == 4 + vdup.16 d20, r4 // -rate +.else + vdup.16 q10, r4 // -rate +.endif + + sub r3, r3, r3, lsr #5 // count - (count == 32) + vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 
1 : 0) + vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate + add r3, r3, #1 // count + (count < 32) + vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate + vst1_align_n d0, q0, q1, r1, \n + strh r3, [r1, r10] +.endm + + decode_update 4 + +L(renorm): + add r8, sp, #16 + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) +L(renorm2): + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 8 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 8 - c + +9: + str r6, [r0, #CNT] + str r7, [r0, #DIF] + + mov r0, lr + add sp, sp, #48 + + pop {r4-r10,pc} +endfunc + +function msac_decode_symbol_adapt8_neon, export=1 + decode_update 8 + b L(renorm) +endfunc + +function msac_decode_symbol_adapt16_neon, export=1 + decode_update 16 + b L(renorm) +endfunc + +function msac_decode_hi_tok_neon, export=1 + push {r4-r10,lr} + vld1.16 {d0}, [r1, :64] // cdf + add r4, r0, #RNG + vmov.i16 d31, #0x7f00 // 0x7f00 + movrel_local r5, coeffs, 30-2*3 + vmvn.i16 d30, #0x3f // 0xffc0 + ldrh r9, [r1, #6] // count = cdf[n_symbols] + vld1.16 {d1[]}, [r4, :16] // rng + movrel_local r4, bits + vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret) + add r5, r0, #DIF + 2 + vld1.16 {q8}, [r4, :128] + mov r2, #-24 + vand d20, d0, d30 // cdf & 0xffc0 + ldr r10, [r0, #ALLOW_UPDATE_CDF] + vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + vmov d3, d2 +1: + vand d23, d1, d31 // rng & 0x7f00 + vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r12, sp, #14 + vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret) + vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + vmov.i16 d7, #0 + vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng + add r12, sp, #16 + vcge.u16 q2, q1, q3 // c >= v + vst1.16 {q3}, [r12] // store v values to allow indexed access + vand q9, q2, q8 // One bit per halfword set in the mask + + vadd.i16 d18, d18, d19 // Aggregate mask bits + vpadd.i16 d18, d18, d18 + vpadd.i16 d18, d18, d18 + vmov.u16 r3, d18[0] + cmp r10, #0 + add r2, r2, #5 + rbit r3, r3 + add r8, sp, #16 + clz lr, r3 // ret + + beq 2f + // update_cdf + vmov.i8 d22, #0xff + mov r4, #-5 + vrhadd.u16 d6, d22, d4 // i >= val ? 
-1 : 32768 + sub r4, r4, r9, lsr #4 // -((count >> 4) + 5) + vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i]) + vdup.16 d18, r4 // -rate + + sub r9, r9, r9, lsr #5 // count - (count == 32) + vsub.i16 d0, d0, d4 // cdf + (i >= val ? 1 : 0) + vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate + add r9, r9, #1 // count + (count < 32) + vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate + vst1.16 {d0}, [r1, :64] + vand d20, d0, d30 // cdf & 0xffc0 + strh r9, [r1, #6] + +2: + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + vdup.16 d1, r4 + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 40 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 40 - c + +9: + lsl lr, lr, #1 + sub lr, lr, #5 + lsr r12, r7, #16 + adds r2, r2, lr // carry = tok_br < 3 || tok == 15 + vdup.16 q1, r12 + bcc 1b // loop if !carry + add r2, r2, #30 + str r6, [r0, #CNT] + add sp, sp, #48 + str r7, [r0, #DIF] + lsr r0, r2, #1 + pop {r4-r10,pc} +endfunc + +function msac_decode_bool_equi_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + bic r4, r5, #0xff // r &= 0xff00 + add r4, r4, #8 + mov r2, #0 + subs r8, r7, r4, lsl #15 // dif - vw + lsr r4, r4, #1 // v + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + bic r1, r1, #0x3f // f &= ~63 + mul r4, r4, r1 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_adapt_neon, export=1 + push {r4-r10,lr} + ldr r9, [r1] // cdf[0-1] + ldr r5, [r0, #RNG] + movw lr, #0xffc0 + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + and r2, r9, lr // f &= ~63 + mul r4, r4, r2 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + ldr r10, [r0, #ALLOW_UPDATE_CDF] + 
itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + cmp r10, #0 + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + + beq L(renorm2) + + lsr r2, r9, #16 // count = cdf[1] + uxth r9, r9 // cdf[0] + + sub r3, r2, r2, lsr #5 // count - (count >= 32) + lsr r2, r2, #4 // count >> 4 + add r10, r3, #1 // count + (count < 32) + add r2, r2, #4 // rate = (count >> 4) | 4 + + sub r9, r9, lr // cdf[0] -= bit + sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769} + asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate + sub r9, r9, r3 // cdf[0] + + strh r9, [r1] + strh r10, [r1, #2] + + b L(renorm2) +endfunc diff --git a/third_party/dav1d/src/arm/32/refmvs.S b/third_party/dav1d/src/arm/32/refmvs.S new file mode 100644 index 0000000000..e16c5448d0 --- /dev/null +++ b/third_party/dav1d/src/arm/32/refmvs.S @@ -0,0 +1,97 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv, +// int bx4, int bw4, int bh4) + +function splat_mv_neon, export=1 + push {r4, lr} + vld1.8 {q3}, [r1] + ldr r4, [sp, #8] + clz r3, r3 + adr lr, L(splat_tbl) + sub r3, r3, #26 + vext.8 q2, q3, q3, #12 + ldr r3, [lr, r3, lsl #2] + add r2, r2, r2, lsl #1 + vext.8 q0, q2, q3, #4 + add r3, lr, r3 + vext.8 q1, q2, q3, #8 + lsl r2, r2, #2 + vext.8 q2, q2, q3, #12 + vmov q3, q0 +1: + ldr r1, [r0], #4 + subs r4, r4, #1 + add r1, r1, r2 + bx r3 + + .align 2 +L(splat_tbl): + .word 320f - L(splat_tbl) + CONFIG_THUMB + .word 160f - L(splat_tbl) + CONFIG_THUMB + .word 80f - L(splat_tbl) + CONFIG_THUMB + .word 40f - L(splat_tbl) + CONFIG_THUMB + .word 20f - L(splat_tbl) + CONFIG_THUMB + .word 10f - L(splat_tbl) + CONFIG_THUMB + +10: + vst1.8 {d0}, [r1] + vstr s2, [r1, #8] + bgt 1b + pop {r4, pc} +20: + vst1.8 {q0}, [r1] + vstr d2, [r1, #16] + bgt 1b + pop {r4, pc} +40: + vst1.8 {q0, q1}, [r1]! + vst1.8 {q2}, [r1] + bgt 1b + pop {r4, pc} +320: + vst1.8 {q0, q1}, [r1]! + vst1.8 {q2, q3}, [r1]! + vst1.8 {q1, q2}, [r1]! + vst1.8 {q0, q1}, [r1]! + vst1.8 {q2, q3}, [r1]! 
+ vst1.8 {q1, q2}, [r1]! +160: + vst1.8 {q0, q1}, [r1]! + vst1.8 {q2, q3}, [r1]! + vst1.8 {q1, q2}, [r1]! +80: + vst1.8 {q0, q1}, [r1]! + vst1.8 {q2, q3}, [r1]! + vst1.8 {q1, q2}, [r1] + bgt 1b + pop {r4, pc} +endfunc diff --git a/third_party/dav1d/src/arm/32/util.S b/third_party/dav1d/src/arm/32/util.S new file mode 100644 index 0000000000..c3710d3767 --- /dev/null +++ b/third_party/dav1d/src/arm/32/util.S @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2015 Martin Storsjo + * Copyright © 2015 Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#ifndef DAV1D_SRC_ARM_32_UTIL_S +#define DAV1D_SRC_ARM_32_UTIL_S + +#include "config.h" +#include "src/arm/asm.S" + +.macro movrel_local rd, val, offset=0 +#if defined(PIC) + ldr \rd, 90001f + b 90002f +90001: + .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB) +90002: + add \rd, \rd, pc +#else + movw \rd, #:lower16:\val+\offset + movt \rd, #:upper16:\val+\offset +#endif +.endm + +.macro movrel rd, val, offset=0 +#if defined(PIC) && defined(__APPLE__) + ldr \rd, 1f + b 2f +1: + .word 3f - (2f + 8 - 4 * CONFIG_THUMB) +2: + ldr \rd, [pc, \rd] +.if \offset < 0 + sub \rd, \rd, #-(\offset) +.elseif \offset > 0 + add \rd, \rd, #\offset +.endif + .non_lazy_symbol_pointer +3: + .indirect_symbol \val + .word 0 + .text +#else + movrel_local \rd, \val, \offset +#endif +.endm + +// This macro clobbers r7 (and r12 on windows) and stores data at the +// bottom of the stack; sp is the start of the space allocated that +// the caller can use. +.macro sub_sp_align space +#if CONFIG_THUMB + mov r7, sp + and r7, r7, #15 +#else + and r7, sp, #15 +#endif + sub sp, sp, r7 + // Now the stack is aligned, store the amount of adjustment back + // on the stack, as we don't want to waste a register as frame + // pointer. + str r7, [sp, #-16]! +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
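+	// (Windows only commits stack pages as the guard page is touched, so the
+	// stack pointer must not be moved past unprobed pages; each 4 KB step has
+	// to be probed in order, as the single-extra-page case below does.)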
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub r7, sp, #4096 + ldr r12, [r7] + sub r7, r7, #(\space - 4096) + mov sp, r7 +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro add_sp_align space +.if \space >= 4096 + add sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + add sp, sp, #(\space)%4096 +.endif + ldr r7, [sp], #16 + // Add back the original stack adjustment + add sp, sp, r7 +.endm + +.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \q0, \q2 + vtrn.32 \q1, \q3 + + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 +.endm + +.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7 + vswp \d0, \d4 + vswp \d1, \d5 + vswp \d2, \d6 + vswp \d3, \d7 + + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm + +.macro transpose_4x8b q0, q1, r0, r1, r2, r3 + vtrn.16 \q0, \q1 + + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 +.endm + +.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r1, \r4 // vtrn.64 \q0, \q2 + vswp \r3, \r6 // vtrn.64 \q1, \q3 + + vtrn.32 \q0, \q1 + vtrn.32 \q2, \q3 +.endm + +.macro transpose_4x4h q0, q1, r0, r1, r2, r3 + vtrn.32 \q0, \q1 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +.macro transpose_4x8h r0, r1, r2, r3 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +#endif /* DAV1D_SRC_ARM_32_UTIL_S */ diff --git a/third_party/dav1d/src/arm/64/cdef.S b/third_party/dav1d/src/arm/64/cdef.S new file mode 100644 index 0000000000..32b258aba8 --- /dev/null +++ b/third_party/dav1d/src/arm/64/cdef.S @@ -0,0 +1,520 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret + tst w7, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + sub \s1, \s1, #2 + sub \s2, \s2, #2 + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr s1, [\s1, #\w] + ldr \rn\()2, [\s2] + ldr s3, [\s2, #\w] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + str \rw\()0, [x0] + str d1, [x0, #2*\w] + add x0, x0, #2*\stride + str \rw\()2, [x0] + str d3, [x0, #2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr h1, [\s1, #\w] + ldr \rn\()2, [\s2] + ldr h3, [\s2, #\w] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + str \rw\()0, [x0] + str s1, [x0, #2*\w] + str s31, [x0, #2*\w+4] + add x0, x0, #2*\stride + str \rw\()2, [x0] + str s3, [x0, #2*\w] + str s31, [x0, #2*\w+4] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr h1, [\s1, #\w] + ldr \rn\()2, [\s2] + ldr h3, [\s2, #\w] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \rw\()2, [x0, #4] + str s3, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \rn\()0, [\s1] + ldr \rn\()1, [\s2] + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \rw\()1, [x0, #4] + str s31, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride +.endif +3: +.endm + +.macro load_n_incr dst, src, incr, w +.if \w == 4 + ld1 {\dst\().s}[0], [\src], \incr +.else + ld1 {\dst\().8b}, [\src], \incr +.endif +.endm + +// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, +// const pixel *const bottom, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func w, stride, rn, rw +function cdef_padding\w\()_8bpc_neon, export=1 + cmp w7, #0xf // fully edged + b.eq cdef_padding\w\()_edged_8bpc_neon + movi v30.8h, #0x80, lsl #8 + mov v31.16b, v30.16b + sub x0, x0, #2*(2*\stride+2) + tst w7, #4 // CDEF_HAVE_TOP + b.ne 1f + // !CDEF_HAVE_TOP + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + b 3f +1: + // CDEF_HAVE_TOP + add x9, x4, x2 + pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0 + + // Middle section +3: + tst w7, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ld1 {v0.h}[0], [x3], #2 + ldr h2, [x1, #\w] + load_n_incr v1, x1, x2, \w + subs w6, w6, #1 + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + str s0, [x0] + stur \rw\()1, [x0, #4] + str s2, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ld1 {v0.h}[0], [x3], #2 + load_n_incr v1, x1, x2, \w + subs w6, w6, #1 + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + str s0, [x0] + stur \rw\()1, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + b 3f +2: + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ldr h1, 
[x1, #\w] + load_n_incr v0, x1, x2, \w + subs w6, w6, #1 + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + load_n_incr v0, x1, x2, \w + subs w6, w6, #1 + uxtl v0.8h, v0.8b + str s31, [x0] + stur \rw\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + +3: + tst w7, #8 // CDEF_HAVE_BOTTOM + b.ne 1f + // !CDEF_HAVE_BOTTOM + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + ret +1: + // CDEF_HAVE_BOTTOM + add x9, x5, x2 + pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1 +endfunc +.endm + +padding_func 8, 16, d, q +padding_func 4, 8, s, d + +// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, +// const pixel *const bottom, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func_edged w, stride, reg +function cdef_padding\w\()_edged_8bpc_neon, export=1 + sub x4, x4, #2 + sub x5, x5, #2 + sub x0, x0, #(2*\stride+2) + +.if \w == 4 + ldr d0, [x4] + ldr d1, [x4, x2] + st1 {v0.8b, v1.8b}, [x0], #16 +.else + add x9, x4, x2 + ldr d0, [x4] + ldr s1, [x4, #8] + ldr d2, [x9] + ldr s3, [x9, #8] + str d0, [x0] + str s1, [x0, #8] + str d2, [x0, #\stride] + str s3, [x0, #\stride+8] + add x0, x0, #2*\stride +.endif + +0: + ld1 {v0.h}[0], [x3], #2 + ldr h2, [x1, #\w] + load_n_incr v1, x1, x2, \w + subs w6, w6, #1 + str h0, [x0] + stur \reg\()1, [x0, #2] + str h2, [x0, #2+\w] + add x0, x0, #\stride + b.gt 0b + +.if \w == 4 + ldr d0, [x5] + ldr d1, [x5, x2] + st1 {v0.8b, v1.8b}, [x0], #16 +.else + add x9, x5, x2 + ldr d0, [x5] + ldr s1, [x5, #8] + ldr d2, [x9] + ldr s3, [x9, #8] + str d0, [x0] + str s1, [x0, #8] + str d2, [x0, #\stride] + str s3, [x0, #\stride+8] +.endif + ret +endfunc +.endm + +padding_func_edged 8, 16, d +padding_func_edged 4, 8, s + +tables + +filter 8, 8 +filter 4, 8 + +find_dir 8 + +.macro load_px_8 d1, d2, w +.if \w == 8 + add x6, x2, w9, sxtb // x + off + sub x9, x2, w9, sxtb // x - off + ld1 {\d1\().d}[0], [x6] // p0 + add x6, x6, #16 // += stride + ld1 {\d2\().d}[0], [x9] // p1 + add x9, x9, #16 // += stride + ld1 {\d1\().d}[1], [x6] // p0 + ld1 {\d2\().d}[1], [x9] // p0 +.else + add x6, x2, w9, sxtb // x + off + sub x9, x2, w9, sxtb // x - off + ld1 {\d1\().s}[0], [x6] // p0 + add x6, x6, #8 // += stride + ld1 {\d2\().s}[0], [x9] // p1 + add x9, x9, #8 // += stride + ld1 {\d1\().s}[1], [x6] // p0 + add x6, x6, #8 // += stride + ld1 {\d2\().s}[1], [x9] // p1 + add x9, x9, #8 // += stride + ld1 {\d1\().s}[2], [x6] // p0 + add x6, x6, #8 // += stride + ld1 {\d2\().s}[2], [x9] // p1 + add x9, x9, #8 // += stride + ld1 {\d1\().s}[3], [x6] // p0 + ld1 {\d2\().s}[3], [x9] // p1 +.endif +.endm +.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min +.if \min + umin v3.16b, v3.16b, \s1\().16b + umax v4.16b, v4.16b, \s1\().16b + umin v3.16b, v3.16b, \s2\().16b + umax v4.16b, v4.16b, \s2\().16b +.endif + uabd v16.16b, v0.16b, \s1\().16b // abs(diff) + uabd v20.16b, v0.16b, \s2\().16b // abs(diff) + ushl v17.16b, v16.16b, \shift // abs(diff) >> shift + ushl v21.16b, v20.16b, \shift // abs(diff) >> shift + uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift)) + uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift)) + cmhi v18.16b, v0.16b, \s1\().16b // px > p0 + cmhi v22.16b, v0.16b, \s2\().16b // px > p1 + umin v17.16b, v17.16b, 
v16.16b // imin(abs(diff), clip) + umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip) + dup v19.16b, \tap // taps[k] + neg v16.16b, v17.16b // -imin() + neg v20.16b, v21.16b // -imin() + bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() + bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() + mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() + mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() +.endm + +// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint8_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h); +.macro filter_func_8 w, pri, sec, min, suffix +function cdef_filter\w\suffix\()_edged_8bpc_neon +.if \pri + movrel x8, pri_taps + and w9, w3, #1 + add x8, x8, w9, uxtw #1 +.endif + movrel x9, directions\w + add x5, x9, w5, uxtw #1 + movi v30.8b, #7 + dup v28.8b, w6 // damping + +.if \pri + dup v25.16b, w3 // threshold +.endif +.if \sec + dup v27.16b, w4 // threshold +.endif + trn1 v24.8b, v25.8b, v27.8b + clz v24.8b, v24.8b // clz(threshold) + sub v24.8b, v30.8b, v24.8b // ulog2(threshold) + uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold)) + neg v24.8b, v24.8b // -shift +.if \sec + dup v26.16b, v24.b[1] +.endif +.if \pri + dup v24.16b, v24.b[0] +.endif + +1: +.if \w == 8 + add x12, x2, #16 + ld1 {v0.d}[0], [x2] // px + ld1 {v0.d}[1], [x12] // px +.else + add x12, x2, #1*8 + add x13, x2, #2*8 + add x14, x2, #3*8 + ld1 {v0.s}[0], [x2] // px + ld1 {v0.s}[1], [x12] // px + ld1 {v0.s}[2], [x13] // px + ld1 {v0.s}[3], [x14] // px +.endif + + // We need 9-bits or two 8-bit accululators to fit the sum. + // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228. + // Start sum at -1 instead of 0 to help handle rounding later. + movi v1.16b, #255 // sum + movi v2.16b, #0 // sum +.if \min + mov v3.16b, v0.16b // min + mov v4.16b, v0.16b // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov w11, #2 // sec_taps[0] + +2: +.if \pri + ldrb w9, [x5] // off1 + + load_px_8 v5, v6, \w +.endif + +.if \sec + add x5, x5, #4 // +2*2 + ldrb w9, [x5] // off2 + load_px_8 v28, v29, \w +.endif + +.if \pri + ldrb w10, [x8] // *pri_taps + + handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min +.endif + +.if \sec + add x5, x5, #8 // +2*4 + ldrb w9, [x5] // off3 + load_px_8 v5, v6, \w + + handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min + + handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min + + sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; +.else + add x5, x5, #1 // x5 += 1 +.endif + subs w11, w11, #1 // sec_tap-- (value) +.if \pri + add x8, x8, #1 // pri_taps++ (pointer) +.endif + b.ne 2b + + // Perform halving adds since the value won't fit otherwise. + // To handle the offset for negative values, use both halving w/ and w/o rounding. + srhadd v5.16b, v1.16b, v2.16b // sum >> 1 + shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 + cmlt v1.16b, v5.16b, #0 // sum < 0 + bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 + + srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 + + usqadd v0.16b, v1.16b // px + (8 + sum ...) 
>> 4 +.if \min + umin v0.16b, v0.16b, v4.16b + umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) +.endif +.if \w == 8 + st1 {v0.d}[0], [x0], x1 + add x2, x2, #2*16 // tmp += 2*tmp_stride + subs w7, w7, #2 // h -= 2 + st1 {v0.d}[1], [x0], x1 +.else + st1 {v0.s}[0], [x0], x1 + add x2, x2, #4*8 // tmp += 4*tmp_stride + st1 {v0.s}[1], [x0], x1 + subs w7, w7, #4 // h -= 4 + st1 {v0.s}[2], [x0], x1 + st1 {v0.s}[3], [x0], x1 +.endif + + // Reset pri_taps and directions back to the original point + sub x5, x5, #2 +.if \pri + sub x8, x8, #2 +.endif + + b.gt 1b + ret +endfunc +.endm + +.macro filter_8 w +filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri +filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec +filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec +.endm + +filter_8 8 +filter_8 4 diff --git a/third_party/dav1d/src/arm/64/cdef16.S b/third_party/dav1d/src/arm/64/cdef16.S new file mode 100644 index 0000000000..ecf864a26d --- /dev/null +++ b/third_party/dav1d/src/arm/64/cdef16.S @@ -0,0 +1,229 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +.macro pad_top_bot_16 s1, s2, w, stride, reg, ret + tst w7, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + sub \s1, \s1, #4 + sub \s2, \s2, #4 + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr d1, [\s1, #2*\w] + ldr \reg\()2, [\s2] + ldr d3, [\s2, #2*\w] + str \reg\()0, [x0] + str d1, [x0, #2*\w] + add x0, x0, #2*\stride + str \reg\()2, [x0] + str d3, [x0, #2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr s1, [\s1, #2*\w] + ldr \reg\()2, [\s2] + ldr s3, [\s2, #2*\w] + str \reg\()0, [x0] + str s1, [x0, #2*\w] + str s31, [x0, #2*\w+4] + add x0, x0, #2*\stride + str \reg\()2, [x0] + str s3, [x0, #2*\w] + str s31, [x0, #2*\w+4] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr s1, [\s1, #2*\w] + ldr \reg\()2, [\s2] + ldr s3, [\s2, #2*\w] + str s31, [x0] + stur \reg\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \reg\()2, [x0, #4] + str s3, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ldr \reg\()0, [\s1] + ldr \reg\()1, [\s2] + str s31, [x0] + stur \reg\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + str s31, [x0] + stur \reg\()1, [x0, #4] + str s31, [x0, #4+2*\w] +.if \ret + ret +.else + add x0, x0, #2*\stride +.endif +3: +.endm + +.macro load_n_incr_16 dst, src, incr, w +.if \w == 4 + ld1 {\dst\().4h}, [\src], \incr +.else + ld1 {\dst\().8h}, [\src], \incr +.endif +.endm + +// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, +// const pixel *const bottom, int h, +// enum CdefEdgeFlags edges); + +.macro padding_func_16 w, stride, reg +function cdef_padding\w\()_16bpc_neon, export=1 + movi v30.8h, #0x80, lsl #8 + mov v31.16b, v30.16b + sub x0, x0, #2*(2*\stride+2) + tst w7, #4 // CDEF_HAVE_TOP + b.ne 1f + // !CDEF_HAVE_TOP + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + b 3f +1: + // CDEF_HAVE_TOP + add x9, x4, x2 + pad_top_bot_16 x4, x9, \w, \stride, \reg, 0 + + // Middle section +3: + tst w7, #1 // CDEF_HAVE_LEFT + b.eq 2f + // CDEF_HAVE_LEFT + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ld1 {v0.s}[0], [x3], #4 + ldr s2, [x1, #2*\w] + load_n_incr_16 v1, x1, x2, \w + subs w6, w6, #1 + str s0, [x0] + stur \reg\()1, [x0, #4] + str s2, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + ld1 {v0.s}[0], [x3], #4 + load_n_incr_16 v1, x1, x2, \w + subs w6, w6, #1 + str s0, [x0] + stur \reg\()1, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + b 3f +2: + tst w7, #2 // CDEF_HAVE_RIGHT + b.eq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + ldr s1, [x1, #2*\w] + load_n_incr_16 v0, x1, x2, \w + subs w6, w6, #1 + str s31, [x0] + stur \reg\()0, [x0, #4] + str s1, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + load_n_incr_16 v0, x1, x2, \w + subs w6, w6, #1 + str s31, [x0] + stur \reg\()0, [x0, #4] + str s31, [x0, #4+2*\w] + add x0, x0, #2*\stride + b.gt 1b + +3: + tst w7, #8 // CDEF_HAVE_BOTTOM + b.ne 1f + // 
!CDEF_HAVE_BOTTOM + st1 {v30.8h, v31.8h}, [x0], #32 +.if \w == 8 + st1 {v30.8h, v31.8h}, [x0], #32 +.endif + ret +1: + // CDEF_HAVE_BOTTOM + add x9, x5, x2 + pad_top_bot_16 x5, x9, \w, \stride, \reg, 1 +endfunc +.endm + +padding_func_16 8, 16, q +padding_func_16 4, 8, d + +tables + +filter 8, 16 +filter 4, 16 + +find_dir 16 diff --git a/third_party/dav1d/src/arm/64/cdef_tmpl.S b/third_party/dav1d/src/arm/64/cdef_tmpl.S new file mode 100644 index 0000000000..d35d7a09ba --- /dev/null +++ b/third_party/dav1d/src/arm/64/cdef_tmpl.S @@ -0,0 +1,511 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro dir_table w, stride +const directions\w + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 + .byte 1 * \stride + 0, 2 * \stride + 0 + .byte 1 * \stride + 0, 2 * \stride - 1 +// Repeated, to avoid & 7 + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 +endconst +.endm + +.macro tables +dir_table 8, 16 +dir_table 4, 8 + +const pri_taps + .byte 4, 2, 3, 3 +endconst +.endm + +.macro load_px d1, d2, w +.if \w == 8 + add x6, x2, w9, sxtb #1 // x + off + sub x9, x2, w9, sxtb #1 // x - off + ld1 {\d1\().8h}, [x6] // p0 + ld1 {\d2\().8h}, [x9] // p1 +.else + add x6, x2, w9, sxtb #1 // x + off + sub x9, x2, w9, sxtb #1 // x - off + ld1 {\d1\().4h}, [x6] // p0 + add x6, x6, #2*8 // += stride + ld1 {\d2\().4h}, [x9] // p1 + add x9, x9, #2*8 // += stride + ld1 {\d1\().d}[1], [x6] // p0 + ld1 {\d2\().d}[1], [x9] // p1 +.endif +.endm +.macro handle_pixel s1, s2, thresh_vec, shift, tap, min +.if \min + umin v2.8h, v2.8h, \s1\().8h + smax v3.8h, v3.8h, \s1\().8h + umin v2.8h, v2.8h, \s2\().8h + smax v3.8h, v3.8h, \s2\().8h +.endif + uabd v16.8h, v0.8h, \s1\().8h // abs(diff) + uabd v20.8h, v0.8h, \s2\().8h // abs(diff) + ushl v17.8h, v16.8h, \shift // abs(diff) >> shift + ushl v21.8h, v20.8h, \shift // abs(diff) >> shift + uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift)) + uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift)) + sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px + sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px + neg v16.8h, v17.8h // -clip + neg v20.8h, v21.8h // -clip + smin v18.8h, v18.8h, v17.8h // imin(diff, clip) + smin v22.8h, v22.8h, v21.8h // imin(diff, clip) + dup v19.8h, \tap // taps[k] + smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip) + smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip) + mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain() + mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain() +.endm + +// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func w, bpc, pri, sec, min, suffix +function cdef_filter\w\suffix\()_\bpc\()bpc_neon +.if \bpc == 8 + ldr w8, [sp] // edges + cmp w8, #0xf + b.eq cdef_filter\w\suffix\()_edged_8bpc_neon +.endif +.if \pri +.if \bpc == 16 + ldr w9, [sp, #8] // bitdepth_max + clz w9, w9 + sub w9, w9, #24 // -bitdepth_min_8 + neg w9, w9 // bitdepth_min_8 +.endif + movrel x8, pri_taps +.if \bpc == 16 + lsr w9, w3, w9 // pri_strength >> bitdepth_min_8 + and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1 +.else + and w9, w3, #1 +.endif + add x8, x8, w9, uxtw #1 +.endif + movrel x9, directions\w + add x5, x9, w5, uxtw #1 + movi v30.4h, #15 + dup v28.4h, w6 // damping + +.if \pri + dup v25.8h, w3 // threshold +.endif +.if \sec + dup v27.8h, w4 // threshold +.endif + trn1 v24.4h, v25.4h, v27.4h + clz v24.4h, v24.4h // clz(threshold) + sub v24.4h, v30.4h, v24.4h // ulog2(threshold) + uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, 
damping - ulog2(threshold)) + neg v24.4h, v24.4h // -shift +.if \sec + dup v26.8h, v24.h[1] +.endif +.if \pri + dup v24.8h, v24.h[0] +.endif + +1: +.if \w == 8 + ld1 {v0.8h}, [x2] // px +.else + add x12, x2, #2*8 + ld1 {v0.4h}, [x2] // px + ld1 {v0.d}[1], [x12] // px +.endif + + movi v1.8h, #0 // sum +.if \min + mov v2.16b, v0.16b // min + mov v3.16b, v0.16b // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov w11, #2 // sec_taps[0] + +2: +.if \pri + ldrb w9, [x5] // off1 + + load_px v4, v5, \w +.endif + +.if \sec + add x5, x5, #4 // +2*2 + ldrb w9, [x5] // off2 + load_px v6, v7, \w +.endif + +.if \pri + ldrb w10, [x8] // *pri_taps + + handle_pixel v4, v5, v25.8h, v24.8h, w10, \min +.endif + +.if \sec + add x5, x5, #8 // +2*4 + ldrb w9, [x5] // off3 + load_px v4, v5, \w + + handle_pixel v6, v7, v27.8h, v26.8h, w11, \min + + handle_pixel v4, v5, v27.8h, v26.8h, w11, \min + + sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; +.else + add x5, x5, #1 // x5 += 1 +.endif + subs w11, w11, #1 // sec_tap-- (value) +.if \pri + add x8, x8, #1 // pri_taps++ (pointer) +.endif + b.ne 2b + + cmlt v4.8h, v1.8h, #0 // -(sum < 0) + add v1.8h, v1.8h, v4.8h // sum - (sum < 0) + srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 + add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4 +.if \min + smin v0.8h, v0.8h, v3.8h + smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max) +.endif +.if \bpc == 8 + xtn v0.8b, v0.8h +.endif +.if \w == 8 + add x2, x2, #2*16 // tmp += tmp_stride + subs w7, w7, #1 // h-- +.if \bpc == 8 + st1 {v0.8b}, [x0], x1 +.else + st1 {v0.8h}, [x0], x1 +.endif +.else +.if \bpc == 8 + st1 {v0.s}[0], [x0], x1 +.else + st1 {v0.d}[0], [x0], x1 +.endif + add x2, x2, #2*16 // tmp += 2*tmp_stride + subs w7, w7, #2 // h -= 2 +.if \bpc == 8 + st1 {v0.s}[1], [x0], x1 +.else + st1 {v0.d}[1], [x0], x1 +.endif +.endif + + // Reset pri_taps and directions back to the original point + sub x5, x5, #2 +.if \pri + sub x8, x8, #2 +.endif + + b.gt 1b + ret +endfunc +.endm + +.macro filter w, bpc +filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri +filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec +filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec + +function cdef_filter\w\()_\bpc\()bpc_neon, export=1 + cbnz w3, 1f // pri_strength + b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec +1: + cbnz w4, 1f // sec_strength + b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri +1: + b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec +endfunc +.endm + +const div_table + .short 840, 420, 280, 210, 168, 140, 120, 105 +endconst + +const alt_fact + .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 +endconst + +.macro cost_alt d1, d2, s1, s2, s3, s4 + smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n] + smull2 v23.4s, \s1\().8h, \s1\().8h + smull v24.4s, \s2\().4h, \s2\().4h + smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n] + smull2 v26.4s, \s3\().8h, \s3\().8h + smull v27.4s, \s4\().4h, \s4\().4h + mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact + mla v22.4s, v23.4s, v30.4s + mla v22.4s, v24.4s, v31.4s + mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact + mla v25.4s, v26.4s, v30.4s + mla v25.4s, v27.4s, v31.4s + addv \d1, v22.4s // *cost_ptr + addv \d2, v25.4s // *cost_ptr +.endm + +.macro find_best s1, s2, s3 +.ifnb \s2 + mov w5, \s2\().s[0] +.endif + cmp w4, w1 // cost[n] > best_cost + csel w0, w3, w0, gt // best_dir = n + csel w1, w4, w1, gt // 
best_cost = cost[n] +.ifnb \s2 + add w3, w3, #1 // n++ + cmp w5, w1 // cost[n] > best_cost + mov w4, \s3\().s[0] + csel w0, w3, w0, gt // best_dir = n + csel w1, w5, w1, gt // best_cost = cost[n] + add w3, w3, #1 // n++ +.endif +.endm + +// Steps for loading and preparing each row +.macro dir_load_step1 s1, bpc +.if \bpc == 8 + ld1 {\s1\().8b}, [x0], x1 +.else + ld1 {\s1\().8h}, [x0], x1 +.endif +.endm + +.macro dir_load_step2 s1, bpc +.if \bpc == 8 + usubl \s1\().8h, \s1\().8b, v31.8b +.else + ushl \s1\().8h, \s1\().8h, v8.8h +.endif +.endm + +.macro dir_load_step3 s1, bpc +// Nothing for \bpc == 8 +.if \bpc != 8 + sub \s1\().8h, \s1\().8h, v31.8h +.endif +.endm + +// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, +// unsigned *const var) +.macro find_dir bpc +function cdef_find_dir_\bpc\()bpc_neon, export=1 +.if \bpc == 16 + str d8, [sp, #-0x10]! + clz w3, w3 // clz(bitdepth_max) + sub w3, w3, #24 // -bitdepth_min_8 + dup v8.8h, w3 +.endif + sub sp, sp, #32 // cost + mov w3, #8 +.if \bpc == 8 + movi v31.16b, #128 +.else + movi v31.8h, #128 +.endif + movi v30.16b, #0 + movi v1.8h, #0 // v0-v1 sum_diag[0] + movi v3.8h, #0 // v2-v3 sum_diag[1] + movi v5.8h, #0 // v4-v5 sum_hv[0-1] + movi v7.8h, #0 // v6-v7 sum_alt[0] + dir_load_step1 v26, \bpc // Setup first row early + movi v17.8h, #0 // v16-v17 sum_alt[1] + movi v18.8h, #0 // v18-v19 sum_alt[2] + dir_load_step2 v26, \bpc + movi v19.8h, #0 + dir_load_step3 v26, \bpc + movi v21.8h, #0 // v20-v21 sum_alt[3] + +.irpc i, 01234567 + addv h25, v26.8h // [y] + rev64 v27.8h, v26.8h + addp v28.8h, v26.8h, v30.8h // [(x >> 1)] + add v5.8h, v5.8h, v26.8h // sum_hv[1] + ext v27.16b, v27.16b, v27.16b, #8 // [-x] + rev64 v29.4h, v28.4h // [-(x >> 1)] + ins v4.h[\i], v25.h[0] // sum_hv[0] +.if \i < 6 + ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) + ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) + add v18.8h, v18.8h, v22.8h // sum_alt[2] + add v19.4h, v19.4h, v23.4h // sum_alt[2] +.else + add v18.8h, v18.8h, v26.8h // sum_alt[2] +.endif +.if \i == 0 + mov v20.16b, v26.16b // sum_alt[3] +.elseif \i == 1 + add v20.8h, v20.8h, v26.8h // sum_alt[3] +.else + ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) + ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) + add v20.8h, v20.8h, v24.8h // sum_alt[3] + add v21.4h, v21.4h, v25.4h // sum_alt[3] +.endif +.if \i == 0 + mov v0.16b, v26.16b // sum_diag[0] + dir_load_step1 v26, \bpc + mov v2.16b, v27.16b // sum_diag[1] + dir_load_step2 v26, \bpc + mov v6.16b, v28.16b // sum_alt[0] + dir_load_step3 v26, \bpc + mov v16.16b, v29.16b // sum_alt[1] +.else + ext v22.16b, v30.16b, v26.16b, #(16-2*\i) + ext v23.16b, v26.16b, v30.16b, #(16-2*\i) + ext v24.16b, v30.16b, v27.16b, #(16-2*\i) + ext v25.16b, v27.16b, v30.16b, #(16-2*\i) +.if \i != 7 // Nothing to load for the final row + dir_load_step1 v26, \bpc // Start setting up the next row early. 
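+	// (Steps 2 and 3 of that early load are interleaved between the adds below.)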
+.endif + add v0.8h, v0.8h, v22.8h // sum_diag[0] + add v1.8h, v1.8h, v23.8h // sum_diag[0] + add v2.8h, v2.8h, v24.8h // sum_diag[1] + add v3.8h, v3.8h, v25.8h // sum_diag[1] +.if \i != 7 + dir_load_step2 v26, \bpc +.endif + ext v22.16b, v30.16b, v28.16b, #(16-2*\i) + ext v23.16b, v28.16b, v30.16b, #(16-2*\i) + ext v24.16b, v30.16b, v29.16b, #(16-2*\i) + ext v25.16b, v29.16b, v30.16b, #(16-2*\i) +.if \i != 7 + dir_load_step3 v26, \bpc +.endif + add v6.8h, v6.8h, v22.8h // sum_alt[0] + add v7.4h, v7.4h, v23.4h // sum_alt[0] + add v16.8h, v16.8h, v24.8h // sum_alt[1] + add v17.4h, v17.4h, v25.4h // sum_alt[1] +.endif +.endr + + movi v31.4s, #105 + + smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0] + smlal2 v26.4s, v4.8h, v4.8h + smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1] + smlal2 v27.4s, v5.8h, v5.8h + mul v26.4s, v26.4s, v31.4s // cost[2] *= 105 + mul v27.4s, v27.4s, v31.4s // cost[6] *= 105 + addv s4, v26.4s // cost[2] + addv s5, v27.4s // cost[6] + + rev64 v1.8h, v1.8h + rev64 v3.8h, v3.8h + ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n] + ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n] + + str s4, [sp, #2*4] // cost[2] + str s5, [sp, #6*4] // cost[6] + + movrel x4, div_table + ld1 {v31.8h}, [x4] + + smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0] + smull2 v23.4s, v0.8h, v0.8h + smlal v22.4s, v1.4h, v1.4h + smlal2 v23.4s, v1.8h, v1.8h + smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1] + smull2 v25.4s, v2.8h, v2.8h + smlal v24.4s, v3.4h, v3.4h + smlal2 v25.4s, v3.8h, v3.8h + uxtl v30.4s, v31.4h // div_table + uxtl2 v31.4s, v31.8h + mul v22.4s, v22.4s, v30.4s // cost[0] + mla v22.4s, v23.4s, v31.4s // cost[0] + mul v24.4s, v24.4s, v30.4s // cost[4] + mla v24.4s, v25.4s, v31.4s // cost[4] + addv s0, v22.4s // cost[0] + addv s2, v24.4s // cost[4] + + movrel x5, alt_fact + ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105 + + str s0, [sp, #0*4] // cost[0] + str s2, [sp, #4*4] // cost[4] + + uxtl v29.4s, v29.4h // div_table[2*m+1] + 105 + uxtl v30.4s, v30.4h + uxtl v31.4s, v31.4h + + cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3] + cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7] + str s6, [sp, #1*4] // cost[1] + str s16, [sp, #3*4] // cost[3] + + mov w0, #0 // best_dir + mov w1, v0.s[0] // best_cost + mov w3, #1 // n + + str s18, [sp, #5*4] // cost[5] + str s20, [sp, #7*4] // cost[7] + + mov w4, v6.s[0] + + find_best v6, v4, v16 + find_best v16, v2, v18 + find_best v18, v5, v20 + find_best v20 + + eor w3, w0, #4 // best_dir ^4 + ldr w4, [sp, w3, uxtw #2] + sub w1, w1, w4 // best_cost - cost[best_dir ^ 4] + lsr w1, w1, #10 + str w1, [x2] // *var + + add sp, sp, #32 +.if \bpc == 16 + ldr d8, [sp], 0x10 +.endif + ret +endfunc +.endm diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S new file mode 100644 index 0000000000..6cdd7ec5fa --- /dev/null +++ b/third_party/dav1d/src/arm/64/filmgrain.S @@ -0,0 +1,2010 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r0\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r0\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r1\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r1\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r2\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r2\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r3\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r3\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r4\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r4\().16b, \r5\().8h + increment_seed 2 + 
read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {\r5\().h}[0], [x14] + ld1 {\r5\().h}[1], [x15] + srshl v0.4h, \r5\().4h, v31.4h + xtn \r5\().8b, v0.8h +.endm + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b,\r3\().16b}, [x0], #32 + st1 {\r4\().16b}, [x0], #16 + st1 {\r5\().h}[0], [x0], #2 +.endm + +.macro get_grain_row_44 r0, r1, r2 + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r0\().8b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn2 \r0\().16b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r1\().8b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn2 \r1\().16b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r2\().8b, \r2\().8h + + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn2 \r2\().16b, v0.8h +.endm + +.macro store_grain_row_44 r0, r1, r2 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b}, [x0] + add x0, x0, #GRAIN_WIDTH-32 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn v0.8b, v0.8h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #1 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.b[15], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + smull v2.8h, v3.8b, v28.8b + smull2 v3.8h, v3.16b, v28.16b + smull v4.8h, v0.8b, v27.8b + smull2 v5.8h, v0.16b, v27.16b + smull v6.8h, v1.8b, v29.8b + smull2 v7.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v4.4h + saddl2 v1.4s, v2.8h, v4.8h + saddl v2.4s, v3.4h, v5.4h + saddl2 v3.4s, v3.8h, v5.8h + saddw v4.4s, v0.4s, v6.4h + saddw2 v5.4s, v1.4s, v6.8h + saddw v6.4s, v2.4s, v7.4h + saddw2 v7.4s, v3.4s, v7.8h + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff + bl 
sum_\lag\()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH + ld1 {v22.16b, v23.16b}, [x19], #32 + ld1 {v24.16b, v25.16b}, [x12] + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + saddlp v24.8h, v24.16b + saddlp v25.8h, v25.16b + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + rshrn v0.8b, v22.8h, #2 + rshrn2 v0.16b, v23.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.16b, v23.16b}, [x19], #32 + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + rshrn v0.8b, v22.8h, #1 + rshrn2 v0.16b, v23.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.16b}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.16b, \uv_coeff + smull v2.8h, v0.8b, v1.8b + smull2 v3.8h, v0.16b, v1.16b +.else + smull v2.8h, v0.8b, v30.8b + smull2 v3.8h, v0.16b, v30.16b +.endif + saddw v4.4s, v4.4s, v2.4h + saddw2 v5.4s, v5.4s, v2.8h + saddw v6.4s, v6.4s, v3.4h + saddw2 v7.4s, v7.4s, v3.8h +.endif +.if \uv_layout && \elems == 16 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + xtn2 v0.16b, v0.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.b[13] +.endif +.ifnc \lag, lag1 + smov w16, v0.b[14] +.endif + smov w14, v0.b[15] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b + mov w15, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + mov v1.16b, v6.16b +.if \elems == 9 + mov w15, #1 + bl output_\lag\()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + xtn v1.8b, v1.8h + ext v0.16b, v0.16b, v1.16b, #7 +.else + mov w15, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + mov v1.16b, v7.16b + +.ifc \edge, right + mov w15, #3 + bl output_\lag\()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #1 +.else + mov w15, #4 + bl output_\lag\()_neon +.endif +.endif +.if \store + st1 {v0.16b}, [x0], #16 +.endif + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag1_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
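+	// sum_lag_n_body either branches to a shared variant or restores x30 and
+	// returns itself, so nothing follows it in this function.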
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + mov v3.16b, \mid\().16b + ext v0.16b, \left\().16b, \mid\().16b, #15 + ext v1.16b, \mid\().16b, \right\().16b, #1 + bl sum_\type\()_lag1_\edge\()_neon + mov \dst\().16b, v0.16b +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v18.16b}, [x12] // load top right + ld1 {v21.16b}, [x13] + + ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid + dup v26.16b, v30.b[0] + ext v23.16b, v16.16b, v17.16b, #15 + dup v27.16b, v30.b[1] + ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v28.16b, v30.b[3] + ext v1.16b, v17.16b, v18.16b, #2 + dup v29.16b, v30.b[4] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v4.8h, v23.8b, v27.8b + smull2 v5.8h, v23.16b, v27.16b + smull v6.8h, v0.8b, v28.8b + smull2 v7.8h, v0.16b, v28.16b + smull v0.8h, v1.8b, v29.8b + smull2 v1.8h, v1.16b, v29.16b + saddl v22.4s, v2.4h, v4.4h + saddl2 v23.4s, v2.8h, v4.8h + saddl v26.4s, v3.4h, v5.4h + saddl2 v27.4s, v3.8h, v5.8h + saddl v2.4s, v0.4h, v6.4h + saddl2 v3.4s, v0.8h, v6.8h + saddl v6.4s, v1.4h, v7.4h + saddl2 v7.4s, v1.8h, v7.8h + add v4.4s, v22.4s, v2.4s + add v5.4s, v23.4s, v3.4s + add v6.4s, v26.4s, v6.4s + add v7.4s, v27.4s, v7.4s + + ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid + dup v26.16b, v30.b[5] + ext v23.16b, v19.16b, v20.16b, #15 + dup v27.16b, v30.b[6] + ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v28.16b, v30.b[8] + ext v1.16b, v20.16b, v21.16b, #2 + dup v29.16b, v30.b[9] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v22.8h, v23.8b, v27.8b + smull2 v23.8h, v23.16b, v27.16b + smull v26.8h, v0.8b, v28.8b + smull2 v27.8h, v0.16b, v28.16b + smull v28.8h, v1.8b, v29.8b + smull2 v29.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v22.4h + saddl2 v1.4s, v2.8h, v22.8h + saddl v2.4s, v3.4h, v23.4h + saddl2 v3.4s, v3.8h, v23.8h + saddl v22.4s, v26.4h, v28.4h + saddl2 v23.4s, v26.8h, v28.8h + saddl v26.4s, v27.4h, v29.4h + saddl2 v27.4s, v27.8h, v29.8h + add v0.4s, v0.4s, v22.4s + add v1.4s, v1.4s, v23.4s + add v2.4s, v2.4s, v26.4s + add v3.4s, v3.4s, v27.4s + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + smull v22.8h, v17.8b, v26.8b + smull2 v23.8h, v17.16b, v26.16b + smull v24.8h, v20.8b, v27.8b + smull2 v25.8h, v20.16b, v27.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + saddl v0.4s, v22.4h, v24.4h + saddl2 v1.4s, v22.8h, v24.8h + saddl v2.4s, v23.4h, v25.4h + saddl2 v3.4s, 
v23.8h, v25.8h + mov v19.16b, v20.16b + mov v20.16b, v21.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag2_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v17.16b}, [x12] // load the previous block right above + ld1 {v20.16b}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH - 16 + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v15.16b}, [x11] // load top right + ld1 {v18.16b}, [x12] + ld1 {v21.16b}, [x13] + + ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid + dup v22.16b, v29.b[0] + ext v9.16b, v13.16b, v14.16b, #14 + dup v23.16b, v29.b[1] + ext v10.16b, v13.16b, v14.16b, #15 + dup v24.16b, v29.b[2] + dup v25.16b, v29.b[3] + ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right + dup v26.16b, v29.b[4] + ext v12.16b, v14.16b, v15.16b, #2 + dup v27.16b, v29.b[5] + ext v13.16b, v14.16b, v15.16b, #3 + dup v28.16b, v29.b[6] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v14.8b, v25.8b + smull2 v13.8h, v14.16b, v25.16b + add v4.4s, v22.4s, v0.4s + add v5.4s, v23.4s, v1.4s + add v6.4s, v24.4s, v2.4s + add v7.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid + dup v22.16b, v29.b[7] + ext v9.16b, v16.16b, v17.16b, #14 + dup v23.16b, v29.b[8] + ext v10.16b, v16.16b, v17.16b, #15 + dup v24.16b, v29.b[9] + dup v25.16b, v29.b[10] + ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v26.16b, v29.b[11] + ext v12.16b, v17.16b, v18.16b, #2 + dup v27.16b, v29.b[12] + ext v13.16b, v17.16b, v18.16b, #3 + dup v28.16b, v29.b[13] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + 
saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v17.8b, v25.8b + smull2 v13.8h, v17.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid + dup v22.16b, v29.b[14] + ext v9.16b, v19.16b, v20.16b, #14 + dup v23.16b, v29.b[15] + ext v10.16b, v19.16b, v20.16b, #15 + dup v24.16b, v30.b[0] + dup v25.16b, v30.b[1] + ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v26.16b, v30.b[2] + ext v12.16b, v20.16b, v21.16b, #2 + dup v27.16b, v30.b[3] + ext v13.16b, v20.16b, v21.16b, #3 + dup v28.16b, v30.b[4] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v20.8b, v25.8b + smull2 v19.8h, v20.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + mov v13.16b, v14.16b + mov v14.16b, v15.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + mov v16.16b, v17.16b + mov v17.16b, v18.16b + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v19.4h + saddw2 v7.4s, v7.4s, v19.8h + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag3_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
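+	// for the leftmost block of a row, preload the three rows directly above before running the shared body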
+.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v14.16b}, [x11] // load the previous block right above + ld1 {v17.16b}, [x12] + ld1 {v20.16b}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + get_grain_row v16, v17, v18, v19, v20, v21 + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + get_grain_row_44 v16, v17, v18 + subs w1, w1, #1 + store_grain_row_44 v16, v17, v18 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function get_grain_row_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + get_grain_row v16, v17, v18, v19, v20, v21 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function get_grain_row_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + get_grain_row_44 v16, v17, v18 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function add_uv_444_coeff_lag0_neon +add_coeff_lag0_start: + smull v2.8h, v0.8b, v27.8b + smull2 v3.8h, v0.16b, v27.16b + srshl v2.8h, v2.8h, v28.8h + srshl v3.8h, v3.8h, v28.8h + saddw v2.8h, v2.8h, v1.8b + saddw2 v3.8h, v3.8h, v1.16b + sqxtn v2.8b, v2.8h + sqxtn2 v2.16b, v3.8h + ret +endfunc + +function add_uv_420_coeff_lag0_neon + ld1 {v4.16b, v5.16b}, [x19], #32 + ld1 {v6.16b, v7.16b}, [x12], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + saddlp v6.8h, v6.16b + saddlp v7.8h, v7.16b + add v4.8h, v4.8h, v6.8h + add v5.8h, v5.8h, v7.8h + rshrn v4.8b, v4.8h, #2 + rshrn2 v4.16b, v5.8h, #2 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + ld1 {v4.16b, v5.16b}, [x19], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + rshrn v4.8b, v4.8h, #1 + rshrn2 v4.16b, v5.8h, #1 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
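+	// 96 byte frame: x30/x19 here; d8-d15 and x20/x21 are only saved on the lag3 path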
+ +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH + mov x1, x2 + mul w13, w13, w14 +.endif + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #1 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 + bl get_grain_row_neon + and v0.16b, v22.16b, v29.16b + mov v1.16b, v16.16b + bl add_uv_444_coeff_lag0_neon + mov v0.16b, v23.16b + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + ld1 {v26.16b}, [x19], #16 + mov v0.16b, v24.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + add x19, x19, #2 + mov v0.16b, v25.16b + mov v1.16b, v19.16b + mov v18.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + and v0.16b, v26.16b, v30.16b + mov v1.16b, v20.16b + mov v19.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + mov v20.16b, v2.16b + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.16b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.16b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + sum_\type\()_lag1 v22, v16, v16, v17, left + sum_\type\()_lag1 v23, v16, v17, v18 + sum_\type\()_lag1 v24, v17, v18, v19 + sum_\type\()_lag1 v25, v18, v19, v20 + sum_\type\()_lag1 v20, v19, v20, v21, right + get_grain_2 v21 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + store_grain_row v22, v23, v24, v25, v20, v21 + mov v16.16b, v22.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b + mov v19.16b, v25.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl 
sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
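+	// x19 walks the collocated luma grain; its (sub)sampled values provide the luma contribution to the chroma grain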
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH-3 + mov x1, x2 + mul w13, w13, w14 + + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #7 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH +.endif + mov v0.16b, v29.16b + mov v1.16b, v16.16b + bl add_\type\()_coeff_lag0_neon + movi v0.16b, #255 + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_\type\()_coeff_lag0_neon + mov v0.16b, v30.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_\type\()_coeff_lag0_neon + mov v18.16b, v2.16b + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v16, v17, v18 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.16b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.16b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + sum_\type\()_lag1 v20, v16, v16, v17, left + sum_\type\()_lag1 v21, v16, v17, v18 + sum_\type\()_lag1 v18, v17, v18, v18, right + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v20, v21, v18 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH-48 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, 
#GRAIN_WIDTH-48 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0+\off] + umov w15, \src2[8+\off] + umov w16, \src1[2+\off] + add x14, x14, x3 + umov w17, \src2[10+\off] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4+\off] + add x16, x16, x3 + ld1 {\dst2}[8+\off], [x15] + umov w15, \src2[12+\off] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6+\off] + add x14, x14, x3 + ld1 {\dst2}[10+\off], [x17] + umov w17, \src2[14+\off] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[12+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[14+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2 + gather_interleaved \dst1, \dst2, \src1, \src2, 0 + gather_interleaved \dst2, \dst1, \src2, \src1, 0 + gather_interleaved \dst1, \dst2, \src1, \src2, 1 + gather_interleaved \dst2, \dst1, \src2, \src1, 1 +.endm + +function gather32_neon + gather v4.b, v5.b, v0.b, v1.b + ret +endfunc + +function gather16_neon + gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 + gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 + ins v4.d[1], v5.d[1] + ret +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
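+	// with x30 pushed, the stack arguments are at [sp, #16] (clip) and [sp, #24] (type)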
+ ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w6, [x6] // offsets[0][0] + ldr w8, [sp, #16] // clip + mov x9, #GRAIN_WIDTH // grain_lut stride + + neg w4, w4 + dup v29.8h, w4 // -scaling_shift + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + + add x5, x5, #9 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #24] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v6.16b, v27.b[0] + dup v7.16b, v27.b[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x8], x9 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v7.8b +.else + smull v16.8h, v18.8b, v7.8b +.endif + smull2 v17.8h, v18.16b, v7.16b + smull v18.8h, v19.8b, v7.8b + smull2 v19.8h, v19.16b, v7.16b +.if \ox + smlal v16.8h, v21.8b, v6.8b +.else + smlal v16.8h, v22.8b, v6.8b +.endif + smlal2 v17.8h, v22.16b, v6.16b + smlal v18.8h, v23.8b, v6.8b + smlal2 v19.8h, v23.16b, v6.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v0.8b // *src + noise + uaddw2 v17.8h, v17.8h, v0.16b + uaddw v18.8h, v18.8h, v1.8b + uaddw2 v19.8h, v19.8h, v1.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax 
v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w7, w7, #1 +.if \oy + dup v6.16b, v28.b[0] + dup v7.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox\()0) +.endif + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-32]! + str d8, [sp, #16] + ldp x8, x9, [sp, #32] // offsets, h + ldp x10, x11, [sp, #48] // uv, is_id + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg w13, w13 // -scaling_shift + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + ld1 {v8.h}[0], [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1 {v8.h}[1], [x15] // uv_mult + + dup v29.8h, w13 // -scaling_shift + + cbz w12, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #240 + cbz w11, 2f + // is_id + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH // grain_lut stride + + add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #64] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.16b, #23 + movi v26.16b, #22 +.else + movi v25.16b, #27 + movi v26.16b, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + 
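+// per-row loops: sx0 handles the 444 layout (no horizontal subsampling), sx1 handles 420/422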
+function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b, v7.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut + +.if !\csfl + uxtl v2.8h, v0.8b + uxtl2 v3.8h, v0.16b + uxtl v4.8h, v1.8b + uxtl2 v5.8h, v1.16b + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + uxtl v16.8h, v7.8b + uxtl2 v17.8h, v7.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v4.8h, v4.8h, v8.h[0] + mul v5.8h, v5.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + mul v16.8h, v16.8h, v8.h[1] + mul v17.8h, v17.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v17.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + sshr v4.8h, v4.8h, #6 + sshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + add v4.8h, v4.8h, v24.8h + add v5.8h, v5.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h + sqxtun v1.8b, v4.8h + sqxtun2 v1.16b, v5.8h +.endif + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b + smull v18.8h, v19.8b, v26.8b + smull2 v19.8h, v19.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + smlal v18.8h, v23.8b, v25.8b + smlal2 v19.8h, v23.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + uaddw v18.8h, v18.8h, v7.8b + uaddw2 v19.8h, v19.8h, v7.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w9, w9, #1 +.if \oy + dup v25.16b, v28.b[0] + dup v26.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + 
fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b}, [x5], x10 // grain_lut + + uaddlp v2.8h, v0.16b + uaddlp v3.8h, v1.16b +.if \csfl + rshrn v0.8b, v2.8h, #1 + rshrn2 v0.16b, v3.8h, #1 +.else + urshr v2.8h, v2.8h, #1 + urshr v3.8h, v3.8h, #1 + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h +.endif + + bl gather16_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + + umax v0.16b, v0.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword 
L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S new file mode 100644 index 0000000000..7c4ff6dda9 --- /dev/null +++ b/third_party/dav1d/src/arm/64/filmgrain16.S @@ -0,0 +1,1997 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b,\r3\().16b}, [x0], #32 + st1 {\r4\().16b}, [x0], #16 + st1 {\r5\().h}[0], [x0], #2 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +function get_grain_4_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #2 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << 
(4 - bitdepth_min_8 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.h[7], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub x12, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + + ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid + ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right + + smull v4.4s, v17.4h, v28.4h + smlal v4.4s, v0.4h, v27.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v17.8h, v28.8h + smlal2 v5.4s, v0.8h, v27.8h + smlal2 v5.4s, v1.8h, v29.8h + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff + bl sum_\lag\()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH*2 + ld1 {v22.8h, v23.8h}, [x19], #32 + ld1 {v24.8h, v25.8h}, [x12] + addp v22.8h, v22.8h, v23.8h + addp v23.8h, v24.8h, v25.8h + add v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.8h, v23.8h}, [x19], #32 + addp v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.8h}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.8b, \uv_coeff + sxtl v1.8h, v1.8b + smlal v4.4s, v0.4h, v1.4h + smlal2 v5.4s, v0.8h, v1.8h +.else + smlal v4.4s, v0.4h, v30.4h + smlal2 v5.4s, v0.8h, v30.8h +.endif +.endif +.if \uv_layout && \elems == 8 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.h[5] +.endif +.ifnc \lag, lag1 + smov w16, v0.h[6] +.endif + smov w14, v0.h[7] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b +.ifc \edge, right + mov w15, #3 + bl output_\lag\()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #2 +.else + mov w15, #4 + bl output_\lag\()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #14 +.endif + st1 {v0.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag1_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
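+	// 16 bpc grain rows are GRAIN_WIDTH*2 bytes apart; the left edge case reloads the row above into v17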
+.ifc \edge, left + sub x12, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + ld1 {v21.8h}, [x13] + + dup v26.8b, v30.b[0] + ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid + dup v27.8b, v30.b[1] + ext v23.16b, v16.16b, v17.16b, #14 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[3] + ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.8b, v30.b[4] + ext v1.16b, v17.16b, v18.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smull v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[5] + ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid + dup v27.16b, v30.b[6] + ext v23.16b, v19.16b, v20.16b, #14 + sxtl v26.8h, v26.8b + dup v28.16b, v30.b[8] + ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.16b, v30.b[9] + ext v1.16b, v20.16b, v21.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smlal v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smlal2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + sxtl v26.8h, v26.8b + sxtl v27.8h, v27.8b + + smlal v4.4s, v17.4h, v26.4h + smlal v4.4s, v20.4h, v27.4h + smlal2 v5.4s, v17.8h, v26.8h + smlal2 v5.4s, v20.8h, v27.8h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag2_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH*2 - 16 + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v15.8h}, [x11] // load top right + ld1 {v18.8h}, [x12] + ld1 {v21.8h}, [x13] + + dup v22.8b, v29.b[0] + ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid + dup v23.8b, v29.b[1] + ext v9.16b, v13.16b, v14.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[2] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[3] + ext v10.16b, v13.16b, v14.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[4] + ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[5] + ext v12.16b, v14.16b, v15.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[6] + ext v13.16b, v14.16b, v15.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smull v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v14.4h, v25.4h + smull2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v14.8h, v25.8h + + dup v22.8b, v29.b[7] + ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid + dup v23.8b, v29.b[8] + ext v9.16b, v16.16b, v17.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[9] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[10] + ext v10.16b, v16.16b, v17.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[11] + ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[12] + ext v12.16b, v17.16b, v18.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[13] + ext v13.16b, v17.16b, v18.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v17.4h, v25.4h + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v17.8h, v25.8h + + dup v22.8b, v29.b[14] + ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid + dup v23.8b, v29.b[15] + ext v9.16b, v19.16b, v20.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v30.b[0] + sxtl v23.8h, v23.8b + dup v25.8b, v30.b[1] + ext v10.16b, v19.16b, v20.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v30.b[2] + ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v30.b[3] + ext v12.16b, v20.16b, v21.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[4] + ext v13.16b, v20.16b, v21.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + 
smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v20.4h, v25.4h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v20.8h, v25.8h + + mov v13.16b, v14.16b + mov v14.16b, v15.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag3_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH*2 + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v14.8h}, [x11] // load the previous block right above + ld1 {v17.8h}, [x12] + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #80 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_2 v0 + subs w1, w1, #1 + st1 {v0.s}[0], [x0], #4 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #40 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_4 v0 + subs w1, w1, #1 + st1 {v0.4h}, [x0] + add x0, x0, #GRAIN_WIDTH*2-80 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_444_lag0_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v4.8h}, [x19], #16 +gen_grain_uv_lag0_8_start: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h +gen_grain_uv_lag0_8_add: + and v4.16b, v4.16b, v1.16b + smull v2.4s, v4.4h, v27.4h + smull2 v3.4s, v4.8h, v27.8h + srshl v2.4s, v2.4s, v28.4s + srshl v3.4s, v3.4s, v28.4s + sqxtn v2.4h, v2.4s + sqxtn2 v2.8h, v3.4s + sqadd v2.8h, v2.8h, v0.8h + smin v2.8h, v2.8h, v25.8h + smax v2.8h, v2.8h, v26.8h + st1 {v2.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_420_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + add x12, x19, #GRAIN_WIDTH*2 + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + ld1 {v18.8h, v19.8h}, [x12] + addp v16.8h, v16.8h, v17.8h + addp v17.8h, v18.8h, v19.8h + add v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + addp v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add x12, x19, #GRAIN_WIDTH*2 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
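+	// 4 pixel tail of a 44 wide row: average each 2x2 block of luma grain down to one chroma position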
+ ld1 {v16.4h, v17.4h}, [x19] + ld1 {v18.4h, v19.4h}, [x12] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + addp v17.4h, v18.4h, v19.4h + add v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #2 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.4h, v17.4h}, [x19] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #1 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! + +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 +.else + clz w15, w2 +.endif + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #2 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + mov v1.16b, v29.16b + bl gen_grain_uv_444_lag0_neon // 8 + movi v1.16b, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + mov v1.16b, v30.16b + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 + add x19, x19, #4 + st1 {v16.s}[0], [x0], #4 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.8b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, 
v29.8b +.ifc \type, uv_444 + sxtl v30.8h, v30.8b +.endif + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_mid_neon // 48 + bl sum_\type\()_lag1_mid_neon // 56 + bl sum_\type\()_lag1_mid_neon // 64 + bl sum_\type\()_lag1_mid_neon // 72 + bl sum_\type\()_lag1_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_mid_neon // 48 + bl sum_\type\()_lag2_mid_neon // 56 + bl sum_\type\()_lag2_mid_neon // 64 + bl sum_\type\()_lag2_mid_neon // 72 + bl sum_\type\()_lag2_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_mid_neon // 48 + bl sum_\type\()_lag3_mid_neon // 56 + bl sum_\type\()_lag3_mid_neon // 64 + bl sum_\type\()_lag3_mid_neon // 72 + bl sum_\type\()_lag3_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
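+        // The 4:2:0/4:2:2 variants generate 44 grain values per row (see the
+        // "// 44" end-of-row markers below); x19 tracks the matching position
+        // in the previously generated luma grain so that the lag0 case can
+        // derive chroma grain from downsampled luma grain. The seed is xored
+        // with a per-plane constant (0x49d8 or 0xb524) further down.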
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #(3*GRAIN_WIDTH-3)*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 + + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #14 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + mov v1.16b, v29.16b + bl gen_grain_\type\()_lag0_8_neon // 8 + movi v1.16b, #255 + bl gen_grain_\type\()_lag0_8_neon // 16 + bl gen_grain_\type\()_lag0_8_neon // 24 + bl gen_grain_\type\()_lag0_8_neon // 32 + bl gen_grain_\type\()_lag0_8_neon // 40 + mov v1.16b, v30.16b + bl gen_grain_\type\()_lag0_4_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.8b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + sxtl v30.8h, v30.8b + set_height w1, \type +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + 
AARCH64_VALID_JUMP_TARGET + ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0] + umov w15, \src2[1] + umov w16, \src1[2] + add x14, x14, x3 + umov w17, \src2[3] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4] + add x16, x16, x3 + ld1 {\dst2}[1+\off], [x15] + umov w15, \src2[5] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6] + add x14, x14, x3 + ld1 {\dst2}[3+\off], [x17] + umov w17, \src2[7] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[5+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[7+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2, src3, src4 + gather_interleaved \dst1, \dst2, \src1, \src3, 0 + gather_interleaved \dst2, \dst1, \src3, \src1, 0 + gather_interleaved \dst1, \dst2, \src2, \src4, 8 + gather_interleaved \dst2, \dst1, \src4, \src2, 8 +.endm + +function gather32_neon + gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h + ret +endfunc + +function gather16_neon + gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 + gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 + ins v6.d[1], v7.d[0] + ret +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-80]! 
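+        // For every pixel: noise = round2(scaling[src] * grain, scaling_shift)
+        // and dst = clip(src + noise, min, max). The scaling value is shifted
+        // left by 15 - scaling_shift so that a single sqrdmulh (which rounds
+        // by 15 bits) performs the round2 by scaling_shift. Grain along block
+        // edges is blended with the neighbouring block's grain using the
+        // overlap coefficients loaded from overlap_coeffs_0.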
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + str d14, [sp, #64] + eor w4, w4, #15 // 15 - scaling_shift + ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w10, [sp, #96] // bitdepth_max + ldr w6, [x6] // offsets[0][0] + dup v26.8h, w10 // bitdepth_max + clz w10, w10 + ldr w8, [sp, #80] // clip + sub w10, w10, #24 // -bitdepth_min_8 + mov x9, #GRAIN_WIDTH*2 // grain_lut stride + neg w10, w10 // bitdepth_min_8 + + dup v29.8h, w4 // 15 - scaling_shift + dup v27.8h, w10 // bitdepth_min_8 + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #235 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v26.16b // bitdepth_max +2: + + ushr v26.8h, v26.8h, #1 // grain_max + not v25.16b, v26.16b // grain_min + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + + add x5, x5, #18 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #88] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v8.8h, v27.h[0] + dup v9.8h, v27.h[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src +.if \ox + ld1 {v20.4h}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v14.4h}, [x8], x9 // grain_lut top old +.endif + mvni v4.8h, #0xf0, lsl #8 // 0x0fff + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
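+        // (The pixels are only used as indices into scaling[] by
+        // gather32_neon below, so masking them to the bitdepth range is
+        // sufficient.)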
+ and v0.16b, v0.16b, v4.16b + and v1.16b, v1.16b, v4.16b + and v2.16b, v2.16b, v4.16b + and v3.16b, v3.16b, v4.16b + bl gather32_neon + +.if \ox + smull v20.4s, v20.4h, v27.4h + smlal v20.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v14.4s, v14.4h, v27.4h + smlal v14.4s, v21.4h, v28.4h + sqrshrn v20.4h, v20.4s, #5 + sqrshrn v14.4h, v14.4s, #5 + smin v20.4h, v20.4h, v26.4h + smin v14.4h, v14.4h, v26.4h + smax v20.4h, v20.4h, v25.4h + smax v14.4h, v14.4h, v25.4h +.endif + +.if \ox + smull v10.4s, v20.4h, v9.4h +.else + smull v10.4s, v16.4h, v9.4h +.endif + smull2 v11.4s, v16.8h, v9.8h + smull v12.4s, v17.4h, v9.4h + smull2 v13.4s, v17.8h, v9.8h + smull v16.4s, v18.4h, v9.4h + smull2 v17.4s, v18.8h, v9.8h + smull v18.4s, v19.4h, v9.4h + smull2 v19.4s, v19.8h, v9.8h +.if \ox + smlal v10.4s, v14.4h, v8.4h +.else + smlal v10.4s, v21.4h, v8.4h +.endif + smlal2 v11.4s, v21.8h, v8.8h + smlal v12.4s, v22.4h, v8.4h + smlal2 v13.4s, v22.8h, v8.8h + smlal v16.4s, v23.4h, v8.4h + smlal2 v17.4s, v23.8h, v8.8h + smlal v18.4s, v24.4h, v8.4h + smlal2 v19.4s, v24.8h, v8.8h + sqrshrn v10.4h, v10.4s, #5 + sqrshrn2 v10.8h, v11.4s, #5 + sqrshrn v11.4h, v12.4s, #5 + sqrshrn2 v11.8h, v13.4s, #5 + sqrshrn v12.4h, v16.4s, #5 + sqrshrn2 v12.8h, v17.4s, #5 + sqrshrn v13.4h, v18.4s, #5 + sqrshrn2 v13.8h, v19.4s, #5 + smin v16.8h, v10.8h, v26.8h + smin v17.8h, v11.8h, v26.8h + smin v18.8h, v12.8h, v26.8h + smin v19.8h, v13.8h, v26.8h + smax v16.8h, v16.8h, v25.8h + smax v17.8h, v17.8h, v25.8h + smax v18.8h, v18.8h, v25.8h + smax v19.8h, v19.8h, v25.8h +.endif + + uxtl v4.8h, v6.8b // scaling +.if \ox && !\oy + sqrshrn v20.4h, v20.4s, #5 +.endif + uxtl2 v5.8h, v6.16b +.if \ox && !\oy + smin v20.4h, v20.4h, v26.4h +.endif + uxtl v6.8h, v7.8b +.if \ox && !\oy + smax v20.4h, v20.4h, v25.4h +.endif + uxtl2 v7.8h, v7.16b +.if \ox && !\oy + ins v16.d[0], v20.d[0] +.endif + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v21.8h, v17.8h, v5.8h + sqrdmulh v22.8h, v18.8h, v6.8h + sqrdmulh v23.8h, v19.8h, v7.8h + + usqadd v0.8h, v20.8h // *src + noise + usqadd v1.8h, v21.8h + usqadd v2.8h, v22.8h + usqadd v3.8h, v23.8h + + umax v0.8h, v0.8h, v30.8h + umax v1.8h, v1.8h, v30.8h + umax v2.8h, v2.8h, v30.8h + umax v3.8h, v3.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w7, w7, #1 +.if \oy + dup v8.8h, v28.h[0] + dup v9.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox\()0) +.endif + ldr d14, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const 
ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldp x8, x9, [sp, #80] // offsets, h + ldp x10, x11, [sp, #96] // uv, is_id + ldr w16, [sp, #120] // bitdepth_max + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + dup v23.8h, w16 // bitdepth_max + clz w16, w16 + eor w13, w13, #15 // 15 - scaling_shift + sub w16, w16, #24 // -bitdepth_min_8 + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + neg w16, w16 // bitdepth_min_8 + ld1r {v8.8h}, [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1r {v9.8h}, [x15] // uv_mult + + dup v29.8h, w13 // 15 - scaling_shift + dup v27.8h, w16 // bitdepth_min_8 + + cbz w12, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #240 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + cbz w11, 2f + // is_id + movi v31.8h, #235 + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v23.16b // bitdepth_max +2: + + ushr v15.8h, v23.8h, #1 // grain_max + sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 + not v14.16b, v15.16b // grain_min + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH*2 // grain_lut stride + + add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #112] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.8h, #23 + movi v26.8h, #22 +.else + movi v25.8h, #27 + movi v26.8h, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: +.if \ox + ld1 {v4.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v5.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut + +.if \ox + smull v4.4s, v4.4h, 
v27.4h + smlal v4.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v5.4s, v5.4h, v27.4h + smlal v5.4s, v0.4h, v28.4h + sqrshrn v4.4h, v4.4s, #5 + sqrshrn v5.4h, v5.4s, #5 + smin v4.4h, v4.4h, v15.4h + smin v5.4h, v5.4h, v15.4h + smax v4.4h, v4.4h, v14.4h + smax v5.4h, v5.4h, v14.4h + ins v16.d[0], v4.d[0] + ins v0.d[0], v5.d[0] +.endif + + smull v6.4s, v16.4h, v26.4h + smull2 v7.4s, v16.8h, v26.8h + smull v10.4s, v17.4h, v26.4h + smull2 v11.4s, v17.8h, v26.8h + smull v16.4s, v18.4h, v26.4h + smull2 v17.4s, v18.8h, v26.8h + smull v18.4s, v19.4h, v26.4h + smull2 v19.4s, v19.8h, v26.8h + smlal v6.4s, v0.4h, v25.4h + smlal2 v7.4s, v0.8h, v25.8h + smlal v10.4s, v1.4h, v25.4h + smlal2 v11.4s, v1.8h, v25.8h + smlal v16.4s, v2.4h, v25.4h + smlal2 v17.4s, v2.8h, v25.8h + smlal v18.4s, v3.4h, v25.4h + smlal2 v19.4s, v3.8h, v25.8h + sqrshrn v6.4h, v6.4s, #5 + sqrshrn2 v6.8h, v7.4s, #5 + sqrshrn v7.4h, v10.4s, #5 + sqrshrn2 v7.8h, v11.4s, #5 + sqrshrn v10.4h, v16.4s, #5 + sqrshrn2 v10.8h, v17.4s, #5 + sqrshrn v11.4h, v18.4s, #5 + sqrshrn2 v11.8h, v19.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v4.4h, v4.4s, #5 + smin v4.4h, v4.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v6.8h, v15.8h + smin v17.8h, v7.8h, v15.8h + smin v18.8h, v10.8h, v15.8h + smin v19.8h, v11.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h + smax v18.8h, v18.8h, v14.8h + smax v19.8h, v19.8h, v14.8h +.endif + +.if \ox && !\oy + smax v4.4h, v4.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v4.d[0] +.endif + +.if !\csfl + smull v4.4s, v0.4h, v8.4h + smull2 v5.4s, v0.8h, v8.8h + smull v6.4s, v1.4h, v8.4h + smull2 v7.4s, v1.8h, v8.8h + smull v0.4s, v2.4h, v8.4h + smull2 v1.4s, v2.8h, v8.8h + smull v2.4s, v3.4h, v8.4h + smull2 v3.4s, v3.8h, v8.8h + smlal v4.4s, v10.4h, v9.4h + smlal2 v5.4s, v10.8h, v9.8h + smlal v6.4s, v11.4h, v9.4h + smlal2 v7.4s, v11.8h, v9.8h + smlal v0.4s, v12.4h, v9.4h + smlal2 v1.4s, v12.8h, v9.8h + smlal v2.4s, v13.4h, v9.4h + smlal2 v3.4s, v13.8h, v9.8h + shrn v4.4h, v4.4s, #6 + shrn2 v4.8h, v5.4s, #6 + shrn v5.4h, v6.4s, #6 + shrn2 v5.8h, v7.4s, #6 + shrn v6.4h, v0.4s, #6 + shrn2 v6.8h, v1.4s, #6 + shrn v7.4h, v2.4s, #6 + shrn2 v7.8h, v3.4s, #6 + add v0.8h, v4.8h, v24.8h + add v1.8h, v5.8h, v24.8h + add v2.8h, v6.8h, v24.8h + add v3.8h, v7.8h, v24.8h + movi v20.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smin v2.8h, v2.8h, v23.8h + smin v3.8h, v3.8h, v23.8h + smax v0.8h, v0.8h, v20.8h + smax v1.8h, v1.8h, v20.8h + smax v2.8h, v2.8h, v20.8h + smax v3.8h, v3.8h, v20.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
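+        // (In the csfl case the luma pixels themselves index scaling[], so
+        // masking them with bitdepth_max keeps the gather in range.)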
+ and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b + and v2.16b, v2.16b, v23.16b + and v3.16b, v3.16b, v23.16b +.endif + + bl gather32_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + uxtl v6.8h, v7.8b + uxtl2 v7.8h, v7.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + sqrdmulh v18.8h, v18.8h, v6.8h + sqrdmulh v19.8h, v19.8h, v7.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + usqadd v12.8h, v18.8h + usqadd v13.8h, v19.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umax v2.8h, v12.8h, v30.8h + umax v3.8h, v13.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w9, w9, #1 +.if \oy + dup v25.8h, v28.h[0] + dup v26.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: +.if \ox + ld1 {v18.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v19.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut + +.if \ox + smull v18.4s, v18.4h, v27.4h + smlal v18.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v19.4s, v19.4h, v27.4h + smlal v19.4s, v20.4h, v28.4h + sqrshrn v18.4h, v18.4s, #5 + sqrshrn v19.4h, v19.4s, #5 + smin v18.4h, v18.4h, v15.4h + smin v19.4h, v19.4h, v15.4h + smax v18.4h, v18.4h, v14.4h + smax v19.4h, v19.4h, v14.4h + ins v16.d[0], v18.d[0] + ins v20.d[0], v19.d[0] +.endif + + smull v0.4s, v16.4h, v26.4h + smull2 v1.4s, v16.8h, v26.8h + smull v2.4s, v17.4h, v26.4h + smull2 v3.4s, v17.8h, v26.8h + smlal v0.4s, v20.4h, v25.4h + smlal2 v1.4s, v20.8h, v25.8h + smlal v2.4s, v21.4h, v25.4h + smlal2 v3.4s, v21.8h, v25.8h + sqrshrn v16.4h, v0.4s, #5 + sqrshrn2 v16.8h, v1.4s, #5 + sqrshrn v17.4h, v2.4s, #5 + sqrshrn2 v17.8h, v3.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v18.4h, v18.4s, #5 + smin v18.4h, v18.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v16.8h, v15.8h + smin v17.8h, v17.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h +.endif + +.if \ox && !\oy + smax v18.4h, v18.4h, v14.4h +.endif + 
ld1 {v10.8h, v11.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v18.d[0] +.endif + addp v0.8h, v0.8h, v1.8h + addp v1.8h, v2.8h, v3.8h + urshr v0.8h, v0.8h, #1 + urshr v1.8h, v1.8h, #1 +.if !\csfl + smull v2.4s, v0.4h, v8.4h + smull2 v3.4s, v0.8h, v8.8h + smull v0.4s, v1.4h, v8.4h + smull2 v1.4s, v1.8h, v8.8h + smlal v2.4s, v10.4h, v9.4h + smlal2 v3.4s, v10.8h, v9.8h + smlal v0.4s, v11.4h, v9.4h + smlal2 v1.4s, v11.8h, v9.8h + shrn v2.4h, v2.4s, #6 + shrn2 v2.8h, v3.4s, #6 + shrn v3.4h, v0.4s, #6 + shrn2 v3.8h, v1.4s, #6 + add v0.8h, v2.8h, v24.8h + add v1.8h, v3.8h, v24.8h + movi v2.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smax v0.8h, v0.8h, v2.8h + smax v1.8h, v1.8h, v2.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b +.endif + + bl gather16_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.8h, v1.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S new file mode 100644 index 0000000000..dab67577e6 --- /dev/null +++ b/third_party/dav1d/src/arm/64/ipred.S @@ -0,0 +1,3985 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_128_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_128_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + movi v0.16b, #128 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + AARCH64_VALID_JUMP_TARGET + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + movi v1.16b, #128 +32: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + movi v1.16b, #128 + movi v2.16b, #128 + movi v3.16b, #128 +64: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_128_tbl): + .hword L(ipred_dc_128_tbl) - 640b + .hword L(ipred_dc_128_tbl) - 320b + .hword L(ipred_dc_128_tbl) - 16b + .hword L(ipred_dc_128_tbl) - 8b + .hword L(ipred_dc_128_tbl) - 4b +endfunc + +// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_v_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #1 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.s}[0], [x2] +4: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2] +8: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2] +16: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + 
ld1 {v0.16b, v1.16b}, [x2] +32: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] +64: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 64b + ret + +L(ipred_v_tbl): + .hword L(ipred_v_tbl) - 640b + .hword L(ipred_v_tbl) - 320b + .hword L(ipred_v_tbl) - 160b + .hword L(ipred_v_tbl) - 80b + .hword L(ipred_v_tbl) - 40b +endfunc + +// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_h_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + sub x2, x2, #4 + sub x5, x5, w3, uxtw + mov x7, #-4 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v1.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + st1 {v3.8b}, [x0], x1 + st1 {v2.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v1.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 + st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 64b + ret + +L(ipred_h_tbl): + .hword L(ipred_h_tbl) - 64b + .hword L(ipred_h_tbl) - 32b + .hword L(ipred_h_tbl) - 16b + .hword L(ipred_h_tbl) - 8b + .hword L(ipred_h_tbl) - 4b +endfunc + +// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_8bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_top_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #1 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v0.2s}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.8b, v0.b[0] +4: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.8b, v0.b[0] +8: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 8b + ret 
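+        // The wider cases below sum the top row in 16-byte chunks with
+        // uaddlv, round the total by log2(width), and broadcast the result
+        // as the DC value.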
+160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] +16: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v2.4h, v0.4h, v1.4h + rshrn v2.8b, v2.8h, #5 + dup v0.16b, v2.b[0] + dup v1.16b, v2.b[0] +32: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v4.4h, v0.4h, v1.4h + add v5.4h, v2.4h, v3.4h + add v4.4h, v4.4h, v5.4h + rshrn v4.8b, v4.8h, #6 + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] + dup v2.16b, v4.b[0] + dup v3.16b, v4.b[0] +64: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_top_tbl): + .hword L(ipred_dc_top_tbl) - 640b + .hword L(ipred_dc_top_tbl) - 320b + .hword L(ipred_dc_top_tbl) - 160b + .hword L(ipred_dc_top_tbl) - 80b + .hword L(ipred_dc_top_tbl) - 40b +endfunc + +// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_8bpc_neon, export=1 + sub x2, x2, w4, uxtw + clz w3, w3 + clz w7, w4 + adr x5, L(ipred_dc_left_tbl) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w7, w7, #25 + ldrh w3, [x5, w3, uxtw #1] + ldrh w7, [x5, w7, uxtw #1] + sub x3, x5, w3, uxtw + sub x5, x5, w7, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_left_h4): + AARCH64_VALID_JUMP_TARGET + ld1r {v0.2s}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w4): + AARCH64_VALID_JUMP_TARGET + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt L(ipred_dc_left_w4) + ret + +L(ipred_dc_left_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + rshrn v0.8b, v0.8h, #3 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w8): + AARCH64_VALID_JUMP_TARGET + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt L(ipred_dc_left_w8) + ret + +L(ipred_dc_left_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + rshrn v0.8b, v0.8h, #4 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w16): + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt L(ipred_dc_left_w16) + ret + +L(ipred_dc_left_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v0.4h, v0.4h, v1.4h + rshrn v0.8b, v0.8h, #5 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w32): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b +1: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h64): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b, v2.16b, 
v3.16b}, [x2] + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v0.4h, v0.4h, v1.4h + add v2.4h, v2.4h, v3.4h + add v0.4h, v0.4h, v2.4h + rshrn v0.8b, v0.8h, #6 + dup v0.16b, v0.b[0] + br x3 +L(ipred_dc_left_w64): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +1: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_tbl): + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) +endfunc + +// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_8bpc_neon, export=1 + sub x2, x2, w4, uxtw + add w7, w3, w4 // width + height + clz w3, w3 + clz w6, w4 + dup v16.8h, w7 // width + height + adr x5, L(ipred_dc_tbl) + rbit w7, w7 // rbit(width + height) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w6, w6, #25 + clz w7, w7 // ctz(width + height) + ldrh w3, [x5, w3, uxtw #1] + ldrh w6, [x5, w6, uxtw #1] + neg w7, w7 // -ctz(width + height) + sub x3, x5, w3, uxtw + sub x5, x5, w6, uxtw + ushr v16.8h, v16.8h, #1 // (width + height) >> 1 + dup v17.8h, w7 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.s}[0], [x2], #4 + ins v0.s[1], wzr + uaddlv h0, v0.8b + add x2, x2, #1 + br x3 +L(ipred_dc_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.s}[0], [x2] + ins v1.s[1], wzr + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.8b + cmp w4, #4 + add v0.4h, v0.4h, v1.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16 + mov w16, #(0x3334/2) + movk w16, #(0x5556/2), lsl #16 + add w17, w4, w4 // w17 = 2*h = 16 or 32 + lsr w16, w16, w17 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8b, v0.b[0] +2: + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + subs w4, w4, #4 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2], #8 + uaddlv h0, v0.8b + add x2, x2, #1 + br x3 +L(ipred_dc_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.8b + cmp w4, #8 + add v0.4h, v0.4h, v1.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8b, v0.b[0] +2: + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2], #16 + uaddlv h0, v0.16b + add x2, x2, #1 + br x3 +L(ipred_dc_w16): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.16b + cmp w4, #16 + add v0.4h, v0.4h, v1.4h + ushl v0.4h, v0.4h, v17.4h + 
b.eq 1f + // h = 4/8/32/64 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.16b, v0.b[0] +2: + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b}, [x2], #32 + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add x2, x2, #1 + add v0.4h, v0.4h, v1.4h + br x3 +L(ipred_dc_w32): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b, v2.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.16b + uaddlv h2, v2.16b + cmp w4, #32 + add v0.4h, v0.4h, v1.4h + add v0.4h, v0.4h, v2.4h + ushl v4.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16/64 + cmp w4, #8 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v4.4h, v4.4h, v16.4h +1: + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] +2: + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b}, [x0], x1 + st1 {v0.16b, v1.16b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h64): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 + uaddlv h0, v0.16b + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v0.4h, v0.4h, v1.4h + add v2.4h, v2.4h, v3.4h + add x2, x2, #1 + add v0.4h, v0.4h, v2.4h + br x3 +L(ipred_dc_w64): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h1, v1.16b + uaddlv h2, v2.16b + uaddlv h3, v3.16b + uaddlv h4, v4.16b + add v1.4h, v1.4h, v2.4h + add v3.4h, v3.4h, v4.4h + cmp w4, #64 + add v0.4h, v0.4h, v1.4h + add v0.4h, v0.4h, v3.4h + ushl v4.4h, v0.4h, v17.4h + b.eq 1f + // h = 16/32 + mov w16, #(0x5556/2) + movk w16, #(0x3334/2), lsl #16 + lsr w16, w16, w4 + dup v16.4h, w16 + sqdmulh v4.4h, v4.4h, v16.4h +1: + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] + dup v2.16b, v4.b[0] + dup v3.16b, v4.b[0] +2: + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + subs w4, w4, #4 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_tbl): + .hword L(ipred_dc_tbl) - L(ipred_dc_h64) + .hword L(ipred_dc_tbl) - L(ipred_dc_h32) + .hword L(ipred_dc_tbl) - L(ipred_dc_h16) + .hword L(ipred_dc_tbl) - L(ipred_dc_h8) + .hword L(ipred_dc_tbl) - L(ipred_dc_h4) + .hword L(ipred_dc_tbl) - L(ipred_dc_w64) + .hword L(ipred_dc_tbl) - L(ipred_dc_w32) + .hword L(ipred_dc_tbl) - L(ipred_dc_w16) + .hword L(ipred_dc_tbl) - L(ipred_dc_w8) + .hword L(ipred_dc_tbl) - L(ipred_dc_w4) +endfunc + +// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_8bpc_neon, export=1 + clz w9, w3 + adr x5, L(ipred_paeth_tbl) + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.16b}, [x2] + add x8, x2, #1 + sub x2, x2, #4 + sub x5, x5, w9, uxtw + mov x7, #-4 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v5.4s}, [x8] + usubl v6.8h, v5.8b, v4.8b // top - topleft +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + zip1 v0.2s, v0.2s, v1.2s + zip1 v2.2s, v2.2s, v3.2s + uaddw v16.8h, v6.8h, v0.8b + uaddw v17.8h, v6.8h, v2.8b + sqxtun v16.8b, v16.8h // base + sqxtun2 v16.16b, v17.8h + zip1 v0.2d, v0.2d, v2.2d + uabd v20.16b, v5.16b, 
v16.16b // tdiff + uabd v22.16b, v4.16b, v16.16b // tldiff + uabd v16.16b, v0.16b, v16.16b // ldiff + umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) + cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff + cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff + bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... + st1 {v20.s}[3], [x0], x1 + st1 {v20.s}[2], [x6], x1 + subs w4, w4, #4 + st1 {v20.s}[1], [x0], x1 + st1 {v20.s}[0], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1r {v5.2d}, [x8] + usubl v6.8h, v5.8b, v4.8b // top - topleft +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 + uaddw v16.8h, v6.8h, v0.8b + uaddw v17.8h, v6.8h, v1.8b + uaddw v18.8h, v6.8h, v2.8b + uaddw v19.8h, v6.8h, v3.8b + sqxtun v16.8b, v16.8h // base + sqxtun2 v16.16b, v17.8h + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h + zip1 v2.2d, v2.2d, v3.2d + zip1 v0.2d, v0.2d, v1.2d + uabd v21.16b, v5.16b, v18.16b // tdiff + uabd v20.16b, v5.16b, v16.16b + uabd v23.16b, v4.16b, v18.16b // tldiff + uabd v22.16b, v4.16b, v16.16b + uabd v17.16b, v2.16b, v18.16b // ldiff + uabd v16.16b, v0.16b, v16.16b + umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) + umin v18.16b, v20.16b, v22.16b + cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff + cmhs v20.16b, v22.16b, v20.16b + cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff + cmhs v16.16b, v18.16b, v16.16b + bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v20.16b, v5.16b, v4.16b + bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... + bit v20.16b, v0.16b, v16.16b + st1 {v21.d}[1], [x0], x1 + st1 {v21.d}[0], [x6], x1 + subs w4, w4, #4 + st1 {v20.d}[1], [x0], x1 + st1 {v20.d}[0], [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v5.16b}, [x8], #16 + mov w9, w3 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw +1: + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 +2: + usubl v6.8h, v5.8b, v4.8b // top - topleft + usubl2 v7.8h, v5.16b, v4.16b + uaddw v24.8h, v6.8h, v0.8b + uaddw v25.8h, v7.8h, v0.8b + uaddw v26.8h, v6.8h, v1.8b + uaddw v27.8h, v7.8h, v1.8b + uaddw v28.8h, v6.8h, v2.8b + uaddw v29.8h, v7.8h, v2.8b + uaddw v30.8h, v6.8h, v3.8b + uaddw v31.8h, v7.8h, v3.8b + sqxtun v17.8b, v26.8h // base + sqxtun2 v17.16b, v27.8h + sqxtun v16.8b, v24.8h + sqxtun2 v16.16b, v25.8h + sqxtun v19.8b, v30.8h + sqxtun2 v19.16b, v31.8h + sqxtun v18.8b, v28.8h + sqxtun2 v18.16b, v29.8h + uabd v23.16b, v5.16b, v19.16b // tdiff + uabd v22.16b, v5.16b, v18.16b + uabd v21.16b, v5.16b, v17.16b + uabd v20.16b, v5.16b, v16.16b + uabd v27.16b, v4.16b, v19.16b // tldiff + uabd v26.16b, v4.16b, v18.16b + uabd v25.16b, v4.16b, v17.16b + uabd v24.16b, v4.16b, v16.16b + uabd v19.16b, v3.16b, v19.16b // ldiff + uabd v18.16b, v2.16b, v18.16b + uabd v17.16b, v1.16b, v17.16b + uabd v16.16b, v0.16b, v16.16b + umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) + umin v30.16b, v22.16b, v26.16b + umin v29.16b, v21.16b, v25.16b + umin v28.16b, v20.16b, v24.16b + cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff + cmhs v22.16b, v26.16b, v22.16b + cmhs v21.16b, v25.16b, v21.16b + cmhs v20.16b, v24.16b, v20.16b + cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff + cmhs v18.16b, v30.16b, v18.16b + cmhs v17.16b, v29.16b, v17.16b + cmhs v16.16b, v28.16b, v16.16b + bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? 
top : topleft + bsl v22.16b, v5.16b, v4.16b + bsl v21.16b, v5.16b, v4.16b + bsl v20.16b, v5.16b, v4.16b + bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... + bit v22.16b, v2.16b, v18.16b + bit v21.16b, v1.16b, v17.16b + bit v20.16b, v0.16b, v16.16b + subs w3, w3, #16 + st1 {v23.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + st1 {v21.16b}, [x5], #16 + st1 {v20.16b}, [x10], #16 + b.le 8f + ld1 {v5.16b}, [x8], #16 + b 2b +8: + subs w4, w4, #4 + b.le 9f + // End of horizontal loop, move pointers to next four rows + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + // Load the top row as early as possible + ld1 {v5.16b}, [x8], #16 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_paeth_tbl): + .hword L(ipred_paeth_tbl) - 640b + .hword L(ipred_paeth_tbl) - 320b + .hword L(ipred_paeth_tbl) - 160b + .hword L(ipred_paeth_tbl) - 80b + .hword L(ipred_paeth_tbl) - 40b +endfunc + +// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_8bpc_neon, export=1 + movrel x10, X(sm_weights) + add x11, x10, w4, uxtw + add x10, x10, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_tbl) + sub x12, x2, w4, uxtw + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.16b}, [x12] // bottom + add x8, x2, #1 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v6.2s}, [x8] // top + ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #4 + mov x7, #-4 + dup v5.16b, v6.b[3] // right + usubl v6.8h, v6.8b, v4.8b // top-bottom + uxtl v7.8h, v7.8b // weights_hor +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + zip1 v1.2s, v1.2s, v0.2s // left, flipped + zip1 v0.2s, v3.2s, v2.2s + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + shll v22.8h, v4.8b, #8 // bottom*256 + shll v23.8h, v4.8b, #8 + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v1.8h, v7.8h + mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v23.8h, v6.8h, v18.8h + uhadd v20.8h, v20.8h, v22.8h + uhadd v21.8h, v21.8h, v23.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v6.8b}, [x8] // top + ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #4 + mov x7, #-4 + dup v5.16b, v6.b[7] // right + usubl v6.8h, v6.8b, v4.8b // top-bottom + uxtl v7.8h, v7.8b // weights_hor +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + usubl v2.8h, v2.8b, v5.8b + usubl v3.8h, v3.8b, v5.8b + shll v24.8h, v4.8b, #8 // bottom*256 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + mla v20.8h, v3.8h, v7.8h // right*256 + 
(left-right)*weights_hor + mla v21.8h, v2.8h, v7.8h // (left flipped) + mla v22.8h, v1.8h, v7.8h + mla v23.8h, v0.8h, v7.8h + mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v25.8h, v6.8h, v17.8h + mla v26.8h, v6.8h, v18.8h + mla v27.8h, v6.8h, v19.8h + uhadd v20.8h, v20.8h, v24.8h + uhadd v21.8h, v21.8h, v25.8h + uhadd v22.8h, v22.8h, v26.8h + uhadd v23.8h, v23.8h, v27.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn v23.8b, v23.8h, #8 + st1 {v20.8b}, [x0], x1 + st1 {v21.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8b}, [x0], x1 + st1 {v23.8b}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + add x12, x2, w3, uxtw + sub x2, x2, #2 + mov x7, #-2 + ld1r {v5.16b}, [x12] // right + sub x1, x1, w3, uxtw + mov w9, w3 + +1: + ld2r {v0.8b, v1.8b}, [x2], x7 // left + ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b +2: + ld1 {v7.16b}, [x10], #16 // weights_hor + ld1 {v3.16b}, [x8], #16 // top + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + usubl v2.8h, v3.8b, v4.8b // top-bottom + usubl2 v3.8h, v3.16b, v4.16b + mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v1.8h, v7.8h // (left flipped) + mla v22.8h, v0.8h, v6.8h + mla v23.8h, v0.8h, v7.8h + shll v24.8h, v4.8b, #8 // bottom*256 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v25.8h, v3.8h, v16.8h + mla v26.8h, v2.8h, v17.8h + mla v27.8h, v3.8h, v17.8h + uhadd v20.8h, v20.8h, v24.8h + uhadd v21.8h, v21.8h, v25.8h + uhadd v22.8h, v22.8h, v26.8h + uhadd v23.8h, v23.8h, v27.8h + rshrn v20.8b, v20.8h, #8 + rshrn2 v20.16b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn2 v22.16b, v23.8h, #8 + subs w3, w3, #16 + st1 {v20.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x8, w9, uxtw + sub x10, x10, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_tbl): + .hword L(ipred_smooth_tbl) - 640b + .hword L(ipred_smooth_tbl) - 320b + .hword L(ipred_smooth_tbl) - 160b + .hword L(ipred_smooth_tbl) - 80b + .hword L(ipred_smooth_tbl) - 40b +endfunc + +// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_8bpc_neon, export=1 + movrel x7, X(sm_weights) + add x7, x7, w4, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_v_tbl) + sub x8, x2, w4, uxtw + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.16b}, [x8] // bottom + add x2, x2, #1 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v6.2s}, [x2] // top + usubl v6.8h, v6.8b, v4.8b // top-bottom +4: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + shll v22.8h, v4.8b, #8 // bottom*256 + shll v23.8h, v4.8b, #8 + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v23.8h, v6.8h, v18.8h + rshrn v22.8b, v22.8h, #8 + rshrn v23.8b, v23.8h, #8 + st1 {v22.s}[0], [x0], x1 + st1 
{v22.s}[1], [x6], x1 + subs w4, w4, #4 + st1 {v23.s}[0], [x0], x1 + st1 {v23.s}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v6.8b}, [x2] // top + usubl v6.8h, v6.8b, v4.8b // top-bottom +8: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + shll v24.8h, v4.8b, #8 // bottom*256 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v25.8h, v6.8h, v17.8h + mla v26.8h, v6.8h, v18.8h + mla v27.8h, v6.8h, v19.8h + rshrn v24.8b, v24.8h, #8 + rshrn v25.8b, v25.8h, #8 + rshrn v26.8b, v26.8h, #8 + rshrn v27.8b, v27.8h, #8 + st1 {v24.8b}, [x0], x1 + st1 {v25.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v26.8b}, [x0], x1 + st1 {v27.8b}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + // Set up pointers for four rows in parallel; x0, x6, x5, x8 + add x5, x0, x1 + add x8, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw + mov w9, w3 + +1: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b +2: + ld1 {v3.16b}, [x2], #16 // top + shll v20.8h, v4.8b, #8 // bottom*256 + shll v21.8h, v4.8b, #8 + shll v22.8h, v4.8b, #8 + shll v23.8h, v4.8b, #8 + shll v24.8h, v4.8b, #8 + shll v25.8h, v4.8b, #8 + shll v26.8h, v4.8b, #8 + shll v27.8h, v4.8b, #8 + usubl v2.8h, v3.8b, v4.8b // top-bottom + usubl2 v3.8h, v3.16b, v4.16b + mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver + mla v21.8h, v3.8h, v16.8h + mla v22.8h, v2.8h, v17.8h + mla v23.8h, v3.8h, v17.8h + mla v24.8h, v2.8h, v18.8h + mla v25.8h, v3.8h, v18.8h + mla v26.8h, v2.8h, v19.8h + mla v27.8h, v3.8h, v19.8h + rshrn v20.8b, v20.8h, #8 + rshrn2 v20.16b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn2 v22.16b, v23.8h, #8 + rshrn v24.8b, v24.8h, #8 + rshrn2 v24.16b, v25.8h, #8 + rshrn v26.8b, v26.8h, #8 + rshrn2 v26.16b, v27.8h, #8 + subs w3, w3, #16 + st1 {v20.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + st1 {v24.16b}, [x5], #16 + st1 {v26.16b}, [x8], #16 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x2, x2, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x8, x8, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_v_tbl): + .hword L(ipred_smooth_v_tbl) - 640b + .hword L(ipred_smooth_v_tbl) - 320b + .hword L(ipred_smooth_v_tbl) - 160b + .hword L(ipred_smooth_v_tbl) - 80b + .hword L(ipred_smooth_v_tbl) - 40b +endfunc + +// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_8bpc_neon, export=1 + movrel x8, X(sm_weights) + add x8, x8, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_h_tbl) + add x12, x2, w3, uxtw + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v5.16b}, [x12] // right + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v7.2s}, [x8] // weights_hor + sub x2, x2, #4 + mov x7, #-4 + uxtl v7.8h, v7.8b // weights_hor +4: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + zip1 v1.2s, v1.2s, v0.2s // left, flipped + zip1 v0.2s, v3.2s, v2.2s + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + mla v20.8h, v0.8h, v7.8h // right*256 + 
(left-right)*weights_hor + mla v21.8h, v1.8h, v7.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v7.8b}, [x8] // weights_hor + sub x2, x2, #4 + mov x7, #-4 + uxtl v7.8h, v7.8b // weights_hor +8: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + usubl v3.8h, v3.8b, v5.8b // left-right + usubl v2.8h, v2.8b, v5.8b + usubl v1.8h, v1.8b, v5.8b + usubl v0.8h, v0.8b, v5.8b + mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v2.8h, v7.8h // (left flipped) + mla v22.8h, v1.8h, v7.8h + mla v23.8h, v0.8h, v7.8h + rshrn v20.8b, v20.8h, #8 + rshrn v21.8b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn v23.8b, v23.8h, #8 + st1 {v20.8b}, [x0], x1 + st1 {v21.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8b}, [x0], x1 + st1 {v23.8b}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + sub x2, x2, #4 + mov x7, #-4 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw + mov w9, w3 + +1: + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left + usubl v0.8h, v0.8b, v5.8b // left-right + usubl v1.8h, v1.8b, v5.8b + usubl v2.8h, v2.8b, v5.8b + usubl v3.8h, v3.8b, v5.8b +2: + ld1 {v7.16b}, [x8], #16 // weights_hor + shll v20.8h, v5.8b, #8 // right*256 + shll v21.8h, v5.8b, #8 + shll v22.8h, v5.8b, #8 + shll v23.8h, v5.8b, #8 + shll v24.8h, v5.8b, #8 + shll v25.8h, v5.8b, #8 + shll v26.8h, v5.8b, #8 + shll v27.8h, v5.8b, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor + mla v21.8h, v3.8h, v7.8h // (left flipped) + mla v22.8h, v2.8h, v6.8h + mla v23.8h, v2.8h, v7.8h + mla v24.8h, v1.8h, v6.8h + mla v25.8h, v1.8h, v7.8h + mla v26.8h, v0.8h, v6.8h + mla v27.8h, v0.8h, v7.8h + rshrn v20.8b, v20.8h, #8 + rshrn2 v20.16b, v21.8h, #8 + rshrn v22.8b, v22.8h, #8 + rshrn2 v22.16b, v23.8h, #8 + rshrn v24.8b, v24.8h, #8 + rshrn2 v24.16b, v25.8h, #8 + rshrn v26.8b, v26.8h, #8 + rshrn2 v26.16b, v27.8h, #8 + subs w3, w3, #16 + st1 {v20.16b}, [x0], #16 + st1 {v22.16b}, [x6], #16 + st1 {v24.16b}, [x5], #16 + st1 {v26.16b}, [x10], #16 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_h_tbl): + .hword L(ipred_smooth_h_tbl) - 640b + .hword L(ipred_smooth_h_tbl) - 320b + .hword L(ipred_smooth_h_tbl) - 160b + .hword L(ipred_smooth_h_tbl) - 80b + .hword L(ipred_smooth_h_tbl) - 40b +endfunc + +const padding_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +padding_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz, +// const pixel *const in, const int end); +function ipred_z1_upsample_edge_8bpc_neon, export=1 + movrel x4, padding_mask + ld1 {v0.16b}, [x2] // in[] + add x5, x2, 
w3, uxtw // in[end] + sub x4, x4, w3, uxtw + + ld1r {v1.16b}, [x5] // padding + ld1 {v3.16b}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v1.16b, v3.16b // padded in[] + + ext v4.16b, v0.16b, v1.16b, #1 + ext v5.16b, v0.16b, v1.16b, #2 + ext v6.16b, v0.16b, v1.16b, #3 + + uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2] + uaddl2 v17.8h, v4.16b, v5.16b + uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3] + uaddl2 v19.8h, v0.16b, v6.16b + mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) + mul v17.8h, v17.8h, v31.8h + sub v16.8h, v16.8h, v18.8h + sub v17.8h, v17.8h, v19.8h + + sqrshrun v16.8b, v16.8h, #4 + sqrshrun2 v16.16b, v17.8h, #4 + + zip1 v0.16b, v4.16b, v16.16b + zip2 v1.16b, v4.16b, v16.16b + + st1 {v0.16b, v1.16b}, [x0] + + ret +endfunc + +const edge_filter + .byte 0, 4, 8, 0 + .byte 0, 5, 6, 0 +// Leaving out the coeffs for strength=3 +// .byte 2, 4, 4, 0 +endconst + +// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz, +// const pixel *const in, const int end, +// const int strength); +function ipred_z1_filter_edge_8bpc_neon, export=1 + cmp w4, #3 + b.eq L(fivetap) // if (strength == 3) goto fivetap + + movrel x5, edge_filter, -3 + add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1 + + ld1 {v31.h}[0], [x5] // kernel[1-2] + + ld1 {v0.16b}, [x2], #16 + + dup v30.16b, v31.b[0] + dup v31.16b, v31.b[1] +1: + // in[end], is the last valid pixel. We produce 16 pixels out by + // using 18 pixels in - the last pixel used is [17] of the ones + // read/buffered. + cmp w3, #17 + ld1 {v1.16b}, [x2], #16 + b.lt 2f + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + umull v4.8h, v0.8b, v30.8b + umlal v4.8h, v2.8b, v31.8b + umlal v4.8h, v3.8b, v30.8b + umull2 v5.8h, v0.16b, v30.16b + umlal2 v5.8h, v2.16b, v31.16b + umlal2 v5.8h, v3.16b, v30.16b + subs w1, w1, #16 + mov v0.16b, v1.16b + rshrn v4.8b, v4.8h, #4 + rshrn2 v4.16b, v5.8h, #4 + sub w3, w3, #16 + st1 {v4.16b}, [x0], #16 + b.gt 1b + ret +2: + // Right padding + + // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead) + movrel x5, padding_mask + sub w6, w3, #32 + sub x5, x5, w3, uxtw + add x6, x2, w6, sxtw + + ld1 {v2.16b}, [x5] // padding_mask + + ld1r {v1.16b}, [x6] + bit v0.16b, v1.16b, v2.16b // Pad v0-v1 + + // Filter one block + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + umull v4.8h, v0.8b, v30.8b + umlal v4.8h, v2.8b, v31.8b + umlal v4.8h, v3.8b, v30.8b + umull2 v5.8h, v0.16b, v30.16b + umlal2 v5.8h, v2.16b, v31.16b + umlal2 v5.8h, v3.16b, v30.16b + subs w1, w1, #16 + rshrn v4.8b, v4.8h, #4 + rshrn2 v4.16b, v5.8h, #4 + st1 {v4.16b}, [x0], #16 + b.le 9f +5: + // After one block, any remaining output would only be filtering + // padding - thus just store the padding. + subs w1, w1, #16 + st1 {v1.16b}, [x0], #16 + b.gt 5b +9: + ret + +L(fivetap): + sub x2, x2, #1 // topleft -= 1 + movi v29.16b, #2 + ld1 {v0.16b}, [x2], #16 + movi v30.16b, #4 + movi v31.16b, #4 + ins v0.b[0], v0.b[1] +1: + // in[end+1], is the last valid pixel. We produce 16 pixels out by + // using 20 pixels in - the last pixel used is [19] of the ones + // read/buffered. 
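+        // Reference sketch of this strength=3 path (taps {2, 4, 4, 4, 2} per
+        // the movi constants above; the exact index alignment is approximate):
+        // a symmetric 5-tap smoothing with rounding, roughly
+        //   out[i] = (2*in[i-2] + 4*in[i-1] + 4*in[i] +
+        //             4*in[i+1] + 2*in[i+2] + 8) >> 4
+        // with out-of-range indices clamped to the valid range.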
+ cmp w3, #18 + ld1 {v1.16b}, [x2], #16 + b.lt 2f // if (end + 1 < 19) + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v0.16b, v1.16b, #3 + ext v5.16b, v0.16b, v1.16b, #4 + umull v6.8h, v0.8b, v29.8b + umlal v6.8h, v2.8b, v30.8b + umlal v6.8h, v3.8b, v31.8b + umlal v6.8h, v4.8b, v30.8b + umlal v6.8h, v5.8b, v29.8b + umull2 v7.8h, v0.16b, v29.16b + umlal2 v7.8h, v2.16b, v30.16b + umlal2 v7.8h, v3.16b, v31.16b + umlal2 v7.8h, v4.16b, v30.16b + umlal2 v7.8h, v5.16b, v29.16b + subs w1, w1, #16 + mov v0.16b, v1.16b + rshrn v6.8b, v6.8h, #4 + rshrn2 v6.16b, v7.8h, #4 + sub w3, w3, #16 + st1 {v6.16b}, [x0], #16 + b.gt 1b + ret +2: + // Right padding + + // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead) + movrel x5, padding_mask, -1 + sub w6, w3, #31 + sub x5, x5, w3, uxtw + add x6, x2, w6, sxtw + + ld1 {v2.16b, v3.16b}, [x5] // padding_mask + + ld1r {v28.16b}, [x6] + bit v0.16b, v28.16b, v2.16b // Pad v0-v1 + bit v1.16b, v28.16b, v3.16b +4: + // Filter one block + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v0.16b, v1.16b, #3 + ext v5.16b, v0.16b, v1.16b, #4 + umull v6.8h, v0.8b, v29.8b + umlal v6.8h, v2.8b, v30.8b + umlal v6.8h, v3.8b, v31.8b + umlal v6.8h, v4.8b, v30.8b + umlal v6.8h, v5.8b, v29.8b + umull2 v7.8h, v0.16b, v29.16b + umlal2 v7.8h, v2.16b, v30.16b + umlal2 v7.8h, v3.16b, v31.16b + umlal2 v7.8h, v4.16b, v30.16b + umlal2 v7.8h, v5.16b, v29.16b + subs w1, w1, #16 + mov v0.16b, v1.16b + mov v1.16b, v28.16b + rshrn v6.8b, v6.8h, #4 + rshrn2 v6.16b, v7.8h, #4 + sub w3, w3, #16 + st1 {v6.16b}, [x0], #16 + b.le 9f + // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to + // filter properly once more - aka (w3 >= 0). + cmp w3, #0 + b.ge 4b +5: + // When w3 <= 0, all remaining pixels in v0-v1 are equal to the + // last valid pixel - thus just output that without filtering. 
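+        // (The 5 filter taps sum to 16, so filtering a run of identical
+        //  padding pixels with (sum + 8) >> 4 reproduces that pixel exactly;
+        //  storing the padding value directly is equivalent.)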
+ subs w1, w1, #16 + st1 {v1.16b}, [x0], #16 + b.gt 5b +9: + ret +endfunc + +// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_8bpc_neon, export=1 + dup v0.16b, w1 +1: + subs w2, w2, #16 + st1 {v0.16b}, [x0], #16 + b.gt 1b + ret +endfunc + +// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const int width, const int height, +// const int dx, const int max_base_x); +function ipred_z1_fill1_8bpc_neon, export=1 + clz w9, w3 + adr x8, L(ipred_z1_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw // top[max_base_x] + sub x8, x8, w9, uxtw + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + br x8 +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + ldr d0, [x2, w8, uxtw] // top[base] + ldr d2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + ext v1.8b, v0.8b, v0.8b, #1 // top[base+1] + ext v3.8b, v2.8b, v2.8b, #1 + usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] + usubl v7.8h, v3.8b, v2.8b + ushll v16.8h, v0.8b, #6 // top[base]*64 + ushll v17.8h, v2.8b, #6 + mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac + mla v17.4h, v7.4h, v5.4h + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.s}[0], [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.s}[0], [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v31.s}[0], [x0], x1 + b.gt 49b + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.8b, w9 // frac + dup v5.8b, w11 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8b, w9 // 64 - frac + dup v7.8b, w11 + ext v1.16b, v0.16b, v0.16b, #1 // top[base+1] + ext v3.16b, v2.16b, v2.16b, #1 + umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac) + umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac + umull v17.8h, v2.8b, v7.8b + umlal v17.8h, v3.8b, v5.8b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8b}, [x0], x1 + b.gt 89b + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + mov w12, w3 + + add x13, x0, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 169f + add x8, x2, w8, uxtw + add x10, x2, w10, uxtw + dup v4.16b, w9 // frac + dup v5.16b, w11 + ld1 {v0.16b, v1.16b}, [x8], #32 // top[base] + ld1 {v2.16b, v3.16b}, [x10], #32 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.16b, w9 // 64 - frac + dup v7.16b, w11 + add w7, w7, w5 // xpos += dx +2: + ext v16.16b, v0.16b, v1.16b, #1 // top[base+1] + ext v17.16b, v2.16b, v3.16b, #1 + subs w3, w3, #16 + umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac) + umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac + umull2 v19.8h, v0.16b, v6.16b + umlal2 v19.8h, v16.16b, v4.16b + umull v20.8h, v2.8b, v7.8b + umlal v20.8h, v17.8b, v5.8b + umull2 v21.8h, v2.16b, 
v7.16b + umlal2 v21.8h, v17.16b, v5.16b + rshrn v16.8b, v18.8h, #6 + rshrn2 v16.16b, v19.8h, #6 + rshrn v17.8b, v20.8h, #6 + rshrn2 v17.16b, v21.8h, #6 + st1 {v16.16b}, [x0], #16 + st1 {v17.16b}, [x13], #16 + b.le 3f + mov v0.16b, v1.16b + ld1 {v1.16b}, [x8], #16 // top[base] + mov v2.16b, v3.16b + ld1 {v3.16b}, [x10], #16 + b 2b + +3: + subs w4, w4, #2 + b.le 9f + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 1b +9: + ret + +169: + st1 {v31.16b}, [x0], #16 + subs w3, w3, #16 + st1 {v31.16b}, [x13], #16 + b.gt 169b + subs w4, w4, #2 + b.le 9b + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 169b + +L(ipred_z1_fill1_tbl): + .hword L(ipred_z1_fill1_tbl) - 640b + .hword L(ipred_z1_fill1_tbl) - 320b + .hword L(ipred_z1_fill1_tbl) - 160b + .hword L(ipred_z1_fill1_tbl) - 80b + .hword L(ipred_z1_fill1_tbl) - 40b +endfunc + +function ipred_z1_fill2_8bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // top[max_base_x] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + b.eq 8f + +4: // w == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + ldr d0, [x2, w8, uxtw] // top[base] + ldr d2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8b, v0.8b, v0.8b // top[base+1] + uzp1 v0.8b, v0.8b, v0.8b // top[base] + uzp2 v3.8b, v2.8b, v2.8b + uzp1 v2.8b, v2.8b, v2.8b + usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] + usubl v7.8h, v3.8b, v2.8b + ushll v16.8h, v0.8b, #6 // top[base]*64 + ushll v17.8h, v2.8b, #6 + mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac + mla v17.4h, v7.4h, v5.4h + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.s}[0], [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.s}[0], [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v31.s}[0], [x0], x1 + b.gt 49b + ret + +8: // w == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.8b, w9 // frac + dup v5.8b, w11 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8b, w9 // 64 - frac + dup v7.8b, w11 + uzp2 v1.16b, v0.16b, v0.16b // top[base+1] + uzp1 v0.16b, v0.16b, v0.16b // top[base] + uzp2 v3.16b, v2.16b, v2.16b + uzp1 v2.16b, v2.16b, v2.16b + umull v16.8h, v1.8b, v4.8b // top[base+1]*frac + umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac) + umull v17.8h, v3.8b, v5.8b + umlal v17.8h, v2.8b, v7.8b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8b}, [x0], x1 + b.gt 89b + ret +endfunc + +// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, +// const int n); +function ipred_reverse_8bpc_neon, export=1 + sub x1, x1, #16 + add x3, x0, #8 + mov x4, #16 +1: + ld1 {v0.16b}, [x1] + subs w2, w2, #16 + rev64 v0.16b, v0.16b + sub x1, x1, #16 + st1 {v0.d}[1], [x0], x4 + st1 {v0.d}[0], [x3], x4 + b.gt 1b + ret +endfunc + +const increments + .short 0, 1, 2, 3, 4, 5, 6, 7 +endconst + +// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const left, +// const int width, const int height, +// const int dy, const int max_base_y); +function ipred_z3_fill1_8bpc_neon, 
export=1 + cmp w6, #64 + clz w9, w3 + adr x8, L(ipred_z3_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw // left[max_base_y] + sub x8, x8, w9, uxtw + movrel x11, increments + ld1r {v31.16b}, [x10] // padding + ld1 {v30.8h}, [x11] // increments + mov w7, w5 + b.gt L(ipred_z3_fill1_large_h16) + br x8 + +40: + AARCH64_VALID_JUMP_TARGET + dup v29.4h, w5 // dy + + mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 + ld1 {v0.16b, v1.16b}, [x2] // left[] + add v30.4h, v29.4h, v30.4h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + mov v4.8b, v31.8b + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + + tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] + + trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 + trn1 v24.2s, v24.2s, v24.2s // frac + trn1 v25.2s, v25.2s, v25.2s // 64 - frac +1: + mov v5.8b, v31.8b + tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] + + trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + rshrn v16.8b, v16.8h, #6 + st1 {v16.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + + ext v4.8b, v5.8b, v5.8b, #4 + uqadd v27.8b, v27.8b, v21.8b // base += 2 + b 1b + +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET + dup v29.8h, w5 // dy + + mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 + ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] + add v30.8h, v29.8h, v30.8h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + mov v4.8b, v31.8b + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + + tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] +1: + mov v5.8b, v31.8b + mov v6.8b, v31.8b + tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] + tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull v17.8h, v5.8b, v25.8b + umlal v17.8h, v6.8b, v24.8b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.le 9f + + mov v4.8b, v6.8b + uqadd v27.8b, v27.8b, v21.8b // base += 2 + uqadd v28.8b, v28.8b, v21.8b // base += 2 + b 1b + +9: + ret + +160: + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w5 // dy + + shl v29.8h, v28.8h, #3 // 8*dy + mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // This is only executed if we've checked that max_base_y <= 64. 
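+        // Illustrative reference for what each column of fill1 computes
+        // (descriptive names, not identifiers from this file):
+        //   ypos = (x + 1) * dy;  base = ypos >> 6;  frac = ypos & 0x3e;
+        //   dst[y*stride + x] = (left[base + y] * (64 - frac) +
+        //                        left[base + y + 1] * frac + 32) >> 6;
+        // with reads past max_base_y replaced by the padding value.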
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] + add v28.8h, v28.8h, v30.8h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + add v29.8h, v28.8h, v29.8h // ypos + 8*dy + + xtn v24.8b, v28.8h // (uint8_t)ypos + xtn2 v24.16b, v29.8h + uqshrn v26.8b, v28.8h, #6 // base + uqshrn2 v26.16b, v29.8h, #6 + and v24.16b, v24.16b, v23.16b // frac + + mov v4.16b, v31.16b + uqadd v27.16b, v26.16b, v20.16b // base + 1 + uqadd v28.16b, v26.16b, v21.16b // base + 2 + sub v25.16b, v22.16b, v24.16b // 64 - frac + + tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] +1: + mov v5.16b, v31.16b + mov v6.16b, v31.16b + tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] + tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull2 v17.8h, v4.16b, v25.16b + umlal2 v17.8h, v5.16b, v24.16b + umull v18.8h, v5.8b, v25.8b + umlal v18.8h, v6.8b, v24.8b + umull2 v19.8h, v5.16b, v25.16b + umlal2 v19.8h, v6.16b, v24.16b + rshrn v16.8b, v16.8h, #6 + rshrn2 v16.16b, v17.8h, #6 + rshrn v17.8b, v18.8h, #6 + rshrn2 v17.16b, v19.8h, #6 + st1 {v16.16b}, [x0], x1 + subs w4, w4, #2 + st1 {v17.16b}, [x0], x1 + b.le 9f + + mov v4.16b, v6.16b + uqadd v27.16b, v27.16b, v21.16b // base += 2 + uqadd v28.16b, v28.16b, v21.16b // base += 2 + b 1b + +9: + ret +320: +640: + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w5 // dy + mov w12, w3 + + add x13, x0, x1 + + shl v29.8h, v28.8h, #3 // 8*dy + mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + lsl x1, x1, #1 + sub x1, x1, w3, uxtw + add v30.8h, v28.8h, v30.8h // ypos + + // This is only executed if we've checked that max_base_y <= 64. + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + +1: + mov v26.16b, v30.16b // reset ypos + +2: + add v27.8h, v26.8h, v29.8h // ypos + 8*dy + uqshrn v16.8b, v26.8h, #6 // base + uqshrn2 v16.16b, v27.8h, #6 + xtn v24.8b, v26.8h // (uint8_t)ypos + xtn2 v24.16b, v27.8h + umov w14, v16.b[0] + and v24.16b, v24.16b, v23.16b // frac + + uqadd v17.16b, v16.16b, v20.16b // base + 1 + cmp w14, w6 // base >= max_base_y + uqadd v18.16b, v16.16b, v21.16b // base + 2 + sub v25.16b, v22.16b, v24.16b // 64 - frac + + b.ge 4f + + mov v4.16b, v31.16b + mov v5.16b, v31.16b + mov v6.16b, v31.16b + tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] + tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] + tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] + + subs w3, w3, #16 + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull2 v17.8h, v4.16b, v25.16b + umlal2 v17.8h, v5.16b, v24.16b + umull v18.8h, v5.8b, v25.8b + umlal v18.8h, v6.8b, v24.8b + umull2 v19.8h, v5.16b, v25.16b + umlal2 v19.8h, v6.16b, v24.16b + rshrn v16.8b, v16.8h, #6 + rshrn2 v16.16b, v17.8h, #6 + rshrn v17.8b, v18.8h, #6 + rshrn2 v17.16b, v19.8h, #6 + st1 {v16.16b}, [x0], #16 + st1 {v17.16b}, [x13], #16 + b.le 3f + add v26.8h, v27.8h, v29.8h // ypos += 16*dy + b 2b + +3: + subs w4, w4, #2 + b.le 9f + movi v16.8h, #128 + add x0, x0, x1 + add x13, x13, x1 + add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 + mov w3, w12 + b 1b + +4: + subs w3, w3, #16 + st1 {v31.16b}, [x0], #16 + st1 {v31.16b}, [x13], #16 + b.gt 4b + b 3b + +9: + ret + +L(ipred_z3_fill1_large_h16): + // Fallback case for max_base_y > 64; similar to the 
z1 + // implementation. This does the filtering vertically, filling out + // a 2x pixel column at a time. + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + + mov w12, w4 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // ypos += dy + cmp w8, w6 // base >= max_base_y + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw + add x10, x2, w10, uxtw + dup v4.16b, w9 // frac + dup v5.16b, w11 + ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] + ld1 {v2.16b, v3.16b}, [x10], #32 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.16b, w9 // 64 - frac + dup v7.16b, w11 + add w7, w7, w5 // ypos += dy +2: + ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] + ext v17.16b, v2.16b, v3.16b, #1 + subs w4, w4, #16 + umull v18.8h, v16.8b, v4.8b // left[base+1]*frac + umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) + umull2 v19.8h, v16.16b, v4.16b + umlal2 v19.8h, v0.16b, v6.16b + umull v20.8h, v17.8b, v5.8b + umlal v20.8h, v2.8b, v7.8b + umull2 v21.8h, v17.16b, v5.16b + umlal2 v21.8h, v2.16b, v7.16b + rshrn v16.8b, v18.8h, #6 + rshrn2 v16.16b, v19.8h, #6 + rshrn v17.8b, v20.8h, #6 + rshrn2 v17.16b, v21.8h, #6 + zip1 v18.16b, v16.16b, v17.16b + zip2 v19.16b, v16.16b, v17.16b + st1 {v18.h}[0], [x0], x1 + st1 {v18.h}[1], [x13], x1 + st1 {v18.h}[2], [x0], x1 + st1 {v18.h}[3], [x13], x1 + st1 {v18.h}[4], [x0], x1 + st1 {v18.h}[5], [x13], x1 + st1 {v18.h}[6], [x0], x1 + st1 {v18.h}[7], [x13], x1 + st1 {v19.h}[0], [x0], x1 + st1 {v19.h}[1], [x13], x1 + st1 {v19.h}[2], [x0], x1 + st1 {v19.h}[3], [x13], x1 + st1 {v19.h}[4], [x0], x1 + st1 {v19.h}[5], [x13], x1 + st1 {v19.h}[6], [x0], x1 + st1 {v19.h}[7], [x13], x1 + b.le 3f + mov v0.16b, v1.16b + ld1 {v1.16b}, [x8], #16 // left[base] + mov v2.16b, v3.16b + ld1 {v3.16b}, [x10], #16 + b 2b + +3: + subs w3, w3, #2 + b.le 9f + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #2 + add x13, x13, #2 + mov w4, w12 + b 1b +9: + ret + +L(ipred_z3_fill1_tbl): + .hword L(ipred_z3_fill1_tbl) - 640b + .hword L(ipred_z3_fill1_tbl) - 320b + .hword L(ipred_z3_fill1_tbl) - 160b + .hword L(ipred_z3_fill1_tbl) - 80b + .hword L(ipred_z3_fill1_tbl) - 40b +endfunc + +function ipred_z3_fill_padding_neon, export=0 + cmp w3, #16 + adr x8, L(ipred_z3_fill_padding_tbl) + b.gt L(ipred_z3_fill_padding_wide) + // w3 = remaining width, w4 = constant height + mov w12, w4 + +1: + // Fill a WxH rectangle with padding. W can be any number; + // this fills the exact width by filling in the largest + // power of two in the remaining width, and repeating. 
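+        // Equivalent idea as a C sketch (illustrative only; clz() and
+        // fill_rect() are descriptive placeholders, not dav1d helpers):
+        //   while (w > 0) {
+        //       const int step = 1 << (31 - clz(w)); // largest power of two <= w
+        //       fill_rect(dst, stride, step, h, pad);
+        //       dst += step;
+        //       w   -= step;
+        //   }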
+ clz w9, w3 + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + sub x9, x8, w9, uxtw + br x9 + +2: + st1 {v31.h}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.h}[0], [x13], x1 + st1 {v31.h}[0], [x0], x1 + st1 {v31.h}[0], [x13], x1 + b.gt 2b + subs w3, w3, #2 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #2 + add x13, x13, #2 + mov w4, w12 + b 1b + +4: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.s}[0], [x13], x1 + st1 {v31.s}[0], [x0], x1 + st1 {v31.s}[0], [x13], x1 + b.gt 4b + subs w3, w3, #4 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b + +8: + st1 {v31.8b}, [x0], x1 + subs w4, w4, #4 + st1 {v31.8b}, [x13], x1 + st1 {v31.8b}, [x0], x1 + st1 {v31.8b}, [x13], x1 + b.gt 4b + subs w3, w3, #8 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #8 + add x13, x13, #8 + mov w4, w12 + b 1b + +16: +32: +64: + st1 {v31.16b}, [x0], x1 + subs w4, w4, #4 + st1 {v31.16b}, [x13], x1 + st1 {v31.16b}, [x0], x1 + st1 {v31.16b}, [x13], x1 + b.gt 4b + subs w3, w3, #16 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w4, w12 + b 1b + +9: + ret + +L(ipred_z3_fill_padding_tbl): + .hword L(ipred_z3_fill_padding_tbl) - 64b + .hword L(ipred_z3_fill_padding_tbl) - 32b + .hword L(ipred_z3_fill_padding_tbl) - 16b + .hword L(ipred_z3_fill_padding_tbl) - 8b + .hword L(ipred_z3_fill_padding_tbl) - 4b + .hword L(ipred_z3_fill_padding_tbl) - 2b + +L(ipred_z3_fill_padding_wide): + // Fill a WxH rectangle with padding, with W > 16. + lsr x1, x1, #1 + mov w12, w3 + sub x1, x1, w3, uxtw +1: + ands w5, w3, #15 + b.eq 2f + // If the width isn't aligned to 16, first do one 16 byte write + // and align the start pointer. + sub w3, w3, w5 + st1 {v31.16b}, [x0] + add x0, x0, w5, uxtw +2: + // Fill the rest of the line with aligned 16 byte writes. + subs w3, w3, #16 + st1 {v31.16b}, [x0], #16 + b.gt 2b + subs w4, w4, #1 + add x0, x0, x1 + b.le 9f + mov w3, w12 + b 1b +9: + ret +endfunc + +function ipred_z3_fill2_8bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // left[max_base_y] + movrel x11, increments + ld1r {v31.16b}, [x10] // padding + ld1 {v30.8h}, [x11] // increments + b.eq 80f + +40: // w == 4 + dup v29.4h, w5 // dy + + mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, + // so max_base_y <= 32. 
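+        // fill2 is the upsampled-left-edge variant: the interpolation matches
+        // fill1, but each row down advances two samples in left[]. Roughly:
+        //   ypos = (x + 1) * dy;  base = ypos >> 6;  frac = ypos & 0x3e;
+        //   dst[y*stride + x] = (left[base + 2*y] * (64 - frac) +
+        //                        left[base + 2*y + 1] * frac + 32) >> 6;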
+ ld1 {v0.16b, v1.16b}, [x2] // left[] + add v30.4h, v29.4h, v30.4h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + uqadd v29.8b, v27.8b, v21.8b // base + 3 + + trn1 v24.2s, v24.2s, v24.2s // frac + trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2 + trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3 + trn1 v25.2s, v25.2s, v25.2s // 64 - frac + + movi v21.16b, #4 +1: + mov v4.8b, v31.8b + mov v5.8b, v31.8b + tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] + tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + rshrn v16.8b, v16.8h, #6 + st1 {v16.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + + uqadd v26.8b, v26.8b, v21.8b // base += 4 + uqadd v27.8b, v27.8b, v21.8b // base += 4 + b 1b + +9: + ret + +80: // w == 8 + dup v29.8h, w5 // dy + + mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, + // so max_base_y <= 32. + ld1 {v0.16b, v1.16b}, [x2] // left[] + add v30.8h, v29.8h, v30.8h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + uqadd v29.8b, v27.8b, v21.8b // base + 3 + + trn1 v24.2d, v24.2d, v24.2d // frac + trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2 + trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3 + trn1 v25.2d, v25.2d, v25.2d // 64 - frac + + movi v21.16b, #4 +1: + mov v4.16b, v31.16b + mov v5.16b, v31.16b + tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2] + tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull2 v17.8h, v4.16b, v25.16b + umlal2 v17.8h, v5.16b, v24.16b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.le 9f + + uqadd v26.16b, v26.16b, v21.16b // base += 4 + uqadd v27.16b, v27.16b, v21.16b // base += 4 + b 1b + +9: + ret +endfunc + + +// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height); +function ipred_filter_8bpc_neon, export=1 + and w5, w5, #511 + movrel x6, X(filter_intra_taps) + lsl w5, w5, #6 + add x6, x6, w5, uxtw + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 + clz w9, w3 + adr x5, L(ipred_filter_tbl) + ld1 {v20.8b, v21.8b, v22.8b}, [x6] + sub w9, w9, #26 + ldrh w9, [x5, w9, uxtw #1] + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b + sub x5, x5, w9, uxtw + sxtl v18.8h, v18.8b + sxtl v19.8h, v19.8b + add x6, x0, x1 + lsl x1, x1, #1 + sxtl v20.8h, v20.8b + sxtl v21.8h, v21.8b + sxtl v22.8h, v22.8b + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ldur s0, [x2, #1] // top (0-3) + sub x2, x2, #2 + mov x7, #-2 + uxtl v0.8h, v0.8b // top (0-3) +4: + ld1 {v1.s}[0], [x2], x7 // left (0-1) + 
topleft (2) + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + uxtl v1.8h, v1.8b // left (0-1) + topleft (2) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + sqrshrun v2.8b, v2.8h, #4 + subs w4, w4, #2 + st1 {v2.s}[0], [x0], x1 + uxtl v0.8h, v2.8b + st1 {v2.s}[1], [x6], x1 + ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ldur d0, [x2, #1] // top (0-7) + sub x2, x2, #2 + mov x7, #-2 + uxtl v0.8h, v0.8b // top (0-7) +8: + ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + uxtl v1.8h, v1.8b // left (0-1) + topleft (2) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + sqrshrun v2.8b, v2.8h, #4 + uxtl v1.8h, v2.8b // first block, in 16 bit + mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) + sqrshrun v3.8b, v3.8h, #4 + subs w4, w4, #2 + st2 {v2.s, v3.s}[0], [x0], x1 + zip2 v0.2s, v2.2s, v3.2s + st2 {v2.s, v3.s}[1], [x6], x1 + uxtl v0.8h, v0.8b + b.gt 8b + ret +160: +320: + AARCH64_VALID_JUMP_TARGET + add x8, x2, #1 + sub x2, x2, #2 + mov x7, #-2 + sub x1, x1, w3, uxtw + mov w9, w3 + +1: + ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) + uxtl v0.8h, v0.8b // left (0-1) + topleft (2) +2: + ld1 {v2.16b}, [x8], #16 // top(0-15) + mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + uxtl v1.8h, v2.8b // top(0-7) + uxtl2 v2.8h, v2.16b // top(8-15) + mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + sqrshrun v3.8b, v3.8h, #4 + uxtl v0.8h, v3.8b // first block, in 16 bit + mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) + mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) + mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) + + mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + sqrshrun v4.8b, v4.8h, #4 + uxtl v0.8h, v4.8b // second block, in 16 bit + mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) + mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) + mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * 
filter(6) + + mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + sqrshrun v5.8b, v5.8h, #4 + uxtl v0.8h, v5.8b // third block, in 16 bit + mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) + mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) + mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + sqrshrun v6.8b, v6.8h, #4 + + st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 + st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 + b.le 8f + ins v0.h[2], v2.h[7] + ins v0.b[0], v6.b[7] + ins v0.b[2], v6.b[3] + b 2b +8: + subs w4, w4, #2 + b.le 9f + sub x8, x6, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_filter_tbl): + .hword L(ipred_filter_tbl) - 320b + .hword L(ipred_filter_tbl) - 160b + .hword L(ipred_filter_tbl) - 80b + .hword L(ipred_filter_tbl) - 40b +endfunc + +// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_8bpc_neon, export=1 + ld1 {v0.8h}, [x2] + clz w9, w4 + adr x6, L(pal_pred_tbl) + sub w9, w9, #25 + ldrh w9, [x6, w9, uxtw #1] + xtn v0.8b, v0.8h + sub x6, x6, w9, uxtw + add x2, x0, x1 + lsl x1, x1, #1 + br x6 +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b}, [x3], #16 + subs w5, w5, #4 + tbl v1.16b, {v0.16b}, v1.16b + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[1], [x2], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[3], [x2], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b, v2.16b}, [x3], #32 + subs w5, w5, #4 + tbl v1.16b, {v0.16b}, v1.16b + st1 {v1.d}[0], [x0], x1 + tbl v2.16b, {v0.16b}, v2.16b + st1 {v1.d}[1], [x2], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x2], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 + subs w5, w5, #4 + tbl v1.16b, {v0.16b}, v1.16b + tbl v2.16b, {v0.16b}, v2.16b + st1 {v1.16b}, [x0], x1 + tbl v3.16b, {v0.16b}, v3.16b + st1 {v2.16b}, [x2], x1 + tbl v4.16b, {v0.16b}, v4.16b + st1 {v3.16b}, [x0], x1 + st1 {v4.16b}, [x2], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 + subs w5, w5, #4 + tbl v16.16b, {v0.16b}, v16.16b + tbl v17.16b, {v0.16b}, v17.16b + tbl v18.16b, {v0.16b}, v18.16b + tbl v19.16b, {v0.16b}, v19.16b + tbl v20.16b, {v0.16b}, v20.16b + st1 {v16.16b, v17.16b}, [x0], x1 + tbl v21.16b, {v0.16b}, v21.16b + st1 {v18.16b, v19.16b}, [x2], x1 + tbl v22.16b, {v0.16b}, v22.16b + st1 {v20.16b, v21.16b}, [x0], x1 + tbl v23.16b, {v0.16b}, v23.16b + st1 {v22.16b, v23.16b}, [x2], x1 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 + subs w5, w5, #2 + tbl v16.16b, {v0.16b}, v16.16b + tbl v17.16b, {v0.16b}, v17.16b + tbl v18.16b, {v0.16b}, v18.16b + tbl v19.16b, {v0.16b}, v19.16b + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 + tbl v20.16b, {v0.16b}, v20.16b + tbl v21.16b, {v0.16b}, v21.16b + tbl v22.16b, {v0.16b}, v22.16b + tbl v23.16b, {v0.16b}, v23.16b + st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 + b.gt 64b + ret + +L(pal_pred_tbl): + .hword L(pal_pred_tbl) - 64b + .hword L(pal_pred_tbl) - 32b + .hword L(pal_pred_tbl) - 16b + .hword L(pal_pred_tbl) - 8b + .hword L(pal_pred_tbl) - 4b +endfunc + +// void 
ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_128_8bpc_neon, export=1 + clz w9, w3 + adr x7, L(ipred_cfl_128_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + movi v0.8h, #128 // dc + dup v1.8h, w6 // alpha + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x7 +L(ipred_cfl_splat_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x5], #32 + mul v2.8h, v2.8h, v1.8h // diff = ac * alpha + mul v3.8h, v3.8h, v1.8h + cmlt v4.8h, v2.8h, #0 // sign + cmlt v5.8h, v3.8h, #0 + add v2.8h, v2.8h, v4.8h // diff + sign + add v3.8h, v3.8h, v5.8h + srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() + srshr v3.8h, v3.8h, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [x0], x1 + st1 {v2.s}[1], [x6], x1 + subs w4, w4, #4 + st1 {v3.s}[0], [x0], x1 + st1 {v3.s}[1], [x6], x1 + b.gt L(ipred_cfl_splat_w4) + ret +L(ipred_cfl_splat_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 + mul v2.8h, v2.8h, v1.8h // diff = ac * alpha + mul v3.8h, v3.8h, v1.8h + mul v4.8h, v4.8h, v1.8h + mul v5.8h, v5.8h, v1.8h + cmlt v16.8h, v2.8h, #0 // sign + cmlt v17.8h, v3.8h, #0 + cmlt v18.8h, v4.8h, #0 + cmlt v19.8h, v5.8h, #0 + add v2.8h, v2.8h, v16.8h // diff + sign + add v3.8h, v3.8h, v17.8h + add v4.8h, v4.8h, v18.8h + add v5.8h, v5.8h, v19.8h + srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() + srshr v3.8h, v3.8h, #6 + srshr v4.8h, v4.8h, #6 + srshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + add v4.8h, v4.8h, v0.8h + add v5.8h, v5.8h, v0.8h + sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x6], x1 + subs w4, w4, #4 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x6], x1 + b.gt L(ipred_cfl_splat_w8) + ret +L(ipred_cfl_splat_w16): + AARCH64_VALID_JUMP_TARGET + add x7, x5, w3, uxtw #1 + sub x1, x1, w3, uxtw + mov w9, w3 +1: + ld1 {v2.8h, v3.8h}, [x5], #32 + ld1 {v4.8h, v5.8h}, [x7], #32 + mul v2.8h, v2.8h, v1.8h // diff = ac * alpha + mul v3.8h, v3.8h, v1.8h + mul v4.8h, v4.8h, v1.8h + mul v5.8h, v5.8h, v1.8h + cmlt v16.8h, v2.8h, #0 // sign + cmlt v17.8h, v3.8h, #0 + cmlt v18.8h, v4.8h, #0 + cmlt v19.8h, v5.8h, #0 + add v2.8h, v2.8h, v16.8h // diff + sign + add v3.8h, v3.8h, v17.8h + add v4.8h, v4.8h, v18.8h + add v5.8h, v5.8h, v19.8h + srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() + srshr v3.8h, v3.8h, #6 + srshr v4.8h, v4.8h, #6 + srshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + add v4.8h, v4.8h, v0.8h + add v5.8h, v5.8h, v0.8h + sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + subs w3, w3, #16 + st1 {v2.8b, v3.8b}, [x0], #16 + st1 {v4.8b, v5.8b}, [x6], #16 + b.gt 1b + subs w4, w4, #2 + add x5, x5, w9, uxtw #1 + add x7, x7, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b.gt 1b + ret + +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) +endfunc + +// void 
ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_top_8bpc_neon, export=1 + clz w9, w3 + adr x7, L(ipred_cfl_top_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + dup v1.8h, w6 // alpha + add x2, x2, #1 + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x7 +4: + AARCH64_VALID_JUMP_TARGET + ld1r {v0.2s}, [x2] + uaddlv h0, v0.8b + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) +16: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) +32: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.16b, v3.16b}, [x2] + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v2.4h, v2.4h, v3.4h + urshr v2.4h, v2.4h, #5 + dup v0.8h, v2.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_top_tbl): + .hword L(ipred_cfl_top_tbl) - 32b + .hword L(ipred_cfl_top_tbl) - 16b + .hword L(ipred_cfl_top_tbl) - 8b + .hword L(ipred_cfl_top_tbl) - 4b +endfunc + +// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_left_8bpc_neon, export=1 + sub x2, x2, w4, uxtw + clz w9, w3 + clz w8, w4 + adr x10, L(ipred_cfl_splat_tbl) + adr x7, L(ipred_cfl_left_tbl) + sub w9, w9, #26 + sub w8, w8, #26 + ldrh w9, [x10, w9, uxtw #1] + ldrh w8, [x7, w8, uxtw #1] + dup v1.8h, w6 // alpha + sub x9, x10, w9, uxtw + sub x7, x7, w8, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x7 + +L(ipred_cfl_left_h4): + AARCH64_VALID_JUMP_TARGET + ld1r {v0.2s}, [x2] + uaddlv h0, v0.8b + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2] + uaddlv h0, v0.8b + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2] + uaddlv h0, v0.16b + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.16b, v3.16b}, [x2] + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add v2.4h, v2.4h, v3.4h + urshr v2.4h, v2.4h, #5 + dup v0.8h, v2.h[0] + br x9 + +L(ipred_cfl_left_tbl): + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) +endfunc + +// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_8bpc_neon, export=1 + sub x2, x2, w4, uxtw + add w8, w3, w4 // width + height + dup v1.8h, w6 // alpha + clz w9, w3 + clz w6, w4 + dup v16.8h, w8 // width + height + adr x7, L(ipred_cfl_tbl) + rbit w8, w8 // rbit(width + height) + sub w9, w9, #22 // 26 leading bits, minus table offset 4 + sub w6, w6, #26 + clz w8, w8 // ctz(width + height) + ldrh w9, [x7, w9, uxtw #1] + ldrh w6, [x7, w6, uxtw #1] + neg w8, w8 // -ctz(width + height) + sub x9, x7, w9, uxtw + sub x7, x7, w6, uxtw + ushr v16.8h, v16.8h, #1 // (width + height) >> 1 + dup v17.8h, w8 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + br x7 + +L(ipred_cfl_h4): + AARCH64_VALID_JUMP_TARGET + ld1 
{v0.s}[0], [x2], #4 + ins v0.s[1], wzr + add x2, x2, #1 + uaddlv h0, v0.8b + br x9 +L(ipred_cfl_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.s}[0], [x2] + ins v2.s[1], wzr + add v0.4h, v0.4h, v16.4h + uaddlv h2, v2.8b + cmp w4, #4 + add v0.4h, v0.4h, v2.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16 + mov w16, #(0x3334/2) + movk w16, #(0x5556/2), lsl #16 + add w17, w4, w4 // w17 = 2*h = 16 or 32 + lsr w16, w16, w17 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2], #8 + uaddlv h0, v0.8b + add x2, x2, #1 + br x9 +L(ipred_cfl_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h2, v2.8b + cmp w4, #8 + add v0.4h, v0.4h, v2.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x2], #16 + uaddlv h0, v0.16b + add x2, x2, #1 + br x9 +L(ipred_cfl_w16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h2, v2.16b + cmp w4, #16 + add v0.4h, v0.4h, v2.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 4/8/32 + cmp w4, #4 + mov w16, #(0x3334/2) + mov w17, #(0x5556/2) + csel w16, w16, w17, eq + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.16b, v3.16b}, [x2], #32 + uaddlv h2, v2.16b + uaddlv h3, v3.16b + add x2, x2, #1 + add v0.4h, v2.4h, v3.4h + br x9 +L(ipred_cfl_w32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.16b, v3.16b}, [x2] + add v0.4h, v0.4h, v16.4h + uaddlv h2, v2.16b + uaddlv h3, v3.16b + cmp w4, #32 + add v0.4h, v0.4h, v2.4h + add v0.4h, v0.4h, v3.4h + ushl v0.4h, v0.4h, v17.4h + b.eq 1f + // h = 8/16 + mov w16, #(0x5556/2) + movk w16, #(0x3334/2), lsl #16 + add w17, w4, w4 // w17 = 2*h = 16 or 32 + lsr w16, w16, w17 + dup v16.4h, w16 + sqdmulh v0.4h, v0.4h, v16.4h +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_tbl): + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) +endfunc + +// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_8bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_420_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_420_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v0.d}[1], [x1], x2 + ld1 {v1.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v1.16b + add v0.8h, v0.8h, v1.8h + shl v0.8h, 
v0.8h, #1 + subs w8, w8, #2 + st1 {v0.8h}, [x0], #16 + add v16.8h, v16.8h, v0.8h + b.gt 1b + trn2 v1.2d, v0.2d, v0.2d + trn2 v0.2d, v0.2d, v0.2d +L(ipred_cfl_ac_420_w4_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + b.gt 2b +3: + // Aggregate the sums + add v0.8h, v16.8h, v17.8h + uaddlv s0, v0.8h // sum + sub x0, x0, w6, uxtw #3 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h}, [x0] + subs w6, w6, #4 + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w8): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x10], x2 + ld1 {v2.16b}, [x1], x2 + uaddlp v0.8h, v0.16b + ld1 {v3.16b}, [x10], x2 + uaddlp v1.8h, v1.16b + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + add v0.8h, v0.8h, v1.8h + add v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v2.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + b.gt 1b + mov v0.16b, v1.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v0.d}[1], [x1], x2 + ld1 {v1.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v1.16b + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + dup v1.4h, v0.h[3] + dup v3.4h, v0.h[7] + trn2 v2.2d, v0.2d, v0.2d + subs w8, w8, #2 + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + add v16.4h, v16.4h, v0.4h + add v17.4h, v17.4h, v1.4h + add v18.4h, v18.4h, v2.4h + add v19.4h, v19.4h, v3.4h + b.gt 1b + trn1 v0.2d, v2.2d, v3.2d + trn1 v1.2d, v2.2d, v3.2d + +L(ipred_cfl_ac_420_w8_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + add v18.8h, v18.8h, v0.8h + add v19.8h, v19.8h, v1.8h + b.gt 2b +3: + +L(ipred_cfl_ac_420_w8_calc_subtract_dc): + // Aggregate the sums + add v0.8h, v16.8h, v17.8h + add v2.8h, v18.8h, v19.8h + uaddlp v0.4s, v0.8h + uaddlp v2.4s, v2.8h + add v0.4s, v0.4s, v2.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #4 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +L(ipred_cfl_ac_420_w8_subtract_dc): +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w6, w6, #4 + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + sub v2.8h, v2.8h, v4.8h + sub v3.8h, v3.8h, v4.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w16): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_420_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_420_w16_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, without padding + ld1 {v0.16b, v1.16b}, [x1], x2 + ld1 {v2.16b, v3.16b}, [x10], x2 + uaddlp v0.8h, v0.16b + ld1 {v4.16b, v5.16b}, [x1], x2 + uaddlp v1.8h, v1.16b + ld1 {v6.16b, v7.16b}, [x10], x2 + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + uaddlp v4.8h, v4.16b + uaddlp v5.8h, v5.16b + uaddlp v6.8h, v6.16b + uaddlp v7.8h, v7.16b + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + add v4.8h, v4.8h, v6.8h + add v5.8h, v5.8h, v7.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v1.8h, #1 + shl 
v2.8h, v4.8h, #1 + shl v3.8h, v5.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 4 + ldr d1, [x1, #16] + ld1 {v0.16b}, [x1], x2 + ldr d3, [x10, #16] + ld1 {v2.16b}, [x10], x2 + uaddlp v1.4h, v1.8b + ldr d5, [x1, #16] + uaddlp v0.8h, v0.16b + ld1 {v4.16b}, [x1], x2 + uaddlp v3.4h, v3.8b + ldr d7, [x10, #16] + uaddlp v2.8h, v2.16b + ld1 {v6.16b}, [x10], x2 + uaddlp v5.4h, v5.8b + uaddlp v4.8h, v4.16b + uaddlp v7.4h, v7.8b + uaddlp v6.8h, v6.16b + add v1.4h, v1.4h, v3.4h + add v0.8h, v0.8h, v2.8h + add v5.4h, v5.4h, v7.4h + add v4.8h, v4.8h, v6.8h + shl v1.4h, v1.4h, #1 + shl v0.8h, v0.8h, #1 + shl v3.4h, v5.4h, #1 + shl v2.8h, v4.8h, #1 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 8 + ld1 {v0.16b}, [x1], x2 + ld1 {v2.16b}, [x10], x2 + ld1 {v4.16b}, [x1], x2 + uaddlp v0.8h, v0.16b + ld1 {v6.16b}, [x10], x2 + uaddlp v2.8h, v2.16b + uaddlp v4.8h, v4.16b + uaddlp v6.8h, v6.16b + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v2.8h, v4.8h, #1 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 12 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + ld1 {v4.8b}, [x1], x2 + uaddlp v0.4h, v0.8b + ld1 {v6.8b}, [x10], x2 + uaddlp v2.4h, v2.8b + uaddlp v4.4h, v4.8b + uaddlp v6.4h, v6.8b + add v0.4h, v0.4h, v2.4h + add v4.4h, v4.4h, v6.4h + shl v0.4h, v0.4h, #1 + shl v2.4h, v4.4h, #1 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + +L(ipred_cfl_ac_420_w16_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 2b +3: + + // Double the height and reuse the w8 summing/subtracting + lsl w6, w6, #1 + b L(ipred_cfl_ac_420_w8_calc_subtract_dc) + +L(ipred_cfl_ac_420_tbl): + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) + .hword 0 + +L(ipred_cfl_ac_420_w16_tbl): + .hword 
L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) +endfunc + +// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_8bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_422_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_422_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input + ld1 {v0.8b}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v1.8b}, [x1], x2 + ld1 {v1.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v1.16b + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + subs w8, w8, #4 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x10], x2 + ld1 {v2.16b}, [x1], x2 + uaddlp v0.8h, v0.16b + ld1 {v3.16b}, [x10], x2 + uaddlp v1.8h, v1.16b + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + shl v2.8h, v2.8h, #2 + shl v3.8h, v3.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8b}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v2.8b}, [x1], x2 + ld1 {v2.d}[1], [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v2.8h, v2.16b + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v0.h[3] + dup v5.8h, v0.h[7] + dup v6.4h, v2.h[3] + dup v7.8h, v2.h[7] + trn2 v1.2d, v0.2d, v5.2d + trn1 v0.2d, v0.2d, v4.2d + trn2 v3.2d, v2.2d, v7.2d + trn1 v2.2d, v2.2d, v6.2d + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_422_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_422_w16_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, without padding + ld1 {v0.16b, v1.16b}, [x1], x2 + ld1 {v2.16b, v3.16b}, [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v1.8h, v1.16b + uaddlp v2.8h, v2.16b + uaddlp v3.8h, v3.16b + shl v0.8h, v0.8h, #2 + shl v1.8h, v1.8h, #2 + shl v2.8h, v2.8h, #2 + shl v3.8h, v3.8h, #2 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, 
v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 4 + ldr d1, [x1, #16] + ld1 {v0.16b}, [x1], x2 + ldr d3, [x10, #16] + ld1 {v2.16b}, [x10], x2 + uaddlp v1.4h, v1.8b + uaddlp v0.8h, v0.16b + uaddlp v3.4h, v3.8b + uaddlp v2.8h, v2.16b + shl v1.4h, v1.4h, #2 + shl v0.8h, v0.8h, #2 + shl v3.4h, v3.4h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 8 + ld1 {v0.16b}, [x1], x2 + ld1 {v2.16b}, [x10], x2 + uaddlp v0.8h, v0.16b + uaddlp v2.8h, v2.16b + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 12 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + uaddlp v0.4h, v0.8b + uaddlp v2.4h, v2.8b + shl v0.4h, v0.4h, #2 + shl v2.4h, v2.4h, #2 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_tbl): + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) + .hword 0 + +L(ipred_cfl_ac_422_w16_tbl): + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) +endfunc + +// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_8bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 + ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input + ld1 {v0.s}[0], [x1], x2 + ld1 {v0.s}[1], [x10], x2 + ld1 {v1.s}[0], [x1], x2 + ld1 {v1.s}[1], [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v1.8h, v1.8b, #3 + subs w8, w8, #4 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + 
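Throughout these AC functions the scaling is the same regardless of chroma subsampling: the 4:2:0 paths sum a 2x2 luma block and shift left by one, the 4:2:2 paths above sum a horizontal pair and shift left by two, and the 4:4:4 paths here and below shift a single pixel left by three, so every stored AC sample is the co-located luma scaled by 8 before the block average is subtracted. A minimal scalar sketch of that scaling rule, for illustration only (the function name and signature are assumptions, not dav1d's reference API):

    #include <stddef.h>
    #include <stdint.h>

    // One chroma-from-luma AC sample; ss_hor/ss_ver are the chroma
    // subsampling flags (1,1 = 4:2:0, 1,0 = 4:2:2, 0,0 = 4:4:4).
    static int16_t cfl_ac_sample(const uint8_t *y, ptrdiff_t stride,
                                 int ss_hor, int ss_ver)
    {
        int sum = y[0];
        if (ss_hor)
            sum += y[1];
        if (ss_ver) {
            sum += y[stride];
            if (ss_hor)
                sum += y[stride + 1];
        }
        // 4:2:0: (sum of 4) << 1, 4:2:2: (sum of 2) << 2, 4:4:4: pixel << 3,
        // i.e. always the averaged luma scaled by 8.
        return (int16_t)(sum << (3 - ss_hor - ss_ver));
    }
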
b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v2.8b}, [x1], x2 + ushll v0.8h, v0.8b, #3 + ld1 {v3.8b}, [x10], x2 + ushll v1.8h, v1.8b, #3 + ushll v2.8h, v2.8b, #3 + ushll v3.8h, v3.8b, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v2.16b}, [x10], x2 + ld1 {v4.16b}, [x1], x2 + ushll2 v1.8h, v0.16b, #3 + ushll v0.8h, v0.8b, #3 + ld1 {v6.16b}, [x10], x2 + ushll2 v3.8h, v2.16b, #3 + ushll v2.8h, v2.8b, #3 + ushll2 v5.8h, v4.16b, #3 + ushll v4.8h, v4.8b, #3 + ushll2 v7.8h, v6.16b, #3 + ushll v6.8h, v6.8b, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + mov v0.16b, v6.16b + mov v1.16b, v7.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + ld1 {v4.8b}, [x1], x2 + ld1 {v6.8b}, [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v2.8h, v2.8b, #3 + ushll v4.8h, v4.8b, #3 + ushll v6.8h, v6.8b, #3 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + dup v5.8h, v4.h[7] + dup v7.8h, v6.h[7] + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + mov v0.16b, v6.16b + mov v1.16b, v7.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, without padding + ld1 {v2.16b, v3.16b}, [x1], x2 + ld1 {v6.16b, v7.16b}, [x10], x2 + ushll v0.8h, v2.8b, #3 + ushll2 v1.8h, v2.16b, #3 + ushll v2.8h, v3.8b, #3 + ushll2 v3.8h, v3.16b, #3 + ushll v4.8h, v6.8b, #3 + ushll2 v5.8h, v6.16b, #3 + ushll v6.8h, v7.8b, #3 + ushll2 v7.8h, v7.16b, #3 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 8 + ldr d2, [x1, #16] + ld1 {v1.16b}, [x1], x2 + ldr d6, [x10, #16] + ld1 {v5.16b}, [x10], x2 + ushll v2.8h, v2.8b, #3 + ushll v0.8h, v1.8b, #3 + ushll2 v1.8h, 
v1.16b, #3 + ushll v6.8h, v6.8b, #3 + ushll v4.8h, v5.8b, #3 + ushll2 v5.8h, v5.16b, #3 + dup v3.8h, v2.h[7] + dup v7.8h, v6.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 16 + ld1 {v1.16b}, [x1], x2 + ld1 {v5.16b}, [x10], x2 + ushll v0.8h, v1.8b, #3 + ushll2 v1.8h, v1.16b, #3 + ushll v4.8h, v5.8b, #3 + ushll2 v5.8h, v5.16b, #3 + dup v2.8h, v1.h[7] + dup v3.8h, v1.h[7] + dup v6.8h, v5.h[7] + dup v7.8h, v5.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 24 + ld1 {v0.8b}, [x1], x2 + ld1 {v4.8b}, [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v4.8h, v4.8b, #3 + dup v1.8h, v0.h[7] + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + dup v5.8h, v4.h[7] + dup v6.8h, v4.h[7] + dup v7.8h, v4.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w8 subtracting + lsl w6, w6, #2 + // Aggregate the sums, with wider intermediates earlier than in + // ipred_cfl_ac_420_w8_calc_subtract_dc. 
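As the comment above points out, the 32-wide 4:4:4 case widens to 32-bit lanes with uaddlp before combining the four accumulators, since the per-lane 16-bit sums could overflow for the largest blocks; the result then feeds the same subtract-dc loop as the 8-wide case. A hedged scalar sketch of that final step (the helper name and the use of __builtin_ctz are illustrative assumptions):

    #include <stdint.h>

    // Rounded average of all scaled AC samples, removed from each sample,
    // matching the "(sum + (1 << (log2sz - 1))) >>= log2sz" comments above.
    static void cfl_ac_subtract_dc(int16_t *ac, int width, int height,
                                   int32_t sum)
    {
        const int log2sz = __builtin_ctz(width) + __builtin_ctz(height);
        const int dc = (int)((sum + (1 << (log2sz - 1))) >> log2sz);
        for (int i = 0; i < width * height; i++)
            ac[i] -= dc;
    }
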
+ uaddlp v0.4s, v16.8h + uaddlp v1.4s, v17.8h + uaddlp v2.4s, v18.8h + uaddlp v3.4s, v19.8h + add v0.4s, v0.4s, v1.4s + add v2.4s, v2.4s, v3.4s + add v0.4s, v0.4s, v2.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #4 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] + b L(ipred_cfl_ac_420_w8_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S new file mode 100644 index 0000000000..c48c48583c --- /dev/null +++ b/third_party/dav1d/src/arm/64/ipred16.S @@ -0,0 +1,4204 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + ldr w8, [sp] + clz w3, w3 + adr x5, L(ipred_dc_128_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + dup v0.8h, w8 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + urshr v0.8h, v0.8h, #1 + br x5 +4: + AARCH64_VALID_JUMP_TARGET + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_128_tbl): + .hword L(ipred_dc_128_tbl) - 640b + .hword L(ipred_dc_128_tbl) - 320b + .hword L(ipred_dc_128_tbl) - 160b + .hword L(ipred_dc_128_tbl) - 8b + .hword L(ipred_dc_128_tbl) - 4b +endfunc + +// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_v_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #2 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + sub x1, x1, #64 + ld1 {v4.8h, v5.8h, v6.8h, 
v7.8h}, [x2] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_v_tbl): + .hword L(ipred_v_tbl) - 640b + .hword L(ipred_v_tbl) - 320b + .hword L(ipred_v_tbl) - 160b + .hword L(ipred_v_tbl) - 80b + .hword L(ipred_v_tbl) - 40b +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_h_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + sub x2, x2, #8 + sub x5, x5, w3, uxtw + mov x7, #-8 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.4h}, [x0], x1 + st1 {v2.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + stp q3, q3, [x0, #64] + stp q2, q2, [x6, #64] + stp q3, q3, [x0, #96] + stp q2, q2, [x6, #96] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + stp q1, q1, [x0, #64] + stp q0, q0, [x6, #64] + stp q1, q1, [x0, #96] + stp q0, q0, [x6, #96] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_h_tbl): + .hword L(ipred_h_tbl) - 64b + .hword L(ipred_h_tbl) - 32b + .hword L(ipred_h_tbl) - 16b + .hword L(ipred_h_tbl) - 8b + .hword L(ipred_h_tbl) - 4b +endfunc + +// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_top_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #2 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.4h, v0.h[0] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + 
AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + sub x1, x1, #64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_top_tbl): + .hword L(ipred_dc_top_tbl) - 640b + .hword L(ipred_dc_top_tbl) - 320b + .hword L(ipred_dc_top_tbl) - 160b + .hword L(ipred_dc_top_tbl) - 80b + .hword L(ipred_dc_top_tbl) - 40b +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 + clz w3, w3 + clz w7, w4 + adr x5, L(ipred_dc_left_tbl) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w7, w7, #25 + ldrh w3, [x5, w3, uxtw #1] + ldrh w7, [x5, w7, uxtw #1] + sub x3, x5, w3, uxtw + sub x5, x5, w7, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_left_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w4): + AARCH64_VALID_JUMP_TARGET + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt L(ipred_dc_left_w4) + ret + +L(ipred_dc_left_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w8): + AARCH64_VALID_JUMP_TARGET + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt L(ipred_dc_left_w8) + ret + +L(ipred_dc_left_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 + dup v0.8h, 
v2.h[0] + dup v1.8h, v2.h[0] + br x3 +L(ipred_dc_left_w16): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b +1: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlp v0.4s, v0.8h + addv s0, v0.4s + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w32): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h64): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w64): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_tbl): + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 + add w7, w3, w4 // width + height + clz w3, w3 + clz w6, w4 + dup v16.4s, w7 // width + height + adr x5, L(ipred_dc_tbl) + rbit w7, w7 // rbit(width + height) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w6, w6, #25 + clz w7, w7 // ctz(width + height) + ldrh w3, [x5, w3, uxtw #1] + ldrh w6, [x5, w6, uxtw #1] + neg w7, w7 // -ctz(width + height) + sub x3, x5, w3, uxtw + sub x5, x5, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w7 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + add x2, x2, #2 + br x3 +L(ipred_dc_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.4h + cmp w4, #4 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + 
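The two constants just loaded implement the division for rectangular DC blocks: width + height is always a power of two times 1, 3 or 5, so after the rounding-biased sum is shifted right by ctz(width + height), the leftover /3 or /5 becomes a Q17 fixed-point multiply with 0xAAAB (about 1/3) or 0x6667 (about 1/5). A scalar sketch of the idea, assuming __builtin_ctz and a caller that passes the raw edge sum (not the upstream reference code):

    static int dc_rect(int sum, int width, int height)
    {
        const int wh = width + height;
        sum += wh >> 1;                    // rounding bias, added up front above
        sum >>= __builtin_ctz(wh);         // divide by the power-of-two factor
        switch (wh >> __builtin_ctz(wh)) { // leftover factor: 1, 3 or 5
        case 3: sum = (sum * 0xAAAB) >> 17; break;
        case 5: sum = (sum * 0x6667) >> 17; break;
        }
        return sum;
    }

For example, a 4x16 block whose 20 edge pixels are all 512 gives sum = 10240; adding 10 and shifting by ctz(20) = 2 leaves 2562, and (2562 * 0x6667) >> 17 = 512, as expected.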
csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.4h, v0.h[0] +2: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + add x2, x2, #2 + br x3 +L(ipred_dc_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.8h + cmp w4, #8 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] +2: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2], #32 + addp v0.8h, v0.8h, v1.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w16): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h, v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + uaddlv s1, v1.8h + cmp w4, #16 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32/64 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w32): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + addp v3.8h, v3.8h, v4.8h + addp v1.8h, v1.8h, v3.8h + uaddlv s1, v1.8h + cmp w4, #32 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16/64 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h64): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w64): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] + addp v3.8h, v3.8h, v4.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + addp v1.8h, v1.8h, v3.8h + addp v20.8h, v20.8h, v22.8h + addp v1.8h, v1.8h, v20.8h + uaddlv s1, v1.8h + cmp w4, #64 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 16/32 + cmp w4, #16 + mov w16, #0x6667 + mov w17, 
#0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + sub x1, x1, #64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_tbl): + .hword L(ipred_dc_tbl) - L(ipred_dc_h64) + .hword L(ipred_dc_tbl) - L(ipred_dc_h32) + .hword L(ipred_dc_tbl) - L(ipred_dc_h16) + .hword L(ipred_dc_tbl) - L(ipred_dc_h8) + .hword L(ipred_dc_tbl) - L(ipred_dc_h4) + .hword L(ipred_dc_tbl) - L(ipred_dc_w64) + .hword L(ipred_dc_tbl) - L(ipred_dc_w32) + .hword L(ipred_dc_tbl) - L(ipred_dc_w16) + .hword L(ipred_dc_tbl) - L(ipred_dc_w8) + .hword L(ipred_dc_tbl) - L(ipred_dc_w4) +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + clz w9, w3 + adr x5, L(ipred_paeth_tbl) + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x2] + add x8, x2, #2 + sub x2, x2, #8 + sub x5, x5, w9, uxtw + mov x7, #-8 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v5.2d}, [x8] + sub v6.8h, v5.8h, v4.8h // top - topleft +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 + zip1 v0.2d, v0.2d, v1.2d + zip1 v2.2d, v2.2d, v3.2d + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v2.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v4.8h, v16.8h // tldiff + sabd v23.8h, v4.8h, v17.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v2.8h, v17.8h + umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) + umin v19.8h, v21.8h, v23.8h + cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v23.8h, v21.8h + cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v19.8h, v17.8h + bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v20.16b, v5.16b, v4.16b + bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
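The sabd/umin/cmge/bsl/bit sequence above is a branchless vector form of the usual Paeth selection; per pixel it reduces to the following scalar rule (a sketch of the standard AV1 Paeth predictor, not a drop-in replacement for the NEON code):

    #include <stdlib.h>

    static int paeth(int left, int top, int topleft)
    {
        const int base   = left + top - topleft;
        const int ldiff  = abs(left    - base);
        const int tdiff  = abs(top     - base);
        const int tldiff = abs(topleft - base);
        // Ties prefer left over top over topleft.
        return ldiff <= tdiff && ldiff <= tldiff ? left
             : tdiff <= tldiff                   ? top
                                                 : topleft;
    }
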
+ bit v20.16b, v0.16b, v16.16b + st1 {v21.d}[1], [x0], x1 + st1 {v21.d}[0], [x6], x1 + subs w4, w4, #4 + st1 {v20.d}[1], [x0], x1 + st1 {v20.d}[0], [x6], x1 + b.gt 4b + ret +80: +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v5.8h}, [x8], #16 + mov w9, w3 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 +2: + sub v6.8h, v5.8h, v4.8h // top - topleft + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v1.8h + add v18.8h, v6.8h, v2.8h + add v19.8h, v6.8h, v3.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v5.8h, v18.8h + sabd v23.8h, v5.8h, v19.8h + sabd v24.8h, v4.8h, v16.8h // tldiff + sabd v25.8h, v4.8h, v17.8h + sabd v26.8h, v4.8h, v18.8h + sabd v27.8h, v4.8h, v19.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v1.8h, v17.8h + sabd v18.8h, v2.8h, v18.8h + sabd v19.8h, v3.8h, v19.8h + umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) + umin v29.8h, v21.8h, v25.8h + umin v30.8h, v22.8h, v26.8h + umin v31.8h, v23.8h, v27.8h + cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v25.8h, v21.8h + cmge v22.8h, v26.8h, v22.8h + cmge v23.8h, v27.8h, v23.8h + cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v29.8h, v17.8h + cmge v18.8h, v30.8h, v18.8h + cmge v19.8h, v31.8h, v19.8h + bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v22.16b, v5.16b, v4.16b + bsl v21.16b, v5.16b, v4.16b + bsl v20.16b, v5.16b, v4.16b + bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... + bit v22.16b, v2.16b, v18.16b + bit v21.16b, v1.16b, v17.16b + bit v20.16b, v0.16b, v16.16b + st1 {v23.8h}, [x0], #16 + st1 {v22.8h}, [x6], #16 + subs w3, w3, #8 + st1 {v21.8h}, [x5], #16 + st1 {v20.8h}, [x10], #16 + b.le 8f + ld1 {v5.8h}, [x8], #16 + b 2b +8: + subs w4, w4, #4 + b.le 9f + // End of horizontal loop, move pointers to next four rows + sub x8, x8, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + // Load the top row as early as possible + ld1 {v5.8h}, [x8], #16 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_paeth_tbl): + .hword L(ipred_paeth_tbl) - 640b + .hword L(ipred_paeth_tbl) - 320b + .hword L(ipred_paeth_tbl) - 160b + .hword L(ipred_paeth_tbl) - 80b + .hword L(ipred_paeth_tbl) - 40b +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_16bpc_neon, export=1 + movrel x10, X(sm_weights) + add x11, x10, w4, uxtw + add x10, x10, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_tbl) + sub x12, x2, w4, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x12] // bottom + add x8, x2, #2 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v6.2d}, [x8] // top + ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + dup v5.8h, v6.h[3] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + zip1 
v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v0.8h, v7.8h + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v18.4h + smlal2 v23.4s, v6.8h, v18.8h + rshrn v20.4h, v20.4s, #9 + rshrn v21.4h, v21.4s, #9 + rshrn v22.4h, v22.4s, #9 + rshrn v23.4h, v23.4s, #9 + st1 {v20.4h}, [x0], x1 + st1 {v21.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.4h}, [x0], x1 + st1 {v23.4h}, [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v6.8h}, [x8] // top + ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + dup v5.8h, v6.h[7] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v3.8h, v7.8h // (left flipped) + smlal v22.4s, v2.4h, v7.4h + smlal2 v23.4s, v2.8h, v7.8h + smlal v24.4s, v1.4h, v7.4h + smlal2 v25.4s, v1.8h, v7.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v17.4h + smlal2 v23.4s, v6.8h, v17.8h + smlal v24.4s, v6.4h, v18.4h + smlal2 v25.4s, v6.8h, v18.8h + smlal v26.4s, v6.4h, v19.4h + smlal2 v27.4s, v6.8h, v19.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 v23.8h, v27.4s, #9 + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + add x12, x2, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + ld1r {v5.8h}, [x12] // right + sub x2, x2, #4 + mov x7, #-4 + mov w9, w3 + add v31.4h, v4.4h, v5.4h // bottom+right + +1: + ld2r {v0.8h, v1.8h}, [x2], x7 // left + ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b +2: + ld1 {v7.16b}, [x10], #16 // weights_hor + ld1 {v2.8h, v3.8h}, [x8], #32 // top + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor + smlal2 v21.4s, v1.8h, v6.8h // (left flipped) + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, 
v7.8h + smlal v24.4s, v0.4h, v6.4h + smlal2 v25.4s, v0.8h, v6.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v2.8h, v16.8h + smlal v22.4s, v3.4h, v16.4h + smlal2 v23.4s, v3.8h, v16.8h + smlal v24.4s, v2.4h, v17.4h + smlal2 v25.4s, v2.8h, v17.8h + smlal v26.4s, v3.4h, v17.4h + smlal2 v27.4s, v3.8h, v17.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 v23.8h, v27.4s, #9 + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x8, w9, uxtw #1 + sub x10, x10, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_tbl): + .hword L(ipred_smooth_tbl) - 640b + .hword L(ipred_smooth_tbl) - 320b + .hword L(ipred_smooth_tbl) - 160b + .hword L(ipred_smooth_tbl) - 80b + .hword L(ipred_smooth_tbl) - 40b +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_16bpc_neon, export=1 + movrel x7, X(sm_weights) + add x7, x7, w4, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_v_tbl) + sub x8, x2, w4, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x8] // bottom + add x2, x2, #2 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v6.2d}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +4: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v18.8h, v18.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v18.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v6.8h}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +8: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v17.8h + sqrdmulh v22.8h, v6.8h, v18.8h + sqrdmulh v23.8h, v6.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + // Set up pointers for four rows in parallel; x0, x6, x5, x8 + add x5, x0, x1 + add x8, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 +2: + ld1 {v2.8h, v3.8h}, [x2], #32 // top + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v3.8h, 
v16.8h + sqrdmulh v22.8h, v2.8h, v17.8h + sqrdmulh v23.8h, v3.8h, v17.8h + sqrdmulh v24.8h, v2.8h, v18.8h + sqrdmulh v25.8h, v3.8h, v18.8h + sqrdmulh v26.8h, v2.8h, v19.8h + sqrdmulh v27.8h, v3.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v4.8h + add v26.8h, v26.8h, v4.8h + add v27.8h, v27.8h, v4.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x8], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x2, x2, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x8, x8, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_v_tbl): + .hword L(ipred_smooth_v_tbl) - 640b + .hword L(ipred_smooth_v_tbl) - 320b + .hword L(ipred_smooth_v_tbl) - 160b + .hword L(ipred_smooth_v_tbl) - 80b + .hword L(ipred_smooth_v_tbl) - 40b +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_16bpc_neon, export=1 + movrel x8, X(sm_weights) + add x8, x8, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_h_tbl) + add x12, x2, w3, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v5.8h}, [x12] // right + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v7.2s}, [x8] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v1.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v7.8b}, [x8] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v3.8h, v3.8h, v5.8h // left-right + sub v2.8h, v2.8h, v5.8h + sub v1.8h, v1.8h, v5.8h + sub v0.8h, v0.8h, v5.8h + sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v1.8h, v7.8h + sqrdmulh v23.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + sub x2, x2, #8 + mov x7, #-8 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h +2: + ld1 {v7.16b}, [x8], #16 // weights_hor + ushll v6.8h, v7.8b, #7 // weights_hor << 7 + ushll2 v7.8h, v7.16b, #7 + sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, 
v2.8h, v6.8h + sqrdmulh v23.8h, v2.8h, v7.8h + sqrdmulh v24.8h, v1.8h, v6.8h + sqrdmulh v25.8h, v1.8h, v7.8h + sqrdmulh v26.8h, v0.8h, v6.8h + sqrdmulh v27.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + add v24.8h, v24.8h, v5.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v5.8h + add v27.8h, v27.8h, v5.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x10], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_h_tbl): + .hword L(ipred_smooth_h_tbl) - 640b + .hword L(ipred_smooth_h_tbl) - 320b + .hword L(ipred_smooth_h_tbl) - 160b + .hword L(ipred_smooth_h_tbl) - 80b + .hword L(ipred_smooth_h_tbl) - 40b +endfunc + +const padding_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +padding_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, +// const pixel *const in, const int end, +// const int bitdepth_max); +function ipred_z1_upsample_edge_16bpc_neon, export=1 + dup v30.8h, w4 // bitdepth_max + movrel x4, padding_mask + ld1 {v0.8h, v1.8h}, [x2] // in[] + add x5, x2, w3, uxtw #1 // in[end] + sub x4, x4, w3, uxtw #1 + + ld1r {v2.8h}, [x5] // padding + ld1 {v3.8h, v4.8h}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v2.16b, v3.16b // padded in[] + bit v1.16b, v2.16b, v4.16b + + ext v4.16b, v0.16b, v1.16b, #2 + ext v5.16b, v1.16b, v2.16b, #2 + ext v6.16b, v0.16b, v1.16b, #4 + ext v7.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + + add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] + add v19.8h, v5.8h, v7.8h + add v20.8h, v0.8h, v16.8h + add v21.8h, v1.8h, v17.8h + umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) + umull2 v23.4s, v18.8h, v31.8h + umull v24.4s, v19.4h, v31.4h + umull2 v25.4s, v19.8h, v31.8h + usubw v22.4s, v22.4s, v20.4h + usubw2 v23.4s, v23.4s, v20.8h + usubw v24.4s, v24.4s, v21.4h + usubw2 v25.4s, v25.4s, v21.8h + + sqrshrun v16.4h, v22.4s, #4 + sqrshrun2 v16.8h, v23.4s, #4 + sqrshrun v17.4h, v24.4s, #4 + sqrshrun2 v17.8h, v25.4s, #4 + + smin v16.8h, v16.8h, v30.8h + smin v17.8h, v17.8h, v30.8h + + zip1 v0.8h, v4.8h, v16.8h + zip2 v1.8h, v4.8h, v16.8h + zip1 v2.8h, v5.8h, v17.8h + zip2 v3.8h, v5.8h, v17.8h + + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + + ret +endfunc + +const edge_filter + .short 0, 4, 8, 0 + .short 0, 5, 6, 0 +// Leaving out the coeffs for strength=3 +// .byte 2, 4, 4, 0 +endconst + +// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, +// const pixel *const in, const int end, +// const int strength); +function ipred_z1_filter_edge_16bpc_neon, export=1 + cmp w4, #3 + b.eq L(fivetap) // if (strength == 3) goto fivetap + + movrel x5, 
edge_filter, -6 + add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) + + ld1 {v31.s}[0], [x5] // kernel[1-2] + + ld1 {v0.8h}, [x2], #16 + + dup v30.8h, v31.h[0] + dup v31.8h, v31.h[1] +1: + // in[end], is the last valid pixel. We produce 16 pixels out by + // using 18 pixels in - the last pixel used is [17] of the ones + // read/buffered. + cmp w3, #17 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + sub w3, w3, #16 + st1 {v16.8h, v17.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask + sub w6, w3, #24 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h}, [x5] // padding_mask + + ld1r {v2.8h}, [x6] + bit v0.16b, v2.16b, v3.16b // Pad v0-v1 + bit v1.16b, v2.16b, v4.16b + + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + st1 {v16.8h, v17.8h}, [x0], #32 + b.le 9f +5: + // After one block, any remaining output would only be filtering + // padding - thus just store the padding. + subs w1, w1, #16 + st1 {v2.16b}, [x0], #16 + b.gt 5b +9: + ret + +L(fivetap): + sub x2, x2, #2 // topleft -= 1 pixel + movi v29.8h, #2 + ld1 {v0.8h}, [x2], #16 + movi v30.8h, #4 + movi v31.8h, #4 + ins v0.h[0], v0.h[1] +1: + // in[end+1], is the last valid pixel. We produce 16 pixels out by + // using 20 pixels in - the last pixel used is [19] of the ones + // read/buffered. 
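Strengths 1 and 2 use the three-tap kernels {4, 8, 4} and {5, 6, 5} handled above, while the L(fivetap) loop that follows applies the strength-3 kernel {2, 4, 4, 4, 2}; in every case the products are rounded by 8 and shifted right by 4. A hedged scalar sketch of the filtering rule, centered on the output sample (the function name and the pre-padded in[] buffer are assumptions; the assembly applies the same taps with its own pointer offsets and padding masks):

    #include <stdint.h>

    static void filter_edge(uint16_t *out, const uint16_t *in,
                            int sz, int strength)
    {
        // in[] is assumed edge-replicated two pixels beyond both ends,
        // which is what the padding_mask logic above arranges in registers.
        static const int kernels[3][5] = {
            { 0, 4, 8, 4, 0 },
            { 0, 5, 6, 5, 0 },
            { 2, 4, 4, 4, 2 },
        };
        const int *k = kernels[strength - 1];
        for (int i = 0; i < sz; i++) {
            int s = 0;
            for (int j = 0; j < 5; j++)
                s += k[j] * in[i - 2 + j];
            out[i] = (uint16_t)((s + 8) >> 4);
        }
    }
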
+ cmp w3, #18 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f // if (end + 1 < 19) + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask, -2 + sub w6, w3, #23 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask + + ld1r {v28.8h}, [x6] + bit v0.16b, v28.16b, v3.16b // Pad v0-v2 + bit v1.16b, v28.16b, v4.16b + bit v2.16b, v28.16b, v5.16b +4: + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + mov v1.16b, v28.16b + mov v2.16b, v28.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.le 9f + // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to + // filter properly once more - aka (w3 >= 0). + cmp w3, #0 + b.ge 4b +5: + // When w3 <= 0, all remaining pixels in v0-v1 are equal to the + // last valid pixel - thus just output that without filtering. 
+ subs w1, w1, #8 + st1 {v28.8h}, [x0], #16 + b.gt 5b +9: + ret +endfunc + +// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_16bpc_neon, export=1 + dup v0.8h, w1 +1: + subs w2, w2, #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const int width, const int height, +// const int dx, const int max_base_x); +function ipred_z1_fill1_16bpc_neon, export=1 + clz w9, w3 + adr x8, L(ipred_z1_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // top[max_base_x] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + br x8 +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // top[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + mov w12, w3 + + add x13, x0, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 169f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // xpos += dx +2: + ext v18.16b, v0.16b, v1.16b, #2 // 
top[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w3, w3, #16 + umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + st1 {v22.8h, v23.8h}, [x0], #32 + st1 {v24.8h, v25.8h}, [x13], #32 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w4, w4, #2 + b.le 9f + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 1b +9: + ret + +169: + st1 {v31.8h}, [x0], #16 + subs w3, w3, #8 + st1 {v31.8h}, [x13], #16 + b.gt 169b + subs w4, w4, #2 + b.le 9b + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 169b + +L(ipred_z1_fill1_tbl): + .hword L(ipred_z1_fill1_tbl) - 640b + .hword L(ipred_z1_fill1_tbl) - 320b + .hword L(ipred_z1_fill1_tbl) - 160b + .hword L(ipred_z1_fill1_tbl) - 80b + .hword L(ipred_z1_fill1_tbl) - 40b +endfunc + +function ipred_z1_fill2_16bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // top[max_base_x] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + b.eq 8f + +4: // w == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +8: // w == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, 
v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret +endfunc + +// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, +// const int n); +function ipred_reverse_16bpc_neon, export=1 + sub x1, x1, #16 + add x3, x0, #8 + mov x4, #16 +1: + ld1 {v0.8h}, [x1] + subs w2, w2, #8 + rev64 v0.8h, v0.8h + sub x1, x1, #16 + st1 {v0.d}[1], [x0], x4 + st1 {v0.d}[0], [x3], x4 + b.gt 1b + ret +endfunc + +// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const left, +// const int width, const int height, +// const int dy, const int max_base_y); +function ipred_z3_fill1_16bpc_neon, export=1 + clz w9, w4 + adr x8, L(ipred_z3_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // left[max_base_y] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + br x8 + +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // left[base] + ldr q2, [x2, w10, uxtw] + dup v4.8h, w9 // frac + dup v5.8h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // left[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 
{v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + mov w12, w4 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // ypos += dy + cmp w8, w6 // base >= max_base_y + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // ypos += dy +2: + ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w4, w4, #16 + umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + zip1 v18.8h, v22.8h, v24.8h + zip2 v19.8h, v22.8h, v24.8h + zip1 v20.8h, v23.8h, v25.8h + zip2 v21.8h, v23.8h, v25.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x13], x1 + st1 {v20.s}[2], [x0], x1 + st1 {v20.s}[3], [x13], x1 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x13], x1 + st1 {v21.s}[2], [x0], x1 + st1 {v21.s}[3], [x13], x1 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w3, w3, #2 + b.le 9f + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b +9: + ret + +L(ipred_z3_fill1_tbl): + .hword L(ipred_z3_fill1_tbl) - 640b + .hword L(ipred_z3_fill1_tbl) - 320b + .hword L(ipred_z3_fill1_tbl) - 160b + .hword L(ipred_z3_fill1_tbl) - 80b + .hword L(ipred_z3_fill1_tbl) - 40b +endfunc + +function ipred_z3_fill_padding_neon, export=0 + cmp w3, #8 + adr x8, L(ipred_z3_fill_padding_tbl) + b.gt L(ipred_z3_fill_padding_wide) + // w3 = remaining width, w4 = constant height + mov w12, w4 + +1: + // Fill a WxH rectangle with padding. W can be any number; + // this fills the exact width by filling in the largest + // power of two in the remaining width, and repeating. 
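In scalar terms, the dispatch that follows amounts to the loop below: pick the largest power-of-two column width that still fits in the remaining width (capped at 8, since the 8/16/32/64 table entries share one 8-wide store), fill that column for the whole height, then advance and repeat. A sketch with illustrative names; __builtin_clz assumes a GCC/Clang-style compiler:

    #include <stddef.h>
    #include <stdint.h>

    static void fill_padding(uint16_t *dst, ptrdiff_t stride,
                             int w, int h, uint16_t pad)
    {
        int x = 0;
        while (x < w) {
            /* largest power of two not exceeding the remaining width */
            int step = 1 << (31 - __builtin_clz((unsigned)(w - x)));
            if (step > 8) step = 8;   /* wider columns are written 8 pixels at a time */
            for (int y = 0; y < h; y++)
                for (int i = 0; i < step; i++)
                    dst[(size_t)y * stride + x + i] = pad;
            x += step;
        }
    }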
+ clz w9, w3 + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + sub x9, x8, w9, uxtw + br x9 + +2: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.s}[0], [x13], x1 + st1 {v31.s}[0], [x0], x1 + st1 {v31.s}[0], [x13], x1 + b.gt 2b + subs w3, w3, #2 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b + +4: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.4h}, [x13], x1 + st1 {v31.4h}, [x0], x1 + st1 {v31.4h}, [x13], x1 + b.gt 4b + subs w3, w3, #4 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #8 + add x13, x13, #8 + mov w4, w12 + b 1b + +8: +16: +32: +64: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.8h}, [x13], x1 + st1 {v31.8h}, [x0], x1 + st1 {v31.8h}, [x13], x1 + b.gt 4b + subs w3, w3, #8 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w4, w12 + b 1b + +9: + ret + +L(ipred_z3_fill_padding_tbl): + .hword L(ipred_z3_fill_padding_tbl) - 64b + .hword L(ipred_z3_fill_padding_tbl) - 32b + .hword L(ipred_z3_fill_padding_tbl) - 16b + .hword L(ipred_z3_fill_padding_tbl) - 8b + .hword L(ipred_z3_fill_padding_tbl) - 4b + .hword L(ipred_z3_fill_padding_tbl) - 2b + +L(ipred_z3_fill_padding_wide): + // Fill a WxH rectangle with padding, with W > 8. + lsr x1, x1, #1 + mov w12, w3 + sub x1, x1, w3, uxtw #1 +1: + ands w5, w3, #7 + b.eq 2f + // If the width isn't aligned to 8, first do one 8 pixel write + // and align the start pointer. + sub w3, w3, w5 + st1 {v31.8h}, [x0] + add x0, x0, w5, uxtw #1 +2: + // Fill the rest of the line with aligned 8 pixel writes. 
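Stepping back from the padding helper: the z1/z3 fill loops above (and ipred_z3_fill2 below) all evaluate the same two-tap interpolation per output pixel; only the source array (top vs. left), the base step (1 vs. 2) and the write order (rows vs. transposed columns) differ. A scalar sketch of one fill1-style row, with illustrative names; the NEON code hoists the base >= max_base check out of the per-pixel loop:

    #include <stdint.h>

    static void z1_fill1_row(uint16_t *dst, const uint16_t *top,
                             int w, int xpos, int max_base_x)
    {
        const int frac = xpos & 0x3e;   /* even 6-bit fraction, constant per row */
        int base = xpos >> 6;           /* integer position, steps by 1 per pixel */
        for (int x = 0; x < w; x++, base++) {
            if (base >= max_base_x) {
                dst[x] = top[max_base_x];   /* past the edge: replicate the padding pixel */
            } else {
                /* top[base]*64 + (top[base+1]-top[base])*frac, rounded >>6 */
                dst[x] = (uint16_t)((top[base] * (64 - frac) +
                                     top[base + 1] * frac + 32) >> 6);
            }
        }
    }

Per output row, xpos advances by dx; the fill2 variants step base by 2 to skip interleaved samples, and the z3 variants walk left[] and store through the zip/transpose sequences seen above.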
+ subs w3, w3, #8 + st1 {v31.8h}, [x0], #16 + b.gt 2b + subs w4, w4, #1 + add x0, x0, x1 + b.le 9f + mov w3, w12 + b 1b +9: + ret +endfunc + +function ipred_z3_fill2_16bpc_neon, export=1 + cmp w4, #8 + add x10, x2, w6, uxtw // left[max_base_y] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + b.eq 8f + +4: // h == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +8: // h == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret +endfunc + + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon + and w5, w5, #511 + movrel x6, X(filter_intra_taps) + lsl w5, w5, #6 + add x6, x6, w5, uxtw + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 + clz w9, w3 + adr x5, L(ipred_filter\bpc\()_tbl) + ld1 {v20.8b, v21.8b, v22.8b}, [x6] + sub w9, w9, #26 + ldrh w9, [x5, w9, uxtw #1] + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b 
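For the filter predictor being set up here, each 4x2 output block is a signed 7-tap combination of the top-left pixel, four top pixels and two left pixels, rounded >>4 and clamped to [0, bitdepth_max]; v16-v22 hold one 8-entry row of taps per source pixel. A hedged C sketch of that per-block arithmetic (names and the assumed taps[] layout are illustrative):

    #include <stdint.h>

    /* One 4x2 block of the filter intra predictor (sketch). taps[] is
     * assumed grouped as 7 rows of 8: one row per source pixel, one column
     * per output position, matching how v16-v22 are applied above. */
    static void filter_block_4x2(uint16_t out[2][4], const int8_t *taps,
                                 uint16_t topleft, const uint16_t top[4],
                                 const uint16_t left[2], int bitdepth_max)
    {
        for (int i = 0; i < 8; i++) {        /* 8 outputs: row 0 then row 1 */
            int acc = taps[0 * 8 + i] * topleft
                    + taps[1 * 8 + i] * top[0] + taps[2 * 8 + i] * top[1]
                    + taps[3 * 8 + i] * top[2] + taps[4 * 8 + i] * top[3]
                    + taps[5 * 8 + i] * left[0] + taps[6 * 8 + i] * left[1];
            acc = (acc + 8) >> 4;            /* round */
            if (acc < 0) acc = 0;            /* clamp to the bitdepth range */
            if (acc > bitdepth_max) acc = bitdepth_max;
            out[i / 4][i % 4] = (uint16_t)acc;
        }
    }

Wider blocks chain these 4x2 units left to right, feeding each unit's rightmost column back in as the next unit's left neighbours, which is what the v2.h[3]/v2.h[7] (and later v3/v4/v5 lane) reuse in the 80/160/320 paths below does.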
+ sub x5, x5, w9, uxtw + sxtl v18.8h, v18.8b + sxtl v19.8h, v19.8b + add x6, x0, x1 + lsl x1, x1, #1 + sxtl v20.8h, v20.8b + sxtl v21.8h, v21.8b + sxtl v22.8h, v22.8b + dup v31.8h, w8 +.if \bpc == 10 + movi v30.8h, #0 +.endif + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ldur d0, [x2, #2] // top (0-3) + sub x2, x2, #4 + mov x7, #-4 +4: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 +.endif + smin v2.8h, v2.8h, v31.8h + subs w4, w4, #2 + st1 {v2.d}[0], [x0], x1 + ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] + st1 {v2.d}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ldur q0, [x2, #2] // top (0-7) + sub x2, x2, #4 + mov x7, #-4 +8: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // 
p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) + smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) + smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 + smin v2.8h, v2.8h, v31.8h + smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) + smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0) + smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) + smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) + smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) + smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + sqrshrun v3.4h, v4.4s, #4 + sqrshrun2 v3.8h, v5.4s, #4 +.endif + smin v3.8h, v3.8h, v31.8h + subs w4, w4, #2 + st2 {v2.d, v3.d}[0], [x0], x1 + zip2 v0.2d, v2.2d, v3.2d + st2 {v2.d, v3.d}[1], [x6], x1 + b.gt 8b + ret +160: +320: + AARCH64_VALID_JUMP_TARGET + add x8, x2, #2 + sub x2, x2, #4 + mov x7, #-4 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) +2: + ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) +.if \bpc == 10 + mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h + smin v3.8h, v3.8h, v31.8h + mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) + mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + srshr v4.8h, v4.8h, #4 + smax v4.8h, v4.8h, v30.8h + smin v4.8h, v4.8h, v31.8h + mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) + mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + srshr v5.8h, v5.8h, #4 + smax v5.8h, v5.8h, v30.8h + smin v5.8h, v5.8h, v31.8h + mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) + mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + srshr v6.8h, v6.8h, #4 + smax v6.8h, v6.8h, v30.8h +.else + smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * 
filter(0) + smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) + smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) + smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) + smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) + smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) + smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4) + smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) + smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) + smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) + smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) + sqrshrun v3.4h, v3.4s, #4 + sqrshrun2 v3.8h, v4.4s, #4 + smin v3.8h, v3.8h, v31.8h + smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) + smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) + smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) + smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) + smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) + smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) + smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) + smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) + sqrshrun v4.4h, v5.4s, #4 + sqrshrun2 v4.8h, v6.4s, #4 + smin v4.8h, v4.8h, v31.8h + smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) + smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) + smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) + smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) + smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) + smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) + smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) + smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) + sqrshrun v5.4h, v24.4s, #4 + sqrshrun2 v5.8h, v25.4s, #4 + smin v5.8h, v5.8h, v31.8h + smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) + smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) + smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) + smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) + smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) + smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + sqrshrun v6.4h, v26.4s, #4 + sqrshrun2 
v6.8h, v27.4s, #4 +.endif + smin v6.8h, v6.8h, v31.8h + + ins v0.h[2], v2.h[7] + st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 + ins v0.h[0], v6.h[7] + st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 + ins v0.h[1], v6.h[3] + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x6, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_filter\bpc\()_tbl): + .hword L(ipred_filter\bpc\()_tbl) - 320b + .hword L(ipred_filter\bpc\()_tbl) - 160b + .hword L(ipred_filter\bpc\()_tbl) - 80b + .hword L(ipred_filter\bpc\()_tbl) - 40b +endfunc +.endm + +filter_fn 10 +filter_fn 12 + +function ipred_filter_16bpc_neon, export=1 + ldr w8, [sp] + cmp w8, 0x3ff + b.le ipred_filter_10bpc_neon + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + ld1 {v30.8h}, [x2] + clz w9, w4 + adr x6, L(pal_pred_tbl) + sub w9, w9, #25 + ldrh w9, [x6, w9, uxtw #1] + movi v31.8h, #1, lsl #8 + sub x6, x6, w9, uxtw + br x6 +40: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +4: + ld1 {v1.16b}, [x3], #16 + subs w5, w5, #4 + // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... + add v1.16b, v1.16b, v1.16b + zip1 v0.16b, v1.16b, v1.16b + zip2 v1.16b, v1.16b, v1.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + st1 {v0.d}[0], [x0], x1 + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.d}[1], [x2], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x2], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +8: + ld1 {v2.16b, v3.16b}, [x3], #32 + subs w5, w5, #4 + add v2.16b, v2.16b, v2.16b + add v3.16b, v3.16b, v3.16b + zip1 v0.16b, v2.16b, v2.16b + zip2 v1.16b, v2.16b, v2.16b + zip1 v2.16b, v3.16b, v3.16b + zip2 v3.16b, v3.16b, v3.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.8h}, [x0], x1 + tbl v2.16b, {v30.16b}, v2.16b + st1 {v1.8h}, [x2], x1 + tbl v3.16b, {v30.16b}, v3.16b + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x2], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +16: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #4 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + st1 {v2.8h, v3.8h}, [x2], x1 + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h}, [x0], x1 + st1 {v6.8h, v7.8h}, [x2], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +32: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #2 + add v4.16b, v4.16b, v4.16b + add 
v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + add x2, x0, #64 +64: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #1 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 64b + ret + +L(pal_pred_tbl): + .hword L(pal_pred_tbl) - 640b + .hword L(pal_pred_tbl) - 320b + .hword L(pal_pred_tbl) - 160b + .hword L(pal_pred_tbl) - 80b + .hword L(pal_pred_tbl) - 40b +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_128_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + urshr v0.8h, v31.8h, #1 + dup v1.8h, w6 // alpha + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +L(ipred_cfl_splat_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #4 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + cmlt v16.4s, v2.4s, #0 // sign + cmlt v17.4s, v3.4s, #0 + cmlt v18.4s, v4.4s, #0 + cmlt v19.4s, v5.4s, #0 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x6], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v3.d}[1], 
[x6], x1 + b.gt L(ipred_cfl_splat_w4) + ret +L(ipred_cfl_splat_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #2 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + cmlt v16.4s, v2.4s, #0 // sign + cmlt v17.4s, v3.4s, #0 + cmlt v18.4s, v4.4s, #0 + cmlt v19.4s, v5.4s, #0 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x6], x1 + b.gt L(ipred_cfl_splat_w8) + ret +L(ipred_cfl_splat_w16): + AARCH64_VALID_JUMP_TARGET + add x7, x5, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 +1: + ld1 {v2.8h, v3.8h}, [x5], #32 + ld1 {v4.8h, v5.8h}, [x7], #32 + subs w3, w3, #16 + smull v16.4s, v2.4h, v1.4h // diff = ac * alpha + smull2 v17.4s, v2.8h, v1.8h + smull v18.4s, v3.4h, v1.4h + smull2 v19.4s, v3.8h, v1.8h + smull v2.4s, v4.4h, v1.4h + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + cmlt v20.4s, v16.4s, #0 // sign + cmlt v21.4s, v17.4s, #0 + cmlt v22.4s, v18.4s, #0 + cmlt v23.4s, v19.4s, #0 + cmlt v24.4s, v2.4s, #0 + cmlt v25.4s, v3.4s, #0 + cmlt v26.4s, v4.4s, #0 + cmlt v27.4s, v5.4s, #0 + add v16.4s, v16.4s, v20.4s // diff + sign + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v22.4s + add v19.4s, v19.4s, v23.4s + add v2.4s, v2.4s, v24.4s + add v3.4s, v3.4s, v25.4s + add v4.4s, v4.4s, v26.4s + add v5.4s, v5.4s, v27.4s + rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + rshrn v6.4h, v2.4s, #6 + rshrn2 v6.8h, v3.4s, #6 + rshrn v7.4h, v4.4s, #6 + rshrn2 v7.8h, v5.4s, #6 + add v2.8h, v16.8h, v0.8h // dc + apply_sign() + add v3.8h, v17.8h, v0.8h + add v4.8h, v6.8h, v0.8h + add v5.8h, v7.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, v5.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], #32 + st1 {v4.8h, v5.8h}, [x6], #32 + b.gt 1b + subs w4, w4, #2 + add x5, x5, w9, uxtw #1 + add x7, x7, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b.gt 1b + ret + +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_top_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + dup v1.8h, w6 // alpha + add x2, x2, #2 + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + b 
L(ipred_cfl_splat_w4) +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) +16: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) +32: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_top_tbl): + .hword L(ipred_cfl_top_tbl) - 32b + .hword L(ipred_cfl_top_tbl) - 16b + .hword L(ipred_cfl_top_tbl) - 8b + .hword L(ipred_cfl_top_tbl) - 4b +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + clz w9, w3 + clz w8, w4 + adr x10, L(ipred_cfl_splat_tbl) + adr x7, L(ipred_cfl_left_tbl) + sub w9, w9, #26 + sub w8, w8, #26 + ldrh w9, [x10, w9, uxtw #1] + ldrh w8, [x7, w8, uxtw #1] + dup v1.8h, w6 // alpha + sub x9, x10, w9, uxtw + sub x7, x7, w8, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_left_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_tbl): + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + add w8, w3, w4 // width + height + dup v1.8h, w6 // alpha + clz w9, w3 + clz w6, w4 + dup v16.4s, w8 // width + height + adr x7, L(ipred_cfl_tbl) + rbit w8, w8 // rbit(width + height) + sub w9, w9, #22 // 26 leading bits, minus table offset 4 + sub w6, w6, #26 + clz w8, w8 // ctz(width + height) + ldrh w9, [x7, w9, uxtw #1] + ldrh w6, [x7, w6, uxtw #1] + neg w8, w8 // -ctz(width + height) + sub x9, x7, w9, uxtw + sub x7, x7, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w8 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + add x2, x2, #2 + br x9 +L(ipred_cfl_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.4h + cmp w4, #4 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, 
v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + add x2, x2, #2 + br x9 +L(ipred_cfl_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.8h + cmp w4, #8 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2], #32 + addp v0.8h, v2.8h, v3.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v2.8h, v2.8h, v3.8h + uaddlv s2, v2.8h + cmp w4, #16 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + add v0.4s, v0.4s, v16.4s + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v2.8h, v2.8h, v4.8h + cmp w4, #32 + uaddlv s2, v2.8h + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_tbl): + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) +endfunc + +// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_420_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_420_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + subs w8, w8, #2 + st1 {v0.8h}, [x0], #16 + uaddw v24.4s, 
v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + b.gt 1b + trn2 v1.2d, v0.2d, v0.2d + trn2 v0.2d, v0.2d, v0.2d +L(ipred_cfl_ac_420_w4_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + add v24.4s, v24.4s, v25.4s + add v26.4s, v26.4s, v27.4s + add v0.4s, v24.4s, v26.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #3 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h}, [x0] + subs w6, w6, #4 + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w8): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v4.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + mov v0.16b, v1.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + dup v1.4h, v0.h[3] + dup v3.4h, v0.h[7] + trn2 v2.2d, v0.2d, v0.2d + subs w8, w8, #2 + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw v25.4s, v25.4s, v1.4h + uaddw v26.4s, v26.4s, v2.4h + uaddw v27.4s, v27.4s, v3.4h + b.gt 1b + trn1 v0.2d, v2.2d, v3.2d + trn1 v1.2d, v2.2d, v3.2d + +L(ipred_cfl_ac_420_w8_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl w6, w6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_420_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_420_w16_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 + add v0.8h, v0.8h, v4.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 + add v2.8h, v2.8h, v6.8h + addp v16.8h, v16.8h, v17.8h + addp v18.8h, v18.8h, v19.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + add v16.8h, v16.8h, v20.8h + add v18.8h, v18.8h, v22.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v2.8h, #1 + shl v2.8h, v16.8h, #1 + 
shl v3.8h, v18.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q5, [x10, #32] + ld1 {v3.8h, v4.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h + addp v0.8h, v0.8h, v1.8h + addp v5.8h, v5.8h, v5.8h + addp v3.8h, v3.8h, v4.8h + ldr q18, [x1, #32] + add v2.4h, v2.4h, v5.4h + ld1 {v16.8h, v17.8h}, [x1], x2 + add v0.8h, v0.8h, v3.8h + ldr q21, [x10, #32] + ld1 {v19.8h, v20.8h}, [x10], x2 + addp v18.8h, v18.8h, v18.8h + addp v16.8h, v16.8h, v17.8h + addp v21.8h, v21.8h, v21.8h + addp v19.8h, v19.8h, v20.8h + add v18.4h, v18.4h, v21.4h + add v16.8h, v16.8h, v19.8h + shl v1.4h, v2.4h, #1 + shl v0.8h, v0.8h, #1 + shl v3.4h, v18.4h, #1 + shl v2.8h, v16.8h, #1 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v2.8h, v4.8h, #1 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + ld1 {v4.8h}, [x1], x2 + ld1 {v6.8h}, [x10], x2 + addp v0.8h, v0.8h, v4.8h + addp v2.8h, v2.8h, v6.8h + add v0.8h, v0.8h, v2.8h + shl v0.8h, v0.8h, #1 + dup v1.8h, v0.h[3] + dup v3.8h, v0.h[7] + trn2 v2.2d, v0.2d, v3.2d + trn1 v0.2d, v0.2d, v1.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + +L(ipred_cfl_ac_420_w16_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, 
v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl w6, w6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_tbl): + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) + .hword 0 + +L(ipred_cfl_ac_420_w16_tbl): + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) +endfunc + +// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_422_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_422_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v0.h[3] + dup v5.8h, v0.h[7] + dup v6.4h, v2.h[3] + dup v7.8h, 
v2.h[7] + trn2 v1.2d, v0.2d, v5.2d + trn1 v0.2d, v0.2d, v4.2d + trn2 v3.2d, v2.2d, v7.2d + trn1 v2.2d, v2.2d, v6.2d + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_422_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_422_w16_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q6, [x10, #32] + ld1 {v4.8h, v5.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h + addp v0.8h, v0.8h, v1.8h + addp v6.8h, v6.8h, v6.8h + addp v4.8h, v4.8h, v5.8h + shl v1.4h, v2.4h, #2 + shl v0.8h, v0.8h, #2 + shl v3.4h, v6.4h, #2 + shl v2.8h, v4.8h, #2 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + addp v0.8h, v0.8h, v0.8h + addp v2.8h, v2.8h, v2.8h + shl v0.4h, v0.4h, #2 + shl v2.4h, v2.4h, #2 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, 
v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_tbl): + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) + .hword 0 + +L(ipred_cfl_ac_422_w16_tbl): + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) +endfunc + +// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input + ld1 {v0.4h}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v1.4h}, [x1], x2 + ld1 {v1.d}[1], [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + ld1 {v3.8h}, [x10], x2 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v2.8h, v2.8h, #3 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs 
w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + lsr x2, x2, #1 // Restore the stride to one line increments + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 8 + ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 + shl v2.8h, v2.8h, #3 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + dup v3.8h, v2.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 16 + ld1 {v0.8h, v1.8h}, [x1], x2 + shl v1.8h, v1.8h, #3 + shl v0.8h, v0.8h, #3 + dup v2.8h, v1.h[7] + dup v3.8h, v1.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 24 + ld1 {v0.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + dup v1.8h, v0.h[7] + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 
v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl w6, w6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S new file mode 100644 index 0000000000..b1b2f8fe65 --- /dev/null +++ b/third_party/dav1d/src/arm/64/itx.S @@ -0,0 +1,3270 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +const idct_coeffs, align=4 + // idct4 + .short 2896, 2896*8, 1567, 3784 + // idct8 + .short 799, 4017, 3406, 2276 + // idct16 + .short 401, 4076, 3166, 2598 + .short 1931, 3612, 3920, 1189 + // idct32 + .short 201, 4091, 3035, 2751 + .short 1751, 3703, 3857, 1380 + .short 995, 3973, 3513, 2106 + .short 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .short 101*8, 4095*8, 2967*8, -2824*8 + .short 1660*8, 3745*8, 3822*8, -1474*8 + .short 4076, 401, 4017, 799 + .short 0, 0, 0, 0 + + .short 4036*8, -700*8, 2359*8, 3349*8 + .short 3461*8, -2191*8, 897*8, 3996*8 + .short -3166, -2598, -799, -4017 + .short 0, 0, 0, 0 + + .short 501*8, 4065*8, 3229*8, -2520*8 + .short 2019*8, 3564*8, 3948*8, -1092*8 + .short 3612, 1931, 2276, 3406 + .short 0, 0, 0, 0 + + .short 4085*8, -301*8, 2675*8, 3102*8 + .short 3659*8, -1842*8, 1285*8, 3889*8 + .short -3920, -1189, -3406, -2276 + .short 0, 0, 0, 0 +endconst + +const iadst4_coeffs, align=4 + // .h[4-5] can be interpreted as .s[2] + .short 1321, 3803, 2482, 3344, 3344, 0 +endconst + +const iadst8_coeffs, align=4 + .short 4076, 401, 3612, 1931 + .short 2598, 3166, 1189, 3920 + // idct_coeffs + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +const iadst16_coeffs, align=4 + .short 4091, 201, 3973, 995 + .short 3703, 1751, 3290, 2440 + .short 2751, 3035, 2106, 3513 + .short 1380, 3857, 601, 4052 +endconst + +.macro smull_smlal d0, d1, s0, s1, c0, c1, sz + smull \d0\().4s, \s0\().4h, \c0 + smlal \d0\().4s, \s1\().4h, \c1 +.ifc \sz, .8h + smull2 \d1\().4s, \s0\().8h, \c0 + smlal2 \d1\().4s, \s1\().8h, \c1 +.endif +.endm + +.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz + smull \d0\().4s, \s0\().4h, \c0 + smlsl \d0\().4s, \s1\().4h, \c1 +.ifc \sz, .8h + smull2 \d1\().4s, \s0\().8h, \c0 + smlsl2 \d1\().4s, \s1\().8h, \c1 +.endif +.endm + +.macro sqrshrn_sz d0, s0, s1, shift, sz + sqrshrn \d0\().4h, \s0\().4s, \shift +.ifc \sz, .8h + sqrshrn2 \d0\().8h, \s1\().4s, \shift +.endif +.endm + +.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, 
r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + uaddw \adddst, \adddst, \addsrc +.endif +.ifnb \narrowsrc + sqxtun \narrowdst, \narrowsrc +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + load_add_store v2.8b, v16.8h, , , , , , \dst, \src + load_add_store v3.8b, v17.8h, , , , , , \dst, \src + load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src + load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src + load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src + load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src + load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src + load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src + load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src + load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src + load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src + load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src + load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src + load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src + load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src + load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src + load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src + load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src + load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src + load_add_store , , , , , , v5.8b, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits + load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits + load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits + load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits + load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits + load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits + load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits + load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits + load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src + mov \src, \dst + load_add_store v2.8b, v16.8h, , , , , , \dst, \src + load_add_store v3.8b, v17.8h, , , , , , \dst, \src + load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src + load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src + load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src + 
load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src + load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src + load_add_store , , , , , , v5.8b, \dst, \src +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] +.endif +.ifnb \shift + srshr \shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + uaddw \adddst, \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \narrowsrc + sqxtun \narrowdst, \narrowsrc +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src + load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src + load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src + load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src + load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src + load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src + load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src + load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src + load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src + load_add_store4 , , , , , , , , v7.s, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src + load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src + load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src + load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src + load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src + load_add_store4 , , , , , , , , v3.s, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz w3, 1f + mov w16, #2896*8 + ld1r {v16.8h}, [x2] + dup v0.4h, w16 + sqrdmulh v16.8h, v16.8h, v0.h[0] + strh wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v16.8h, v16.8h, v0.h[0] +.endif +.if \shift > 0 + srshr v16.8h, v16.8h, #\shift +.endif + sqrdmulh v16.8h, v16.8h, v0.h[0] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon +1: + ld1 {v0.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ld1 {v1.s}[0], [x0], x1 + ld1 {v1.s}[1], [x0], x1 + subs w4, w4, #4 + sub x0, x0, x1, lsl #2 + uaddw v0.8h, v16.8h, v0.8b + sqxtun v0.8b, v0.8h + uaddw v1.8h, v16.8h, v1.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v1.8b, v1.8h + st1 {v0.s}[1], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon +1: + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v3.8b}, [x0], x1 + sub x0, x0, x1, lsl #2 + subs w4, w4, #4 + uaddw v21.8h, v16.8h, v1.8b + sqxtun v0.8b, v20.8h + uaddw v22.8h, v16.8h, v2.8b + sqxtun v1.8b, v21.8h + uaddw v23.8h, v16.8h, v3.8b + st1 {v0.8b}, [x0], x1 + 
sqxtun v2.8b, v22.8h + st1 {v1.8b}, [x0], x1 + sqxtun v3.8b, v23.8h + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon +1: + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 + ld1 {v2.16b}, [x0], x1 + subs w4, w4, #4 + uaddw v20.8h, v16.8h, v0.8b + uaddw2 v21.8h, v16.8h, v0.16b + ld1 {v3.16b}, [x0], x1 + uaddw v22.8h, v16.8h, v1.8b + uaddw2 v23.8h, v16.8h, v1.16b + sub x0, x0, x1, lsl #2 + uaddw v24.8h, v16.8h, v2.8b + uaddw2 v25.8h, v16.8h, v2.16b + sqxtun v0.8b, v20.8h + sqxtun2 v0.16b, v21.8h + uaddw v26.8h, v16.8h, v3.8b + uaddw2 v27.8h, v16.8h, v3.16b + sqxtun v1.8b, v22.8h + sqxtun2 v1.16b, v23.8h + sqxtun v2.8b, v24.8h + sqxtun2 v2.16b, v25.8h + st1 {v0.16b}, [x0], x1 + sqxtun v3.8b, v26.8h + sqxtun2 v3.16b, v27.8h + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w32_neon +1: + ld1 {v0.16b, v1.16b}, [x0], x1 + subs w4, w4, #2 + uaddw v20.8h, v16.8h, v0.8b + uaddw2 v21.8h, v16.8h, v0.16b + ld1 {v2.16b, v3.16b}, [x0] + uaddw v22.8h, v16.8h, v1.8b + uaddw2 v23.8h, v16.8h, v1.16b + sub x0, x0, x1 + uaddw v24.8h, v16.8h, v2.8b + uaddw2 v25.8h, v16.8h, v2.16b + sqxtun v0.8b, v20.8h + sqxtun2 v0.16b, v21.8h + uaddw v26.8h, v16.8h, v3.8b + uaddw2 v27.8h, v16.8h, v3.16b + sqxtun v1.8b, v22.8h + sqxtun2 v1.16b, v23.8h + sqxtun v2.8b, v24.8h + sqxtun2 v2.16b, v25.8h + st1 {v0.16b, v1.16b}, [x0], x1 + sqxtun v3.8b, v26.8h + sqxtun2 v3.16b, v27.8h + st1 {v2.16b, v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon +1: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + subs w4, w4, #1 + uaddw v20.8h, v16.8h, v0.8b + uaddw2 v21.8h, v16.8h, v0.16b + uaddw v22.8h, v16.8h, v1.8b + uaddw2 v23.8h, v16.8h, v1.16b + uaddw v24.8h, v16.8h, v2.8b + uaddw2 v25.8h, v16.8h, v2.16b + sqxtun v0.8b, v20.8h + sqxtun2 v0.16b, v21.8h + uaddw v26.8h, v16.8h, v3.8b + uaddw2 v27.8h, v16.8h, v3.16b + sqxtun v1.8b, v22.8h + sqxtun2 v1.16b, v23.8h + sqxtun v2.8b, v24.8h + sqxtun2 v2.16b, v25.8h + sqxtun v3.8b, v26.8h + sqxtun2 v3.16b, v27.8h + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4h, v16.4h, v17.4h + sub v21.4h, v18.4h, v19.4h + sub v20.4h, v16.4h, v21.4h + sshr v20.4h, v20.4h, #1 + sub v18.4h, v20.4h, v17.4h + sub v17.4h, v20.4h, v19.4h + add v19.4h, v21.4h, v18.4h + sub v16.4h, v16.4h, v17.4h +.endm + +.macro idct_4 r0, r1, r2, r3, sz + smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz + smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz + smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz + sqrshrn_sz v6, v6, v7, #12, \sz + sqrshrn_sz v7, v4, v5, #12, \sz + smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz + sqrshrn_sz v2, v2, v3, #12, \sz + sqrshrn_sz v3, v4, v5, #12, \sz + sqadd \r0\sz, v2\sz, v6\sz + sqsub \r3\sz, v2\sz, v6\sz + sqadd \r1\sz, v3\sz, v7\sz + sqsub \r2\sz, v3\sz, v7\sz +.endm + +function inv_dct_4h_x4_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.4h}, [x16] + idct_4 v16, v17, v18, v19, .4h + ret +endfunc + +function inv_dct_8h_x4_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.4h}, [x16] + idct_4 v16, v17, v18, v19, .8h + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.8h}, [x16] + + ssubl v3.4s, v16.4h, v18.4h + smull v4.4s, v16.4h, v0.h[0] + smlal v4.4s, v18.4h, v0.h[1] + smlal v4.4s, v19.4h, v0.h[2] + smull v7.4s, v17.4h, v0.h[3] + saddw v3.4s, v3.4s, v19.4h + smull v5.4s, v16.4h, v0.h[2] + smlsl v5.4s, v18.4h, v0.h[0] + smlsl v5.4s, v19.4h, v0.h[1] + + 
add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[2] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + sqrshrn \o0\().4h, \o0\().4s, #12 + sqrshrn \o2\().4h, \o2\().4s, #12 + sqrshrn \o1\().4h, \o1\().4s, #12 + sqrshrn \o3\().4h, \o3\().4s, #12 +.endm + +function inv_adst_4h_x4_neon, export=1 + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4h_x4_neon, export=1 + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +.macro iadst_8x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.8h}, [x16] + + ssubl v2.4s, v16.4h, v18.4h + ssubl2 v3.4s, v16.8h, v18.8h + smull v4.4s, v16.4h, v0.h[0] + smlal v4.4s, v18.4h, v0.h[1] + smlal v4.4s, v19.4h, v0.h[2] + smull2 v5.4s, v16.8h, v0.h[0] + smlal2 v5.4s, v18.8h, v0.h[1] + smlal2 v5.4s, v19.8h, v0.h[2] + saddw v2.4s, v2.4s, v19.4h + saddw2 v3.4s, v3.4s, v19.8h + smull v6.4s, v16.4h, v0.h[2] + smlsl v6.4s, v18.4h, v0.h[0] + smlsl v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v16.8h, v0.h[2] + smlsl2 v7.4s, v18.8h, v0.h[0] + smlsl2 v7.4s, v19.8h, v0.h[1] + + mul v18.4s, v2.4s, v0.s[2] + mul v19.4s, v3.4s, v0.s[2] + + smull v2.4s, v17.4h, v0.h[3] + smull2 v3.4s, v17.8h, v0.h[3] + + add v16.4s, v4.4s, v2.4s // out0 + add v17.4s, v5.4s, v3.4s + + add v4.4s, v4.4s, v6.4s // out3 + add v5.4s, v5.4s, v7.4s + + add v6.4s, v6.4s, v2.4s // out1 + add v7.4s, v7.4s, v3.4s + + sub v4.4s, v4.4s, v2.4s // out3 + sub v5.4s, v5.4s, v3.4s + + sqrshrn v18.4h, v18.4s, #12 + sqrshrn2 v18.8h, v19.4s, #12 + + sqrshrn \o0\().4h, v16.4s, #12 + sqrshrn2 \o0\().8h, v17.4s, #12 + +.ifc \o2, v17 + mov v17.16b, v18.16b +.endif + + sqrshrn \o1\().4h, v6.4s, #12 + sqrshrn2 \o1\().8h, v7.4s, #12 + + sqrshrn \o3\().4h, v4.4s, #12 + sqrshrn2 \o3\().8h, v5.4s, #12 +.endm + +function inv_adst_8h_x4_neon, export=1 + iadst_8x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_8h_x4_neon, export=1 + iadst_8x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4h_x4_neon, export=1 + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + sqrdmulh v4.4h, v16.4h, v0.h[0] + sqrdmulh v5.4h, v17.4h, v0.h[0] + sqrdmulh v6.4h, v18.4h, v0.h[0] + sqrdmulh v7.4h, v19.4h, v0.h[0] + sqadd v16.4h, v16.4h, v4.4h + sqadd v17.4h, v17.4h, v5.4h + sqadd v18.4h, v18.4h, v6.4h + sqadd v19.4h, v19.4h, v7.4h + ret +endfunc + +function inv_identity_8h_x4_neon, export=1 + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + sqrdmulh v4.8h, v16.8h, v0.h[0] + sqrdmulh v5.8h, v17.8h, v0.h[0] + sqrdmulh v6.8h, v18.8h, v0.h[0] + sqrdmulh v7.8h, v19.8h, v0.h[0] + sqadd v16.8h, v16.8h, v4.8h + sqadd v17.8h, v17.8h, v5.8h + sqadd v18.8h, v18.8h, v6.8h + sqadd v19.8h, v19.8h, v7.8h + ret +endfunc + +.macro identity_8x4_shift1 r0, r1, r2, r3, c +.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h + sqrdmulh v2.8h, \i, \c + srhadd \i, \i, v2.8h +.endr +.endm + +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 + mov x15, x30 + movi v31.8h, #0 + ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] + st1 {v31.8h}, [x2], #16 + + sshr v16.4h, v16.4h, #2 + sshr v17.4h, v17.4h, #2 + sshr v18.4h, v18.4h, #2 + sshr v19.4h, v19.4h, #2 + + iwht4 + + st1 {v31.8h}, [x2], #16 + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.s}[0], [x0], x1 + ld1 {v1.s}[1], [x0], x1 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v31.8h, #0 + ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] + st1 {v31.8h}, [x2], #16 + + blr x4 + + st1 {v31.8h}, [x2], #16 + 
transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 {v0.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.s}[0], [x0], x1 + ld1 {v1.s}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + sub x0, x0, x1, lsl #2 + uaddw v16.8h, v16.8h, v0.8b + sqxtun v0.8b, v16.8h + uaddw v18.8h, v18.8h, v1.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v1.8b, v18.8h + st1 {v0.s}[1], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[1], [x0], x1 + + ret x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + mov w16, #2896*8 + ld1r {v16.8h}, [x2] + dup v4.8h, w16 + strh wzr, [x2] + sqrdmulh v16.8h, v16.8h, v4.h[0] + ld1 {v0.s}[0], [x0], x1 + sqrdmulh v20.8h, v16.8h, v4.h[0] + ld1 {v0.s}[1], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.s}[0], [x0], x1 + srshr v18.8h, v20.8h, #4 + ld1 {v1.s}[1], [x0], x1 + b L(itx_4x4_end) +1: +.endif + adr x4, inv_\txfm1\()_4h_x4_neon + adr x5, inv_\txfm2\()_4h_x4_neon + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb + idct_4 \r0, \r2, \r4, \r6, \sz + + smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a + smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a + smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a + sqrshrn_sz \r1, v2, v3, #12, \sz // t4a + sqrshrn_sz \r7, v4, v5, #12, \sz // t7a + smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a + sqrshrn_sz \r3, v6, v7, #12, \sz // t5a + sqrshrn_sz \r5, v2, v3, #12, \sz // t6a + + sqadd v2\sz, \r1\sz, \r3\sz // t4 + sqsub \r1\sz, \r1\sz, \r3\sz // t5a + sqadd v3\sz, \r7\sz, \r5\sz // t7 + sqsub \r3\sz, \r7\sz, \r5\sz // t6a + + smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 + smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 + sqrshrn_sz v4, v4, v5, #12, \sz // t5 + sqrshrn_sz v5, v6, v7, #12, \sz // t6 + + sqsub \r7\sz, \r0\sz, v3\sz // out7 + sqadd \r0\sz, \r0\sz, v3\sz // out0 + sqadd \r1\sz, \r2\sz, v5\sz // out1 + sqsub v6\sz, \r2\sz, v5\sz // out6 + sqadd \r2\sz, \r4\sz, v4\sz // out2 + sqsub \r5\sz, \r4\sz, v4\sz // out5 + sqadd \r3\sz, \r6\sz, v2\sz // out3 + sqsub \r4\sz, \r6\sz, v2\sz // out4 + mov \r6\szb, v6\szb // out6 +.endm + +function inv_dct_8h_x8_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.8h}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b + ret +endfunc + +function inv_dct_4h_x8_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.8h}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b + ret +endfunc + +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz + movrel x16, iadst8_coeffs + ld1 {v0.8h, v1.8h}, [x16] + + smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz + smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz + smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz + sqrshrn_sz v16, v2, v3, #12, \sz // t0a + sqrshrn_sz v23, v4, v5, #12, \sz // t1a + smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz + smull_smlal 
v4, v5, v19, v20, v0.h[4], v0.h[5], \sz + sqrshrn_sz v18, v6, v7, #12, \sz // t2a + sqrshrn_sz v21, v2, v3, #12, \sz // t3a + smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz + smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz + sqrshrn_sz v20, v4, v5, #12, \sz // t4a + sqrshrn_sz v19, v6, v7, #12, \sz // t5a + smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz + sqrshrn_sz v22, v2, v3, #12, \sz // t6a + sqrshrn_sz v17, v4, v5, #12, \sz // t7a + + sqadd v2\sz, v16\sz, v20\sz // t0 + sqsub v3\sz, v16\sz, v20\sz // t4 + sqadd v4\sz, v23\sz, v19\sz // t1 + sqsub v5\sz, v23\sz, v19\sz // t5 + sqadd v6\sz, v18\sz, v22\sz // t2 + sqsub v7\sz, v18\sz, v22\sz // t6 + sqadd v18\sz, v21\sz, v17\sz // t3 + sqsub v19\sz, v21\sz, v17\sz // t7 + + smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz + smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz + smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz + + sqrshrn_sz v3, v16, v17, #12, \sz // t4a + sqrshrn_sz v5, v20, v21, #12, \sz // t5a + + smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz + + sqrshrn_sz v7, v22, v23, #12, \sz // t6a + sqrshrn_sz v19, v16, v17, #12, \sz // t7a + + sqadd \o0\()\sz, v2\sz, v6\sz // out0 + sqsub v2\sz, v2\sz, v6\sz // t2 + sqadd \o7\()\sz, v4\sz, v18\sz // out7 + sqsub v4\sz, v4\sz, v18\sz // t3 + sqneg \o7\()\sz, \o7\()\sz // out7 + + sqadd \o1\()\sz, v3\sz, v7\sz // out1 + sqsub v3\sz, v3\sz, v7\sz // t6 + sqadd \o6\()\sz, v5\sz, v19\sz // out6 + sqsub v5\sz, v5\sz, v19\sz // t7 + sqneg \o1\()\sz, \o1\()\sz // out1 + + smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) + smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) + smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) + sqrshrn_sz v2, v18, v19, #12, \sz // out3 + smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) + sqrshrn_sz v3, v20, v21, #12, \sz // out5 + sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) + sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) + + sqneg \o3\()\sz, v2\sz // out3 + sqneg \o5\()\sz, v3\sz // out5 +.endm + +function inv_adst_8h_x8_neon, export=1 + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h + ret +endfunc + +function inv_flipadst_8h_x8_neon, export=1 + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h + ret +endfunc + +function inv_adst_4h_x8_neon, export=1 + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h + ret +endfunc + +function inv_flipadst_4h_x8_neon, export=1 + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h + ret +endfunc + +function inv_identity_8h_x8_neon, export=1 + sqshl v16.8h, v16.8h, #1 + sqshl v17.8h, v17.8h, #1 + sqshl v18.8h, v18.8h, #1 + sqshl v19.8h, v19.8h, #1 + sqshl v20.8h, v20.8h, #1 + sqshl v21.8h, v21.8h, #1 + sqshl v22.8h, v22.8h, #1 + sqshl v23.8h, v23.8h, #1 + ret +endfunc + +function inv_identity_4h_x8_neon, export=1 + sqshl v16.4h, v16.4h, #1 + sqshl v17.4h, v17.4h, #1 + sqshl v18.4h, v18.4h, #1 + sqshl v19.4h, v19.4h, #1 + sqshl v20.4h, v20.4h, #1 + sqshl v21.4h, v21.4h, #1 + sqshl v22.4h, v22.4h, #1 + sqshl v23.4h, v23.4h, #1 + ret +endfunc + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_neon + movi v28.8h, #0 + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64 + ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2] + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] + +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blr x4 + + srshr 
v16.8h, v16.8h, #1 + srshr v17.8h, v17.8h, #1 + srshr v18.8h, v18.8h, #1 + srshr v19.8h, v19.8h, #1 + srshr v20.8h, v20.8h, #1 + srshr v21.8h, v21.8h, #1 + srshr v22.8h, v22.8h, #1 + srshr v23.8h, v23.8h, #1 +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + blr x5 + + load_add_store_8x8 x0, x7 + ret x15 +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +.macro def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + adr x5, inv_\txfm2\()_8h_x8_neon +.ifc \txfm1, identity + b inv_txfm_identity_add_8x8_neon +.else + adr x4, inv_\txfm1\()_8h_x8_neon + b inv_txfm_add_8x8_neon +.endif +endfunc +.endm + +def_fn_8x8 dct, dct +def_fn_8x8 identity, identity +def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst +def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst +def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity +def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst + +function inv_txfm_add_8x4_neon + movi v30.8h, #0 + movi v31.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] + st1 {v30.8h,v31.8h}, [x2], #32 + ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2] + st1 {v30.8h,v31.8h}, [x2] + + scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + ret x15 +endfunc + +function inv_txfm_add_4x8_neon + movi v28.8h, #0 + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] + + scale_input .8h, v0.h[0], v16, v17, v18, v19 + + blr x4 + + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + ins v20.d[0], v16.d[1] + ins v21.d[0], v17.d[1] + ins v22.d[0], v18.d[1] + ins v23.d[0], v19.d[1] + + blr x5 + + load_add_store_4x8 x0, x7 + ret x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon + adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 \w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +.macro idct_16 sz, szb + idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb + + smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a + smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a + smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a + sqrshrn_sz v17, v2, v3, #12, \sz // t8a + sqrshrn_sz v31, 
v4, v5, #12, \sz // t15a + smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a + smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a + sqrshrn_sz v23, v6, v7, #12, \sz // t9a + sqrshrn_sz v25, v2, v3, #12, \sz // t14a + smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a + smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a + sqrshrn_sz v21, v4, v5, #12, \sz // t10a + sqrshrn_sz v27, v6, v7, #12, \sz // t13a + smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a + sqrshrn_sz v19, v2, v3, #12, \sz // t11a + sqrshrn_sz v29, v4, v5, #12, \sz // t12a + + sqsub v2\sz, v17\sz, v23\sz // t9 + sqadd v17\sz, v17\sz, v23\sz // t8 + sqsub v3\sz, v31\sz, v25\sz // t14 + sqadd v31\sz, v31\sz, v25\sz // t15 + sqsub v23\sz, v19\sz, v21\sz // t10 + sqadd v19\sz, v19\sz, v21\sz // t11 + sqadd v25\sz, v29\sz, v27\sz // t12 + sqsub v29\sz, v29\sz, v27\sz // t13 + + smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a + smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a + sqrshrn_sz v21, v4, v5, #12, \sz // t9a + sqrshrn_sz v27, v6, v7, #12, \sz // t14a + + smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a + smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a + sqrshrn_sz v29, v4, v5, #12, \sz // t13a + neg v6.4s, v6.4s +.ifc \sz, .8h + neg v7.4s, v7.4s +.endif + sqrshrn_sz v23, v6, v7, #12, \sz // t10a + + sqsub v2\sz, v17\sz, v19\sz // t11a + sqadd v17\sz, v17\sz, v19\sz // t8a + sqsub v3\sz, v31\sz, v25\sz // t12a + sqadd v31\sz, v31\sz, v25\sz // t15a + sqadd v19\sz, v21\sz, v23\sz // t9 + sqsub v21\sz, v21\sz, v23\sz // t10 + sqsub v25\sz, v27\sz, v29\sz // t13 + sqadd v27\sz, v27\sz, v29\sz // t14 + + smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11 + smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 + smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a + + sqrshrn_sz v4, v4, v5, #12, \sz // t11 + sqrshrn_sz v5, v6, v7, #12, \sz // t12 + smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a + sqrshrn_sz v2, v2, v3, #12, \sz // t10a + sqrshrn_sz v3, v6, v7, #12, \sz // t13a + + sqadd v6\sz, v16\sz, v31\sz // out0 + sqsub v31\sz, v16\sz, v31\sz // out15 + mov v16\szb, v6\szb + sqadd v23\sz, v30\sz, v17\sz // out7 + sqsub v7\sz, v30\sz, v17\sz // out8 + sqadd v17\sz, v18\sz, v27\sz // out1 + sqsub v30\sz, v18\sz, v27\sz // out14 + sqadd v18\sz, v20\sz, v3\sz // out2 + sqsub v29\sz, v20\sz, v3\sz // out13 + sqadd v3\sz, v28\sz, v19\sz // out6 + sqsub v25\sz, v28\sz, v19\sz // out9 + sqadd v19\sz, v22\sz, v5\sz // out3 + sqsub v28\sz, v22\sz, v5\sz // out12 + sqadd v20\sz, v24\sz, v4\sz // out4 + sqsub v27\sz, v24\sz, v4\sz // out11 + sqadd v21\sz, v26\sz, v2\sz // out5 + sqsub v26\sz, v26\sz, v2\sz // out10 + mov v24\szb, v7\szb + mov v22\szb, v3\szb +.endm + +function inv_dct_8h_x16_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.8h, v1.8h}, [x16] + idct_16 .8h, .16b + ret +endfunc + +function inv_dct_4h_x16_neon, export=1 + movrel x16, idct_coeffs + ld1 {v0.8h, v1.8h}, [x16] + idct_16 .4h, .8b + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb + movrel x16, iadst16_coeffs + ld1 {v0.8h, v1.8h}, [x16] + movrel x16, idct_coeffs + + smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 + smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 + smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 + sqrshrn_sz v16, v2, v3, #12, \sz // t0 + sqrshrn_sz v31, v4, v5, #12, \sz // t1 + smull_smlsl 
v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 + smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 + sqrshrn_sz v18, v6, v7, #12, \sz // t2 + sqrshrn_sz v29, v2, v3, #12, \sz // t3 + smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 + smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 + sqrshrn_sz v20, v4, v5, #12, \sz // t4 + sqrshrn_sz v27, v6, v7, #12, \sz // t5 + smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 + smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 + sqrshrn_sz v22, v2, v3, #12, \sz // t6 + sqrshrn_sz v25, v4, v5, #12, \sz // t7 + smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 + smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 + sqrshrn_sz v23, v6, v7, #12, \sz // t8 + sqrshrn_sz v24, v2, v3, #12, \sz // t9 + smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 + smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 + sqrshrn_sz v21, v4, v5, #12, \sz // t10 + sqrshrn_sz v26, v6, v7, #12, \sz // t11 + smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 + smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 + sqrshrn_sz v19, v2, v3, #12, \sz // t12 + sqrshrn_sz v28, v4, v5, #12, \sz // t13 + smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 + sqrshrn_sz v17, v6, v7, #12, \sz // t14 + sqrshrn_sz v30, v2, v3, #12, \sz // t15 + + ld1 {v0.8h}, [x16] + + sqsub v2\sz, v16\sz, v23\sz // t8a + sqadd v16\sz, v16\sz, v23\sz // t0a + sqsub v3\sz, v31\sz, v24\sz // t9a + sqadd v31\sz, v31\sz, v24\sz // t1a + sqadd v23\sz, v18\sz, v21\sz // t2a + sqsub v18\sz, v18\sz, v21\sz // t10a + sqadd v24\sz, v29\sz, v26\sz // t3a + sqsub v29\sz, v29\sz, v26\sz // t11a + sqadd v21\sz, v20\sz, v19\sz // t4a + sqsub v20\sz, v20\sz, v19\sz // t12a + sqadd v26\sz, v27\sz, v28\sz // t5a + sqsub v27\sz, v27\sz, v28\sz // t13a + sqadd v19\sz, v22\sz, v17\sz // t6a + sqsub v22\sz, v22\sz, v17\sz // t14a + sqadd v28\sz, v25\sz, v30\sz // t7a + sqsub v25\sz, v25\sz, v30\sz // t15a + + smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8 + smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 + smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 + sqrshrn_sz v17, v4, v5, #12, \sz // t8 + sqrshrn_sz v30, v6, v7, #12, \sz // t9 + smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 + smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 + sqrshrn_sz v18, v2, v3, #12, \sz // t10 + sqrshrn_sz v29, v4, v5, #12, \sz // t11 + smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 + smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14 + sqrshrn_sz v27, v6, v7, #12, \sz // t12 + sqrshrn_sz v20, v2, v3, #12, \sz // t13 + smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 + sqrshrn_sz v25, v4, v5, #12, \sz // t14 + sqrshrn_sz v22, v6, v7, #12, \sz // t15 + + sqsub v2\sz, v16\sz, v21\sz // t4 + sqadd v16\sz, v16\sz, v21\sz // t0 + sqsub v3\sz, v31\sz, v26\sz // t5 + sqadd v31\sz, v31\sz, v26\sz // t1 + sqadd v21\sz, v23\sz, v19\sz // t2 + sqsub v23\sz, v23\sz, v19\sz // t6 + sqadd v26\sz, v24\sz, v28\sz // t3 + sqsub v24\sz, v24\sz, v28\sz // t7 + sqadd v19\sz, v17\sz, v27\sz // t8a + sqsub v17\sz, v17\sz, v27\sz // t12a + sqadd v28\sz, v30\sz, v20\sz // t9a + sqsub v30\sz, v30\sz, v20\sz // t13a + sqadd v27\sz, v18\sz, v25\sz // t10a + sqsub v18\sz, v18\sz, v25\sz // t14a + sqadd v20\sz, v29\sz, v22\sz // t11a + sqsub v29\sz, v29\sz, v22\sz // t15a + + smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a + smull_smlsl 
v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a + smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a + sqrshrn_sz v22, v4, v5, #12, \sz // t4a + sqrshrn_sz v25, v6, v7, #12, \sz // t5a + smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a + smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 + sqrshrn_sz v24, v2, v3, #12, \sz // t6a + sqrshrn_sz v23, v4, v5, #12, \sz // t7a + smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 + smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 + sqrshrn_sz v17, v6, v7, #12, \sz // t12 + smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 + sqrshrn_sz v29, v2, v3, #12, \sz // t13 + sqrshrn_sz v30, v4, v5, #12, \sz // t14 + sqrshrn_sz v18, v6, v7, #12, \sz // t15 + + sqsub v2\sz, v16\sz, v21\sz // t2a +.ifc \o0, v16 + sqadd \o0\sz, v16\sz, v21\sz // out0 + sqsub v21\sz, v31\sz, v26\sz // t3a + sqadd \o15\sz, v31\sz, v26\sz // out15 +.else + sqadd v4\sz, v16\sz, v21\sz // out0 + sqsub v21\sz, v31\sz, v26\sz // t3a + sqadd \o15\sz, v31\sz, v26\sz // out15 + mov \o0\szb, v4\szb +.endif + sqneg \o15\sz, \o15\sz // out15 + + sqsub v3\sz, v29\sz, v18\sz // t15a + sqadd \o13\sz, v29\sz, v18\sz // out13 + sqadd \o2\sz, v17\sz, v30\sz // out2 + sqsub v26\sz, v17\sz, v30\sz // t14a + sqneg \o13\sz, \o13\sz // out13 + + sqadd \o1\sz, v19\sz, v27\sz // out1 + sqsub v27\sz, v19\sz, v27\sz // t10 + sqadd \o14\sz, v28\sz, v20\sz // out14 + sqsub v20\sz, v28\sz, v20\sz // t11 + sqneg \o1\sz, \o1\sz // out1 + + sqadd \o3\sz, v22\sz, v24\sz // out3 + sqsub v22\sz, v22\sz, v24\sz // t6 + sqadd \o12\sz, v25\sz, v23\sz // out12 + sqsub v23\sz, v25\sz, v23\sz // t7 + sqneg \o3\sz, \o3\sz // out3 + + smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23) + smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) + smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26) + + sqrshrn_sz v24, v24, v25, #12, \sz // out8 + sqrshrn_sz v4, v4, v5, #12, \sz // out7 + sqrshrn_sz v5, v6, v7, #12, \sz // out5 + smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21) + smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27) + sqrshrn_sz v26, v6, v7, #12, \sz // out10 + + smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20) + smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25) + smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22) + + sqrshrn_sz \o4, v2, v3, #12, \sz // out4 + sqrshrn_sz v6, v6, v7, #12, \sz // out11 + sqrshrn_sz v7, v21, v25, #12, \sz // out9 + sqrshrn_sz \o6, v22, v23, #12, \sz // out6 + +.ifc \o8, v23 + mov \o8\szb, v24\szb + mov \o10\szb, v26\szb +.endif + + sqneg \o7\sz, v4\sz // out7 + sqneg \o5\sz, v5\sz // out5 + sqneg \o11\sz, v6\sz // out11 + sqneg \o9\sz, v7\sz // out9 +.endm + +function inv_adst_8h_x16_neon, export=1 + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b + ret +endfunc + +function inv_flipadst_8h_x16_neon, export=1 + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b + ret +endfunc + +function inv_adst_4h_x16_neon, export=1 + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b + ret +endfunc + +function inv_flipadst_4h_x16_neon, export=1 + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b + ret +endfunc + +function 
inv_identity_8h_x16_neon, export=1 + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.8h, v\i\().8h, v0.h[0] + sqadd v\i\().8h, v\i\().8h, v\i\().8h + sqadd v\i\().8h, v\i\().8h, v2.8h +.endr + ret +endfunc + +function inv_identity_4h_x16_neon, export=1 + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4h, v\i\().4h, v0.h[0] + sqadd v\i\().4h, v\i\().4h, v\i\().4h + sqadd v\i\().4h, v\i\().4h, v2.4h +.endr + ret +endfunc + +.macro identity_8x16_shift2 c +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + sqrdmulh v2.8h, \i, \c + sshr v2.8h, v2.8h, #1 + srhadd \i, \i, v2.8h +.endr +.endm + +.macro identity_8x16_shift1 c +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + sqrdmulh v2.8h, \i, \c + srshr v2.8h, v2.8h, #1 + sqadd \i, \i, v2.8h +.endr +.endm + +.macro identity_8x8_shift1 c +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + sqrdmulh v2.8h, \i, \c + srshr v2.8h, v2.8h, #1 + sqadd \i, \i, v2.8h +.endr +.endm + +.macro identity_8x8 c +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + sqrdmulh v2.8h, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, v2.8h +.endr +.endm + +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x8_neon + AARCH64_VALID_CALL_TARGET + mov x14, x30 + movi v7.8h, #0 +.if \identity + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.elseif \scale + mov w16, #2896*8 + dup v0.4h, w16 +.endif +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] + st1 {v7.8h}, [x7], x8 +.endr +.if \scale + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif +.if \identity + identity_8x16_shift2 v0.h[0] +.else + blr x4 +.endif +.if \shift > 0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + srshr \i, \i, #\shift +.endr +.endif + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 + +.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h + st1 {\i}, [x6], #16 +.endr + + ret x14 +endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=0, suffix=_identity + +function inv_txfm_add_vert_8x16_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + ret x14 +endfunc + +function inv_txfm_add_16x16_neon + mov x15, x30 + sub sp, sp, #512 +.irp i, 0, 8 + add x6, sp, #(\i*16*2) +.if \i == 8 + cmp w3, w13 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #16*2 + blr x9 +.endr + b 2f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr +2: +.irp i, 0, 8 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + ret x15 +endfunc + 
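+// Note on the 16x16 case: inv_txfm_add_16x16_neon above works in two passes
+// through a 512 byte stack buffer - two 8-row horizontal passes (via the
+// function pointer in x9) store transposed intermediates, then two 8-column
+// vertical passes (inv_txfm_add_vert_8x16_neon) add the result into the
+// destination. If the eob value (w3) is below the per-transform threshold
+// loaded into x13 by def_fn_16x16 below (eob_half), the second horizontal
+// pass is skipped and that half of the buffer is zero-filled instead,
+// presumably because such a small eob implies the corresponding input
+// coefficients are all zero.
+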
+.macro def_fn_16x16 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif +.ifc \txfm1, identity + adr x9, inv_txfm_horz_identity_16x8_neon +.else + adr x9, inv_txfm_horz_16x8_neon + adr x4, inv_\txfm1\()_8h_x16_neon +.endif + adr x5, inv_\txfm2\()_8h_x16_neon + mov x13, #\eob_half + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct, 36 +def_fn_16x16 identity, identity, 36 +def_fn_16x16 dct, adst, 36 +def_fn_16x16 dct, flipadst, 36 +def_fn_16x16 dct, identity, 8 +def_fn_16x16 adst, dct, 36 +def_fn_16x16 adst, adst, 36 +def_fn_16x16 adst, flipadst, 36 +def_fn_16x16 flipadst, dct, 36 +def_fn_16x16 flipadst, adst, 36 +def_fn_16x16 flipadst, flipadst, 36 +def_fn_16x16 identity, dct, 8 + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_16x4_neon + mov x15, x30 + movi v4.8h, #0 + +.ifc \variant, identity_ +.irp i, v16.4h, v17.4h, v18.4h, v19.4h + ld1 {\i}, [x2] + st1 {v4.4h}, [x2], #8 +.endr +.irp i, v16.d, v17.d, v18.d, v19.d + ld1 {\i}[1], [x2] + st1 {v4.4h}, [x2], #8 +.endr + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + ld1 {\i}, [x2] + st1 {v4.4h}, [x2], #8 +.endr +.irp i, v20.d, v21.d, v22.d, v23.d + ld1 {\i}[1], [x2] + st1 {v4.4h}, [x2], #8 +.endr + + identity_8x16_shift1 v0.h[0] +.else +.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h + ld1 {\i}, [x2] + st1 {v4.4h}, [x2], #8 +.endr + + blr x4 + + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + srshr \i, \i, #1 +.endr +.endif + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + +.ifc \variant, identity_ + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b +.else + ins v24.d[1], v28.d[0] + ins v25.d[1], v29.d[0] + ins v26.d[1], v30.d[0] + ins v27.d[1], v31.d[0] + srshr v16.8h, v24.8h, #1 + srshr v17.8h, v25.8h, #1 + srshr v18.8h, v26.8h, #1 + srshr v19.8h, v27.8h, #1 +.endif + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + add x6, x0, #8 + load_add_store_8x4 x6, x7 + + ret x15 +endfunc + +function inv_txfm_\variant\()add_4x16_neon + mov x15, x30 + movi v2.8h, #0 + + mov x11, #32 + cmp w3, w13 + b.lt 1f + + add x6, x2, #16 +.ifc \variant, identity_ +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + ld1 {\i}, [x6] + st1 {v2.8h}, [x6], x11 +.endr + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + identity_8x4_shift1 v24, v25, v26, v27, v0.h[0] +.else +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + ld1 {\i}, [x6] + st1 {v2.8h}, [x6], x11 +.endr + blr x4 + srshr v24.8h, v16.8h, #1 + srshr v25.8h, v17.8h, #1 + srshr v26.8h, v18.8h, #1 + srshr v27.8h, v19.8h, #1 +.endif + transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7 + ins v28.d[0], v24.d[1] + ins v29.d[0], v25.d[1] + ins v30.d[0], v26.d[1] + ins v31.d[0], v27.d[1] + + b 2f +1: +.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + movi v2.8h, #0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + ld1 {\i}, [x2] + st1 {v2.8h}, [x2], x11 +.endr +.ifc \variant, identity_ + mov w16, #(5793-4096)*8 + dup v0.4h, w16 + identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] +.else + blr x4 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h + srshr \i, \i, #1 +.endr +.endif + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + ins v20.d[0], v16.d[1] + ins 
v21.d[0], v17.d[1] + ins v22.d[0], v18.d[1] + ins v23.d[0], v19.d[1] + + blr x5 + + load_add_store_4x16 x0, x6 + + ret x15 +endfunc +.endm + +def_fn_416_base +def_fn_416_base identity_ + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_4h_x\h\()_neon + mov w13, #\eob_half +.else + adr x4, inv_\txfm1\()_4h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_16x8_neon + mov x15, x30 + movi v4.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x2] + st1 {v4.8h}, [x2], #16 +.endr + + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.ifc \variant, identity_ + mov w16, #2*(5793-4096)*8 + dup v0.4h, w16 + identity_8x16_shift1 v0.h[0] +.else + blr x4 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + srshr \i, \i, #1 +.endr +.endif + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + +.ifc \variant, identity_ + mov v16.16b, v24.16b + mov v17.16b, v25.16b + mov v18.16b, v26.16b + mov v19.16b, v27.16b + mov v20.16b, v28.16b + mov v21.16b, v29.16b + mov v22.16b, v30.16b + mov v23.16b, v31.16b +.else + srshr v16.8h, v24.8h, #1 + srshr v17.8h, v25.8h, #1 + srshr v18.8h, v26.8h, #1 + srshr v19.8h, v27.8h, #1 + srshr v20.8h, v28.8h, #1 + srshr v21.8h, v29.8h, #1 + srshr v22.8h, v30.8h, #1 + srshr v23.8h, v31.8h, #1 +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + + blr x5 + + add x0, x0, #8 + load_add_store_8x8 x0, x7 + + ret x15 +endfunc + +function inv_txfm_\variant\()add_8x16_neon + mov x15, x30 + movi v4.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + mov x11, #32 + + cmp w3, w13 + b.lt 1f + + add x6, x2, #16 +.ifc \variant, identity_ +.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x6] + st1 {v4.8h}, [x6], x11 +.endr + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 + // The identity shl #1 and downshift srshr #1 cancel out +.else +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x6] + st1 {v4.8h}, [x6], x11 +.endr + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + srshr v24.8h, v16.8h, #1 + srshr v25.8h, v17.8h, #1 + srshr v26.8h, v18.8h, #1 + srshr v27.8h, 
v19.8h, #1 + srshr v28.8h, v20.8h, #1 + srshr v29.8h, v21.8h, #1 + srshr v30.8h, v22.8h, #1 + srshr v31.8h, v23.8h, #1 +.endif + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + movi v4.8h, #0 + mov w16, #2896*8 + dup v0.4h, w16 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x2] + st1 {v4.8h}, [x2], x11 +.endr + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blr x4 + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + srshr \i, \i, #1 +.endr +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + + blr x5 + + load_add_store_8x16 x0, x6 + + ret x15 +endfunc +.endm + +def_fn_816_base +def_fn_816_base identity_ + +.macro def_fn_816 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon +.if \w == 8 + mov x13, #\eob_half +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43 +def_fn_816 \w, \h, identity, identity, 43 +def_fn_816 \w, \h, dct, adst, 43 +def_fn_816 \w, \h, dct, flipadst, 43 +def_fn_816 \w, \h, dct, identity, 8 +def_fn_816 \w, \h, adst, dct, 43 +def_fn_816 \w, \h, adst, adst, 43 +def_fn_816 \w, \h, adst, flipadst, 43 +def_fn_816 \w, \h, flipadst, dct, 43 +def_fn_816 \w, \h, flipadst, adst, 43 +def_fn_816 \w, \h, flipadst, flipadst, 43 +def_fn_816 \w, \h, identity, dct, 64 +def_fn_816 \w, \h, adst, identity, 8 +def_fn_816 \w, \h, flipadst, identity, 8 +def_fn_816 \w, \h, identity, adst, 64 +def_fn_816 \w, \h, identity, flipadst, 64 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_8h_x16_neon, export=1 + movrel x16, idct_coeffs, 2*16 + ld1 {v0.8h, v1.8h}, [x16] + sub x16, x16, #2*16 + + smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a + smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a + smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a + sqrshrn_sz v16, v2, v3, #12, .8h // t16a + sqrshrn_sz v31, v4, v5, #12, .8h // t31a + smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a + smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a + sqrshrn_sz v24, v6, v7, #12, .8h // t17a + sqrshrn_sz v23, v2, v3, #12, .8h // t30a + smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a + smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a + sqrshrn_sz v20, v4, v5, #12, .8h // t18a + sqrshrn_sz v27, v6, v7, #12, .8h // t29a + smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a + smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a + sqrshrn_sz v28, v2, v3, #12, .8h // t19a + sqrshrn_sz v19, v4, v5, #12, .8h // t28a + smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a + smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a + sqrshrn_sz v18, v6, v7, #12, .8h // t20a + sqrshrn_sz v29, v2, v3, #12, .8h // t27a + smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a + smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a + sqrshrn_sz v26, v4, v5, #12, .8h // t21a + sqrshrn_sz v21, v6, v7, #12, .8h // 
t26a + smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a + smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a + sqrshrn_sz v22, v2, v3, #12, .8h // t22a + sqrshrn_sz v25, v4, v5, #12, .8h // t25a + smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a + sqrshrn_sz v30, v6, v7, #12, .8h // t23a + sqrshrn_sz v17, v2, v3, #12, .8h // t24a + + ld1 {v0.8h}, [x16] + + sqsub v2.8h, v16.8h, v24.8h // t17 + sqadd v16.8h, v16.8h, v24.8h // t16 + sqsub v3.8h, v31.8h, v23.8h // t30 + sqadd v31.8h, v31.8h, v23.8h // t31 + sqsub v24.8h, v28.8h, v20.8h // t18 + sqadd v28.8h, v28.8h, v20.8h // t19 + sqadd v23.8h, v18.8h, v26.8h // t20 + sqsub v18.8h, v18.8h, v26.8h // t21 + sqsub v20.8h, v30.8h, v22.8h // t22 + sqadd v30.8h, v30.8h, v22.8h // t23 + sqadd v26.8h, v17.8h, v25.8h // t24 + sqsub v17.8h, v17.8h, v25.8h // t25 + sqsub v22.8h, v29.8h, v21.8h // t26 + sqadd v29.8h, v29.8h, v21.8h // t27 + sqadd v25.8h, v19.8h, v27.8h // t28 + sqsub v19.8h, v19.8h, v27.8h // t29 + + smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a + smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a + smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a + sqrshrn_sz v21, v4, v5, #12, .8h // t17a + sqrshrn_sz v27, v6, v7, #12, .8h // t30a + neg v2.4s, v2.4s // -> t18a + neg v3.4s, v3.4s // -> t18a + smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a + smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a + sqrshrn_sz v19, v2, v3, #12, .8h // t18a + sqrshrn_sz v24, v4, v5, #12, .8h // t29a + smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a + smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a + sqrshrn_sz v22, v6, v7, #12, .8h // t21a + sqrshrn_sz v18, v2, v3, #12, .8h // t26a + neg v4.4s, v4.4s // -> t22a + neg v5.4s, v5.4s // -> t22a + smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a + sqrshrn_sz v17, v4, v5, #12, .8h // t22a + sqrshrn_sz v20, v6, v7, #12, .8h // t25a + + sqsub v2.8h, v27.8h, v24.8h // t29 + sqadd v27.8h, v27.8h, v24.8h // t30 + sqsub v3.8h, v21.8h, v19.8h // t18 + sqadd v21.8h, v21.8h, v19.8h // t17 + sqsub v24.8h, v16.8h, v28.8h // t19a + sqadd v16.8h, v16.8h, v28.8h // t16a + sqsub v19.8h, v30.8h, v23.8h // t20a + sqadd v30.8h, v30.8h, v23.8h // t23a + sqsub v28.8h, v17.8h, v22.8h // t21 + sqadd v17.8h, v17.8h, v22.8h // t22 + sqadd v23.8h, v26.8h, v29.8h // t24a + sqsub v26.8h, v26.8h, v29.8h // t27a + sqadd v22.8h, v20.8h, v18.8h // t25 + sqsub v20.8h, v20.8h, v18.8h // t26 + sqsub v29.8h, v31.8h, v25.8h // t28a + sqadd v31.8h, v31.8h, v25.8h // t31a + + smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a + smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a + smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 + sqrshrn_sz v18, v4, v5, #12, .8h // t18a + sqrshrn_sz v25, v6, v7, #12, .8h // t29a + smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 + smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 + sqrshrn_sz v29, v2, v3, #12, .8h // t19 + sqrshrn_sz v24, v4, v5, #12, .8h // t28 + neg v6.4s, v6.4s // -> t20 + neg v7.4s, v7.4s // -> t20 + smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 + smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a + sqrshrn_sz v26, v6, v7, #12, .8h // t20 + sqrshrn_sz v19, v2, v3, #12, .8h // t27 + neg v4.4s, v4.4s // -> t21a + neg v5.4s, v5.4s // -> t21a + smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a + sqrshrn_sz v20, v4, v5, #12, .8h // t21a + sqrshrn_sz v28, v6, 
v7, #12, .8h // t26a + + sqsub v2.8h, v16.8h, v30.8h // t23 + sqadd v16.8h, v16.8h, v30.8h // t16 = out16 + sqsub v3.8h, v31.8h, v23.8h // t24 + sqadd v31.8h, v31.8h, v23.8h // t31 = out31 + sqsub v23.8h, v21.8h, v17.8h // t22a + sqadd v17.8h, v21.8h, v17.8h // t17a = out17 + sqadd v30.8h, v27.8h, v22.8h // t30a = out30 + sqsub v21.8h, v27.8h, v22.8h // t25a + sqsub v27.8h, v18.8h, v20.8h // t21 + sqadd v18.8h, v18.8h, v20.8h // t18 = out18 + sqadd v4.8h, v29.8h, v26.8h // t19a = out19 + sqsub v26.8h, v29.8h, v26.8h // t20a + sqadd v29.8h, v25.8h, v28.8h // t29 = out29 + sqsub v25.8h, v25.8h, v28.8h // t26 + sqadd v28.8h, v24.8h, v19.8h // t28a = out28 + sqsub v24.8h, v24.8h, v19.8h // t27a + mov v19.16b, v4.16b // out19 + + smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 + smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 + sqrshrn_sz v20, v4, v5, #12, .8h // t20 + sqrshrn_sz v22, v6, v7, #12, .8h // t27 + + smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a + smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a + mov v27.16b, v22.16b // t27 + sqrshrn_sz v26, v4, v5, #12, .8h // t26a + + smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 + smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 + sqrshrn_sz v21, v6, v7, #12, .8h // t21a + sqrshrn_sz v22, v24, v25, #12, .8h // t22 + sqrshrn_sz v25, v4, v5, #12, .8h // t25 + + smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a + smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a + sqrshrn_sz v23, v4, v5, #12, .8h // t23a + sqrshrn_sz v24, v6, v7, #12, .8h // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x8_neon + mov x14, x30 + movi v7.8h, #0 + lsl x8, x8, #1 +.if \scale + mov w16, #2896*8 + dup v0.4h, w16 +.endif + +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] + st1 {v7.8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_8h_x16_neon + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 + +.macro store1 r0, r1 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + add x6, x6, #32 +.endm + store1 v16.8h, v24.8h + store1 v17.8h, v25.8h + store1 v18.8h, v26.8h + store1 v19.8h, v27.8h + store1 v20.8h, v28.8h + store1 v21.8h, v29.8h + store1 v22.8h, v30.8h + store1 v23.8h, v31.8h +.purgem store1 + sub x6, x6, #64*8 + + movi v7.8h, #0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] + st1 {v7.8h}, [x7], x8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in v0.h[1] + scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_8h_x16_neon + transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 + transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 +.macro store2 r0, r1, shift + ld1 {v4.8h, v5.8h}, [x6] + sqsub v7.8h, v4.8h, \r0 + sqsub v6.8h, v5.8h, \r1 + sqadd v4.8h, v4.8h, \r0 + sqadd v5.8h, v5.8h, \r1 + rev64 v6.8h, v6.8h + rev64 v7.8h, v7.8h + srshr v4.8h, v4.8h, #\shift + srshr v5.8h, 
v5.8h, #\shift + srshr v6.8h, v6.8h, #\shift + srshr v7.8h, v7.8h, #\shift + ext v6.16b, v6.16b, v6.16b, #8 + st1 {v4.8h, v5.8h}, [x6], #32 + ext v7.16b, v7.16b, v7.16b, #8 + st1 {v6.8h, v7.8h}, [x6], #32 +.endm + + store2 v31.8h, v23.8h, \shift + store2 v30.8h, v22.8h, \shift + store2 v29.8h, v21.8h, \shift + store2 v28.8h, v20.8h, \shift + store2 v27.8h, v19.8h, \shift + store2 v26.8h, v18.8h, \shift + store2 v25.8h, v17.8h, \shift + store2 v24.8h, v16.8h, \shift +.purgem store2 + ret x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + + bl inv_dct_8h_x16_neon + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl inv_dct32_odd_8h_x16_neon + + neg x9, x8 + mov x10, x6 +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8b}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8b}, [x10], x1 + \op v5.8h, v5.8h, \r0 + ld1 {v7.8h}, [x7], \stride + ld1 {v4.8b}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + uaddw v5.8h, v5.8h, v2.8b + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + sqxtun v2.8b, v5.8h + ld1 {v5.8h}, [x7], \stride + uaddw v6.8h, v6.8h, v3.8b + srshr v7.8h, v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8b}, [x6], x1 + ld1 {v2.8b}, [x10], x1 + sqxtun v3.8b, v6.8h + uaddw v7.8h, v7.8h, v4.8b + srshr v5.8h, v5.8h, #4 + st1 {v3.8b}, [x6], x1 + sqxtun v4.8b, v7.8h + uaddw v5.8h, v5.8h, v2.8b + st1 {v4.8b}, [x6], x1 + sqxtun v2.8b, v5.8h + st1 {v2.8b}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + ret x14 +endfunc + +const eob_32x32 + .short 36, 136, 300, 1024 +endconst + +const eob_16x32 + .short 36, 151, 279, 512 +endconst + +const eob_16x32_shortside + .short 36, 512 +endconst + +const eob_8x32 + .short 43, 107, 171, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 + movi v0.8h, #0 + movrel x13, eob_32x32 + + mov x8, #2*32 +1: + mov w9, #0 + movrel x12, eob_32x32 +2: + add w9, w9, #8 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x2] + st1 {v0.8h}, [x2], x8 +.endr + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #2 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #2*8 + b 1b +9: + ret +endfunc + +.macro shift_8_regs op, shift +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function 
inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + mov w16, #2896*8 + mov w17, #2*(5793-4096)*8 + dup v1.4h, w16 + movi v0.8h, #0 + mov v1.h[1], w17 + movrel x13, eob_16x32\hshort + + mov x8, #2*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort +2: + add w9, w9, #8 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x2] + st1 {v0.8h}, [x2], x8 +.endr + scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + +.if \w == 16 + // 16x32 + identity_8x8_shift1 v1.h[1] +.else + // 32x16 + shift_8_regs sqshl, 1 + identity_8x8 v1.h[1] +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #2 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #2*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + movi v0.8h, #0 + movrel x13, eob_8x32 + + mov w8, #2*\h +1: + ldrh w12, [x13], #2 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h + ld1 {\i}, [x2] + st1 {v0.8h}, [x2], x8 +.endr + +.if \w == 8 + // 8x32 + shift_8_regs srshr, 1 +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #2*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 8, 16, 24 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + bl inv_txfm_horz_dct_32x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_8h_x16_neon + +.irp i, 0, 8, 16, 24 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #2*32 + bl inv_txfm_horz_scale_16x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #8 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + adr x5, inv_dct_8h_x16_neon + +.irp 
i, 0, 8 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, #36 + b.lt 1f +.endif + mov x8, #2*16 + bl inv_txfm_horz_scale_dct_32x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.8h, #0 + mov x8, #2*32 + mov w9, #32 + mov x6, sp +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x2] + st1 {v28.8h}, [x2], x8 +.endr + ldrh w12, [x13], #2 + sub x2, x2, x8, lsl #3 + sub w9, w9, #8 + add x2, x2, #2*8 + + bl inv_dct_8h_x8_neon + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + srshr v\i\().8h, v\i\().8h, #2 +.endr + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + cmp w3, w12 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #8 +.rept 2 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 +.endr + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + + mov x6, sp + mov x7, x2 + mov x8, #8*2 + bl inv_txfm_horz_dct_32x8_neon + + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr + add w9, w9, #8 + + bl inv_dct_8h_x8_neon + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + ret x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.8h, v1.8h}, [x17], #32 + + sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a + sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a + sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a + sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a + sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a + sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a + sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a + sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a + + sqadd v24.8h, v16.8h, v17.8h // t32 + sqsub v25.8h, v16.8h, v17.8h // t33 + sqsub v26.8h, v19.8h, v18.8h // t34 + sqadd v27.8h, v19.8h, v18.8h // t35 + sqadd v28.8h, v20.8h, v21.8h // t60 + sqsub v29.8h, v20.8h, v21.8h // t61 + sqsub v30.8h, v23.8h, v22.8h // t62 + sqadd v31.8h, v23.8h, v22.8h // t63 + + smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a + smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a + neg v2.4s, v2.4s // t34a + neg v3.4s, v3.4s // t34a + smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a + sqrshrn_sz v26, v2, v3, #12, .8h // t34a + smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a + sqrshrn_sz v29, v4, v5, #12, .8h // t61a + sqrshrn_sz v25, v6, v7, #12, .8h // t33a + sqrshrn_sz v30, v2, v3, #12, .8h // t62a + + sqadd v16.8h, v24.8h, v27.8h // t32a + sqsub v19.8h, v24.8h, v27.8h // t35a + sqadd v17.8h, v25.8h, v26.8h // t33 + sqsub v18.8h, v25.8h, v26.8h // 
t34 + sqsub v20.8h, v31.8h, v28.8h // t60a + sqadd v23.8h, v31.8h, v28.8h // t63a + sqsub v21.8h, v30.8h, v29.8h // t61 + sqadd v22.8h, v30.8h, v29.8h // t62 + + smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a + smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a + smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 + sqrshrn_sz v21, v2, v3, #12, .8h // t61a + sqrshrn_sz v18, v4, v5, #12, .8h // t34a + smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 + sqrshrn_sz v20, v6, v7, #12, .8h // t60 + sqrshrn_sz v19, v2, v3, #12, .8h // t35 + + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + ld1 {v0.4h}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #2*8*0] // t32a + ldr q17, [x9, #2*8*8] // t39a + ldr q18, [x9, #2*8*0] // t63a + ldr q19, [x6, #2*8*8] // t56a + ldr q20, [x6, #2*8*16] // t40a + ldr q21, [x9, #2*8*24] // t47a + ldr q22, [x9, #2*8*16] // t55a + ldr q23, [x6, #2*8*24] // t48a + + sqadd v24.8h, v16.8h, v17.8h // t32 + sqsub v25.8h, v16.8h, v17.8h // t39 + sqadd v26.8h, v18.8h, v19.8h // t63 + sqsub v27.8h, v18.8h, v19.8h // t56 + sqsub v28.8h, v21.8h, v20.8h // t40 + sqadd v29.8h, v21.8h, v20.8h // t47 + sqadd v30.8h, v23.8h, v22.8h // t48 + sqsub v31.8h, v23.8h, v22.8h // t55 + + smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a + smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a + smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a + sqrshrn_sz v25, v2, v3, #12, .8h // t56a + sqrshrn_sz v27, v4, v5, #12, .8h // t39a + neg v6.4s, v6.4s // t40a + neg v7.4s, v7.4s // t40a + smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a + sqrshrn_sz v31, v6, v7, #12, .8h // t40a + sqrshrn_sz v28, v2, v3, #12, .8h // t55a + + sqadd v16.8h, v24.8h, v29.8h // t32a + sqsub v19.8h, v24.8h, v29.8h // t47a + sqadd v17.8h, v27.8h, v31.8h // t39 + sqsub v18.8h, v27.8h, v31.8h // t40 + sqsub v20.8h, v26.8h, v30.8h // t48a + sqadd v23.8h, v26.8h, v30.8h // t63a + sqsub v21.8h, v25.8h, v28.8h // t55 + sqadd v22.8h, v25.8h, v28.8h // t56 + + smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a + smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a + smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 + sqrshrn_sz v18, v2, v3, #12, .8h // t40a + sqrshrn_sz v21, v4, v5, #12, .8h // t55a + smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 + sqrshrn_sz v19, v6, v7, #12, .8h // t47 + sqrshrn_sz v20, v2, v3, #12, .8h // t48 + + str q16, [x6, #2*8*0] // t32a + str q17, [x9, #2*8*0] // t39 + str q18, [x6, #2*8*8] // t40a + str q19, [x9, #2*8*8] // t47 + str q20, [x6, #2*8*16] // t48 + str q21, [x9, #2*8*16] // t55a + str q22, [x6, #2*8*24] // t56 + str q23, [x9, #2*8*24] // t63a + + add x6, x6, #2*8 + sub x9, x9, #2*8 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, 
v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + mov \gpr, \val + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.8h, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_8h_x16_neon + + store16 x6 + + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.8h, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_8h_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.8h}, [x6], #16 + ld1 {v3.8h}, [x6], #16 + sqadd v6.8h, v2.8h, \r0 + sqsub \r0, v2.8h, \r0 + ld1 {v4.8h}, [x6], #16 + sqadd v7.8h, v3.8h, \r1 + sqsub \r1, v3.8h, \r1 + ld1 {v5.8h}, [x6], #16 + sqadd v2.8h, v4.8h, \r2 + sub x6, x6, #16*4 + sqsub \r2, v4.8h, \r2 + st1 {v6.8h}, [x6], #16 + st1 {\r0}, [x10], x9 + sqadd v3.8h, v5.8h, \r3 + sqsub \r3, v5.8h, \r3 + st1 {v7.8h}, [x6], #16 + st1 {\r1}, [x10], x9 + st1 {v2.8h}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.8h}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.8h, v30.8h, v29.8h, v28.8h + store_addsub v27.8h, v26.8h, v25.8h, v24.8h + store_addsub v23.8h, v22.8h, v21.8h, v20.8h + store_addsub v19.8h, v18.8h, v17.8h, v16.8h +.purgem store_addsub + + add x6, x6, #2*8*16 + + movrel x17, idct64_coeffs + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.8h}, [x7] // in1 (offset 0) + ld1 {v17.8h}, [x9] // in31 (offset 15) + ld1 {v18.8h}, [x10] // in17 (offset 8) + ld1 {v19.8h}, [x11] // in15 (offset 7) + st1_if {v7.8h}, [x7], \clear + st1_if {v7.8h}, [x9], \clear + st1_if {v7.8h}, [x10], \clear + st1_if {v7.8h}, [x11], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.8h}, [x10] // in7 (offset 3) + ld1 {v17.8h}, [x11] // in25 (offset 12) + ld1 {v18.8h}, [x9] // in23 (offset 11) + ld1 {v19.8h}, [x7] // in9 (offset 4) + st1_if {v7.8h}, [x7], \clear + st1_if {v7.8h}, [x9], \clear + st1_if {v7.8h}, [x10], \clear + st1_if {v7.8h}, [x11], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, 
[x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movdup_if v0.4h, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.h[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #2*8*32 + add x9, x6, #2*8*7 + + bl inv_dct64_step2_neon + + ret x14 +endfunc +.endm + +def_dct64_func +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x8_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-2*8*4 + + dup v7.8h, w12 +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.8h, \src0, \src1 + sqadd v0.8h, \src0, \src1 + sqsub v3.8h, \src2, \src3 + srshl v1.8h, v1.8h, v7.8h + sqadd v2.8h, \src2, \src3 + srshl v0.8h, v0.8h, v7.8h + srshl v3.8h, v3.8h, v7.8h + rev64 v1.8h, v1.8h + srshl v2.8h, v2.8h, v7.8h + rev64 v3.8h, v3.8h + ext v1.16b, v1.16b, v1.16b, #8 + st1 {v0.8h}, [x6], x10 + ext v3.16b, v3.16b, v3.16b, #8 + st1 {v1.8h}, [x9], x10 + st1 {v2.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.8h, v31.8h, v17.8h, v30.8h + store_addsub v18.8h, v29.8h, v19.8h, v28.8h + store_addsub v20.8h, v27.8h, v21.8h, v26.8h + store_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem store_addsub + sub x6, x6, x10, lsl #3 + sub x9, x9, x10, lsl #3 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + ret x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8b}, [x6], x1 + ld1 {v1.8b}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8b}, [x6] + sqsub v5.8h, \src0, \src1 + ld1 {v3.8b}, [x9] + sqadd v6.8h, \src2, \src3 + sqsub v7.8h, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr v6.8h, v6.8h, #4 + uaddw v4.8h, v4.8h, v0.8b + srshr v7.8h, v7.8h, #4 + uaddw v5.8h, v5.8h, v1.8b + uaddw v6.8h, v6.8h, v2.8b + sqxtun v0.8b, v4.8h + uaddw v7.8h, v7.8h, v3.8b + sqxtun v1.8b, v5.8h + st1 {v0.8b}, [x6], x1 + sqxtun v2.8b, v6.8h + st1 {v1.8b}, [x9], x10 + sqxtun v3.8b, v7.8h + st1 {v2.8b}, [x6], x1 + st1 {v3.8b}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, 
v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + ret x14 +endfunc + +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_8h_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x8_neon +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_dct_8h_x64_neon + add x6, x0, #(\i) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_8h_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x8_neon +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + bl inv_txfm_horz_scale_dct_32x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl inv_txfm_dct_8h_x64_neon + add x6, x0, #(\i) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*8*2 + add x4, sp, #64*8*2 + + movrel x13, eob_16x32 + +.irp i, 0, 8 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*2) + mov x8, #16*2 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_8h_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x8_neon +.if \i < 8 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + adr x5, inv_dct_8h_x16_neon +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + ret x15 +endfunc + +function 
inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_8h_x16_neon +.irp i, 0, 8, 16, 24 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 24 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*2) + mov x8, #32*2 + bl inv_txfm_horz_16x8_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #8 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl inv_txfm_dct_8h_x64_neon + add x6, x0, #(\i) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + ret x15 +endfunc diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S new file mode 100644 index 0000000000..eee3a9636d --- /dev/null +++ b/third_party/dav1d/src/arm/64/itx16.S @@ -0,0 +1,3648 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, +// int bitdepth_max); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro mul_mla d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mla \d\().4s, \s1\().4s, \c1 +.endm + +.macro mul_mls d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mls \d\().4s, \s1\().4s, \c1 +.endm + +.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro smin_4s r0, r1, r2 + smin \r0\().4s, \r1\().4s, \r2\().4s +.endm +.macro smax_4s r0, r1, r2 + smax \r0\().4s, \r1\().4s, \r2\().4s +.endm + +.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + usqadd \adddst, \addsrc +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store 
v2.8h, v16.8h, , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , \dst, \src + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src + load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src + load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src + load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src + load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src + load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src + load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store , , , , v27.8h, v26.8h, \dst, \src + load_add_store , , , , , v27.8h, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits + load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits + load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits + load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits + load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits + load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits + load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits + load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits + load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits + load_add_store , , , , , v5.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] +.endif +.ifnb \shift + srshr \shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + usqadd \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 +.endif +.endm +.macro load_add_store_4x16 dst, src + 
mov \src, \dst + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src + load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src + load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src + load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src + load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src + load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src + load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src + load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src + load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src + load_add_store4 , , , , , , , v23.d, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src + load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src + load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src + load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src + load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src + load_add_store4 , , , , , , , v3.d, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v0.2s, w16 + sqrdmulh v20.4s, v16.4s, v0.s[0] + str wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v20.4s, v20.4s, v0.s[0] +.endif +.if \shift > 0 + sqrshrn v16.4h, v20.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift +.else + sqxtn v16.4h, v20.4s + sqxtn2 v16.8h, v20.4s +.endif + sqrdmulh v16.8h, v16.8h, v0.h[1] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + subs w4, w4, #4 + ld1 {v1.d}[1], [x0], x1 + usqadd v0.8h, v16.8h + sub x0, x0, x1, lsl #2 + usqadd v1.8h, v16.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h}, [x0], x1 + subs w4, w4, #4 + ld1 {v1.8h}, [x0], x1 + usqadd v0.8h, v16.8h + ld1 {v2.8h}, [x0], x1 + usqadd v1.8h, v16.8h + ld1 {v3.8h}, [x0], x1 + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h + sub x0, x0, x1, lsl #2 + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + st1 {v0.8h}, [x0], x1 + smin v2.8h, v2.8h, v31.8h + st1 {v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h}, [x0], x1 + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x0], x1 + usqadd v0.8h, v16.8h + usqadd v1.8h, v16.8h + sub x0, x0, x1, lsl #1 + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + st1 {v0.8h, v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function 
idct_dc_w32_neon + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w4, w4, #1 + usqadd v0.8h, v16.8h + usqadd v1.8h, v16.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x1, x1, #64 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + subs w4, w4, #1 + usqadd v0.8h, v16.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] + usqadd v1.8h, v16.8h + sub x0, x0, #64 + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h + usqadd v4.8h, v16.8h + usqadd v5.8h, v16.8h + usqadd v6.8h, v16.8h + usqadd v7.8h, v16.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + smin v6.8h, v6.8h, v31.8h + smin v7.8h, v7.8h, v31.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4s, v16.4s, v17.4s + sub v21.4s, v18.4s, v19.4s + sub v20.4s, v16.4s, v21.4s + sshr v20.4s, v20.4s, #1 + sub v18.4s, v20.4s, v17.4s + sub v17.4s, v20.4s, v19.4s + add v19.4s, v21.4s, v18.4s + sub v16.4s, v16.4s, v17.4s +.endm + +.macro idct_4 r0, r1, r2, r3 + mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] + mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] + mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] + srshr v6.4s, v6.4s, #12 + srshr v2.4s, v2.4s, #12 + srshr v7.4s, v4.4s, #12 + srshr v3.4s, v3.4s, #12 + sqadd \r0\().4s, v2.4s, v6.4s + sqsub \r3\().4s, v2.4s, v6.4s + sqadd \r1\().4s, v3.4s, v7.4s + sqsub \r2\().4s, v3.4s, v7.4s +.endm + +function inv_dct_4s_x4_neon + AARCH64_VALID_CALL_TARGET + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] + idct_4 v16, v17, v18, v19 + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.4s}, [x16] + + sub v3.4s, v16.4s, v18.4s + mul v4.4s, v16.4s, v0.s[0] + mla v4.4s, v18.4s, v0.s[1] + mla v4.4s, v19.4s, v0.s[2] + mul v7.4s, v17.4s, v0.s[3] + add v3.4s, v3.4s, v19.4s + mul v5.4s, v16.4s, v0.s[2] + mls v5.4s, v18.4s, v0.s[0] + mls v5.4s, v19.4s, v0.s[1] + + add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[3] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + srshr \o0\().4s, \o0\().4s, #12 + srshr \o2\().4s, \o2\().4s, #12 + srshr \o1\().4s, \o1\().4s, #12 + srshr \o3\().4s, \o3\().4s, #12 +.endm + +function inv_adst_4s_x4_neon + AARCH64_VALID_CALL_TARGET + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4s_x4_neon + AARCH64_VALID_CALL_TARGET + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x4_neon + AARCH64_VALID_CALL_TARGET + movz w16, #(5793-4096)*8, lsl #16 + dup v0.2s, w16 + sqrdmulh v4.4s, v16.4s, v0.s[0] + sqrdmulh v5.4s, v17.4s, v0.s[0] + sqrdmulh v6.4s, v18.4s, v0.s[0] + sqrdmulh v7.4s, v19.4s, v0.s[0] + sqadd v16.4s, v16.4s, v4.4s + sqadd v17.4s, v17.4s, v5.4s + sqadd v18.4s, v18.4s, v6.4s + sqadd v19.4s, v19.4s, v7.4s + ret +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + mov x15, x30 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + sshr v16.4s, v16.4s, #2 + sshr v17.4s, v17.4s, #2 + sshr v18.4s, v18.4s, #2 + sshr v19.4s, v19.4s, #2 + + iwht4 + + st1 {v30.4s, v31.4s}, [x2], #32 + 
transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.d}[0], [x0], x1 + sqxtn v16.4h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqxtn2 v16.8h, v17.4s + ld1 {v1.d}[0], [x0], x1 + sqxtn v18.4h, v18.4s + ld1 {v1.d}[1], [x0], x1 + sqxtn2 v18.8h, v19.4s + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + blr x4 + + st1 {v30.4s, v31.4s}, [x2], #32 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.d}[0], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x0, x0, x1, lsl #2 + usqadd v0.8h, v16.8h + usqadd v1.8h, v18.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + + ret x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v4.2s, w16 + str wzr, [x2] + sqrdmulh v16.4s, v16.4s, v4.s[0] + ld1 {v0.d}[0], [x0], x1 + sqxtn v20.4h, v16.4s + sqxtn2 v20.8h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqrdmulh v20.8h, v20.8h, v4.h[1] + ld1 {v1.d}[0], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.d}[1], [x0], x1 + srshr v18.8h, v20.8h, #4 + movi v30.8h, #0 + b L(itx_4x4_end) +1: +.endif + adr x4, inv_\txfm1\()_4s_x4_neon + movrel x5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4 \r0, \r2, \r4, \r6 + + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + smin_4s \r, \r, v5 +.endr +.irp r, \r0, \r2, \r4, \r6 + smax_4s \r, \r, v4 +.endr + + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a + mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a + mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a + srshr \r1\().4s, v2.4s, #12 // t4a + srshr \r7\().4s, v3.4s, #12 // t7a + srshr \r3\().4s, v6.4s, #12 // t5a + srshr \r5\().4s, v7.4s, #12 // t6a + + sqadd v2.4s, \r1\().4s, \r3\().4s // t4 + sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a + sqadd v3.4s, \r7\().4s, \r5\().4s // t7 + sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a + +.irp r, v2, \r1, v3, \r3 + smin_4s \r, \r, v5 +.endr +.irp r, v2, \r1, v3, \r3 + smax_4s \r, \r, v4 +.endr + + mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 + mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 + srshr v7.4s, v7.4s, #12 // t5 + srshr v6.4s, v6.4s, #12 // t6 + + sqsub \r7\().4s, \r0\().4s, v3.4s // out7 + sqadd \r0\().4s, \r0\().4s, v3.4s // out0 + sqadd \r1\().4s, \r2\().4s, v6.4s // 
out1 + sqsub v6.4s, \r2\().4s, v6.4s // out6 + sqadd \r2\().4s, \r4\().4s, v7.4s // out2 + sqsub \r5\().4s, \r4\().4s, v7.4s // out5 + sqadd \r3\().4s, \r6\().4s, v2.4s // out3 + sqsub \r4\().4s, \r6\().4s, v2.4s // out4 + mov \r6\().16b, v6.16b // out6 +.endm + +function inv_dct_4s_x8_neon + AARCH64_VALID_CALL_TARGET + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 + movrel x16, iadst8_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v23, v16, v0.s[0], v0.s[1] + mul_mls v4, v23, v16, v0.s[1], v0.s[0] + mul_mla v6, v21, v18, v0.s[2], v0.s[3] + srshr v16.4s, v2.4s, #12 // t0a + srshr v23.4s, v4.4s, #12 // t1a + mul_mls v2, v21, v18, v0.s[3], v0.s[2] + mul_mla v4, v19, v20, v1.s[0], v1.s[1] + srshr v18.4s, v6.4s, #12 // t2a + srshr v21.4s, v2.4s, #12 // t3a + mul_mls v6, v19, v20, v1.s[1], v1.s[0] + mul_mla v2, v17, v22, v1.s[2], v1.s[3] + srshr v20.4s, v4.4s, #12 // t4a + srshr v19.4s, v6.4s, #12 // t5a + mul_mls v4, v17, v22, v1.s[3], v1.s[2] + srshr v22.4s, v2.4s, #12 // t6a + srshr v17.4s, v4.4s, #12 // t7a + + ld1 {v0.4s}, [x16] + + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + + sqadd v2.4s, v16.4s, v20.4s // t0 + sqsub v3.4s, v16.4s, v20.4s // t4 + mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqadd v4.4s, v23.4s, v19.4s // t1 + sqsub v5.4s, v23.4s, v19.4s // t5 + sqadd v6.4s, v18.4s, v22.4s // t2 + sqsub v7.4s, v18.4s, v22.4s // t6 + sqadd v18.4s, v21.4s, v17.4s // t3 + sqsub v19.4s, v21.4s, v17.4s // t7 + +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smax_4s \r, \r, v20 +.endr + + mul_mla v16, v3, v5, v0.s[3], v0.s[2] + mul_mls v20, v3, v5, v0.s[2], v0.s[3] + mul_mls v22, v19, v7, v0.s[3], v0.s[2] + + srshr v3.4s, v16.4s, #12 // t4a + srshr v5.4s, v20.4s, #12 // t5a + + mul_mla v16, v19, v7, v0.s[2], v0.s[3] + + srshr v7.4s, v22.4s, #12 // t6a + srshr v19.4s, v16.4s, #12 // t7a + + sqadd \o0\().4s, v2.4s, v6.4s // out0 + sqsub v2.4s, v2.4s, v6.4s // t2 + sqadd \o7\().4s, v4.4s, v18.4s // out7 + sqsub v4.4s, v4.4s, v18.4s // t3 + + mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + + sqadd \o1\().4s, v3.4s, v7.4s // out1 + sqsub v3.4s, v3.4s, v7.4s // t6 + sqadd \o6\().4s, v5.4s, v19.4s // out6 + sqsub v5.4s, v5.4s, v19.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
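+ // (v2, v4, v3 and v5 hold the t2/t3/t6/t7 intermediates that still feed
+ // the final multiplies below, so those are clamped to the row clip range
+ // in v1/v18 here.)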
+.irp r, v2, v4, v3, v5 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v4, v3, v5 + smax_4s \r, \r, v18 +.endr + + sqneg \o7\().4s, \o7\().4s // out7 + sqneg \o1\().4s, \o1\().4s // out1 + + mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) + mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) + mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) + srshr v2.4s, v18.4s, #12 // out3 + mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) + srshr v3.4s, v20.4s, #12 // out5 + srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) + srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) + + sqneg \o3\().4s, v2.4s // out3 + sqneg \o5\().4s, v3.4s // out5 +.endm + +function inv_adst_4s_x8_neon + AARCH64_VALID_CALL_TARGET + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + +function inv_flipadst_4s_x8_neon + AARCH64_VALID_CALL_TARGET + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x8_neon + AARCH64_VALID_CALL_TARGET + sqshl v16.4s, v16.4s, #1 + sqshl v17.4s, v17.4s, #1 + sqshl v18.4s, v18.4s, #1 + sqshl v19.4s, v19.4s, #1 + sqshl v20.4s, v20.4s, #1 + sqshl v21.4s, v21.4s, #1 + sqshl v22.4s, v22.4s, #1 + sqshl v23.4s, v23.4s, #1 + ret +endfunc + +function inv_txfm_add_8x8_neon + movi v31.4s, #0 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + + blr x5 + + load_add_store_8x8 x0, x7 + ret x15 +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + movrel x5, X(inv_\txfm2\()_8h_x8_neon) + mov w13, #\eob_half + adr x4, inv_\txfm1\()_4s_x8_neon + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] + st1 
{v28.4s,v29.4s,v30.4s,v31.4s}, [x2] + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + sqxtn v20.4h, v20.4s + sqxtn v21.4h, v21.4s + sqxtn v22.4h, v22.4s + sqxtn v23.4h, v23.4s + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + ret x15 +endfunc + +function inv_txfm_add_4x8_neon + movz w16, #2896*8, lsl #16 + movi v31.4s, #0 + dup v30.2s, w16 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v20.4h, v16.4s + sqxtn v21.4h, v17.4s + sqxtn v22.4h, v18.4s + sqxtn v23.4h, v19.4s + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f + +1: +.irp i, v20, v21, v22, v23 + movi \i\().4h, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x8 x0, x7 + ret x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov w13, #\eob_half +.endif + movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +function inv_dct_4s_x16_neon + AARCH64_VALID_CALL_TARGET + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + + // idct_8 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smax \r\().4s, \r\().4s, v4.4s +.endr + + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #32 + + mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a + mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a + srshr v17.4s, v2.4s, #12 // t8a + srshr v31.4s, v3.4s, #12 // t15a + mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a + mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a + srshr v23.4s, v6.4s, #12 // t9a + srshr v25.4s, v2.4s, #12 // t14a + mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a + mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a + srshr v21.4s, v3.4s, #12 // t10a + srshr v27.4s, v6.4s, #12 // t13a + mul_mla v3, v29, v19, 
v1.s[3], v1.s[2] // -> t12a + srshr v19.4s, v2.4s, #12 // t11a + srshr v29.4s, v3.4s, #12 // t12a + + ld1 {v0.4s}, [x16] + + sqsub v2.4s, v17.4s, v23.4s // t9 + sqadd v17.4s, v17.4s, v23.4s // t8 + sqsub v3.4s, v31.4s, v25.4s // t14 + sqadd v31.4s, v31.4s, v25.4s // t15 + sqsub v23.4s, v19.4s, v21.4s // t10 + sqadd v19.4s, v19.4s, v21.4s // t11 + sqadd v25.4s, v29.4s, v27.4s // t12 + sqsub v29.4s, v29.4s, v27.4s // t13 + +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a + mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a + srshr v21.4s, v7.4s, #12 // t9a + srshr v27.4s, v6.4s, #12 // t14a + + mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a + srshr v29.4s, v7.4s, #12 // t13a + neg v6.4s, v6.4s + srshr v23.4s, v6.4s, #12 // t10a + + sqsub v2.4s, v17.4s, v19.4s // t11a + sqadd v17.4s, v17.4s, v19.4s // t8a + sqsub v3.4s, v31.4s, v25.4s // t12a + sqadd v31.4s, v31.4s, v25.4s // t15a + sqadd v19.4s, v21.4s, v23.4s // t9 + sqsub v21.4s, v21.4s, v23.4s // t10 + sqsub v25.4s, v27.4s, v29.4s // t13 + sqadd v27.4s, v27.4s, v29.4s // t14 + +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 + mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a + + srshr v7.4s, v7.4s, #12 // t11 + srshr v6.4s, v6.4s, #12 // t12 + mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a + srshr v2.4s, v2.4s, #12 // t10a + srshr v3.4s, v3.4s, #12 // t13a + + sqadd v1.4s, v16.4s, v31.4s // out0 + sqsub v31.4s, v16.4s, v31.4s // out15 + mov v16.16b, v1.16b + sqadd v23.4s, v30.4s, v17.4s // out7 + sqsub v1.4s, v30.4s, v17.4s // out8 + sqadd v17.4s, v18.4s, v27.4s // out1 + sqsub v30.4s, v18.4s, v27.4s // out14 + sqadd v18.4s, v20.4s, v3.4s // out2 + sqsub v29.4s, v20.4s, v3.4s // out13 + sqadd v3.4s, v28.4s, v19.4s // out6 + sqsub v25.4s, v28.4s, v19.4s // out9 + sqadd v19.4s, v22.4s, v6.4s // out3 + sqsub v28.4s, v22.4s, v6.4s // out12 + sqadd v20.4s, v24.4s, v7.4s // out4 + sqsub v27.4s, v24.4s, v7.4s // out11 + sqadd v21.4s, v26.4s, v2.4s // out5 + sqsub v26.4s, v26.4s, v2.4s // out10 + mov v24.16b, v1.16b + mov v22.16b, v3.16b + + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel x16, iadst16_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 + mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 + srshr v16.4s, v2.4s, #12 // t0 + srshr v31.4s, v4.4s, #12 // t1 + mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 + mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 + srshr v18.4s, v6.4s, #12 // t2 + srshr v29.4s, v2.4s, #12 // t3 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 + mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 + srshr v20.4s, v4.4s, #12 // t4 + srshr v27.4s, v6.4s, #12 // t5 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 + ld1 {v0.4s, v1.4s}, [x16] + movrel x16, idct_coeffs + mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 + srshr v22.4s, v2.4s, #12 // t6 + srshr v25.4s, v4.4s, #12 // t7 + mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 + mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 + srshr v23.4s, v6.4s, #12 // t8 + srshr 
v24.4s, v2.4s, #12 // t9 + mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 + mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 + srshr v21.4s, v4.4s, #12 // t10 + srshr v26.4s, v6.4s, #12 // t11 + mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 + mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 + srshr v19.4s, v2.4s, #12 // t12 + srshr v28.4s, v4.4s, #12 // t13 + mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 + srshr v17.4s, v6.4s, #12 // t14 + srshr v30.4s, v2.4s, #12 // t15 + + ld1 {v0.4s, v1.4s}, [x16] + + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + + sqsub v2.4s, v16.4s, v23.4s // t8a + sqadd v16.4s, v16.4s, v23.4s // t0a + sqsub v3.4s, v31.4s, v24.4s // t9a + sqadd v31.4s, v31.4s, v24.4s // t1a + sqadd v23.4s, v18.4s, v21.4s // t2a + sqsub v18.4s, v18.4s, v21.4s // t10a + sqadd v24.4s, v29.4s, v26.4s // t3a + sqsub v29.4s, v29.4s, v26.4s // t11a + sqadd v21.4s, v20.4s, v19.4s // t4a + sqsub v20.4s, v20.4s, v19.4s // t12a + sqadd v26.4s, v27.4s, v28.4s // t5a + sqsub v27.4s, v27.4s, v28.4s // t13a + sqadd v19.4s, v22.4s, v17.4s // t6a + sqsub v22.4s, v22.4s, v17.4s // t14a + sqadd v28.4s, v25.4s, v30.4s // t7a + sqsub v25.4s, v25.4s, v30.4s // t15a + +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smax_4s \r, \r, v7 +.endr + + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 + mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 + mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 + srshr v17.4s, v4.4s, #12 // t8 + srshr v30.4s, v6.4s, #12 // t9 + mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 + srshr v18.4s, v2.4s, #12 // t10 + srshr v29.4s, v4.4s, #12 // t11 + mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 + srshr v27.4s, v6.4s, #12 // t12 + srshr v20.4s, v2.4s, #12 // t13 + mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 + srshr v25.4s, v4.4s, #12 // t14 + srshr v22.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t4 + sqadd v16.4s, v16.4s, v21.4s // t0 + sqsub v3.4s, v31.4s, v26.4s // t5 + sqadd v31.4s, v31.4s, v26.4s // t1 + sqadd v21.4s, v23.4s, v19.4s // t2 + sqsub v23.4s, v23.4s, v19.4s // t6 + sqadd v26.4s, v24.4s, v28.4s // t3 + sqsub v24.4s, v24.4s, v28.4s // t7 + sqadd v19.4s, v17.4s, v27.4s // t8a + sqsub v17.4s, v17.4s, v27.4s // t12a + sqadd v28.4s, v30.4s, v20.4s // t9a + sqsub v30.4s, v30.4s, v20.4s // t13a + sqadd v27.4s, v18.4s, v25.4s // t10a + sqsub v18.4s, v18.4s, v25.4s // t14a + sqadd v20.4s, v29.4s, v22.4s // t11a + sqsub v29.4s, v29.4s, v22.4s // t15a + +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smax_4s \r, \r, v7 +.endr + + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a + mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a + mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a + srshr v22.4s, v4.4s, #12 // t4a + srshr v25.4s, v6.4s, #12 // t5a + mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a + mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 + srshr v24.4s, v2.4s, #12 // t6a + srshr v23.4s, v4.4s, #12 // t7a + mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 + mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 + srshr v17.4s, v6.4s, #12 // t12 + mul_mla v6, v29, 
v18, v0.s[2], v0.s[3] // -> t15 + srshr v29.4s, v2.4s, #12 // t13 + srshr v30.4s, v4.4s, #12 // t14 + srshr v18.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t2a +.ifc \o0, v16 + sqadd \o0\().4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 +.else + sqadd v4.4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 + mov \o0\().16b, v4.16b +.endif + + sqsub v3.4s, v29.4s, v18.4s // t15a + sqadd \o13\().4s, v29.4s, v18.4s // out13 + sqadd \o2\().4s, v17.4s, v30.4s // out2 + sqsub v26.4s, v17.4s, v30.4s // t14a + + sqadd \o1\().4s, v19.4s, v27.4s // out1 + sqsub v27.4s, v19.4s, v27.4s // t10 + sqadd \o14\().4s, v28.4s, v20.4s // out14 + sqsub v20.4s, v28.4s, v20.4s // t11 + + sqadd \o3\().4s, v22.4s, v24.4s // out3 + sqsub v22.4s, v22.4s, v24.4s // t6 + sqadd \o12\().4s, v25.4s, v23.4s // out12 + sqsub v23.4s, v25.4s, v23.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smax_4s \r, \r, v7 +.endr + + sqneg \o15\().4s, \o15\().4s // out15 + sqneg \o13\().4s, \o13\().4s // out13 + sqneg \o1\().4s, \o1\().4s // out1 + sqneg \o3\().4s, \o3\().4s // out3 + + mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) + mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) + mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) + + srshr v24.4s, v24.4s, #12 // out8 + srshr v4.4s, v4.4s, #12 // out7 + srshr v5.4s, v6.4s, #12 // out5 + mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) + mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) + srshr v26.4s, v6.4s, #12 // out10 + + mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) + mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) + mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) + + srshr \o4\().4s, v2.4s, #12 // out4 + srshr v6.4s, v6.4s, #12 // out11 + srshr v7.4s, v21.4s, #12 // out9 + srshr \o6\().4s, v22.4s, #12 // out6 + +.ifc \o8, v23 + mov \o8\().16b, v24.16b + mov \o10\().16b, v26.16b +.endif + + sqneg \o7\().4s, v4.4s // out7 + sqneg \o5\().4s, v5.4s // out5 + sqneg \o11\().4s, v6.4s // out11 + sqneg \o9\().4s, v7.4s // out9 +.endm + +function inv_adst_4s_x16_neon + AARCH64_VALID_CALL_TARGET + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + ret +endfunc + +function inv_flipadst_4s_x16_neon + AARCH64_VALID_CALL_TARGET + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x16_neon + AARCH64_VALID_CALL_TARGET + movz w16, #2*(5793-4096)*8, lsl #16 + dup v0.2s, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4s, v\i\().4s, v0.s[0] + sqadd v\i\().4s, v\i\().4s, v\i\().4s + sqadd v\i\().4s, v\i\().4s, v2.4s +.endr + ret +endfunc + +.macro identity_4x16_shift1 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + srshr v3.4s, v3.4s, #1 + sqadd \i, \i, v3.4s +.endr +.endm + +.macro identity_4x16 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, 
v3.4s +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + mov x14, x30 + movi v7.4s, #0 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + blr x4 + sqrshrn v16.4h, v16.4s, #\shift + sqrshrn v17.4h, v17.4s, #\shift + sqrshrn v18.4h, v18.4s, #\shift + sqrshrn v19.4h, v19.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift + sqrshrn2 v17.8h, v21.4s, #\shift + sqrshrn2 v18.8h, v22.4s, #\shift + sqrshrn2 v19.8h, v23.4s, #\shift + sqrshrn v20.4h, v24.4s, #\shift + sqrshrn v21.4h, v25.4s, #\shift + sqrshrn v22.4h, v26.4s, #\shift + sqrshrn v23.4h, v27.4s, #\shift + sqrshrn2 v20.8h, v28.4s, #\shift + sqrshrn2 v21.8h, v29.4s, #\shift + sqrshrn2 v22.8h, v30.4s, #\shift + sqrshrn2 v23.8h, v31.4s, #\shift + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 + +.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h + st1 {\i}, [x6], #16 +.endr + + ret x14 +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_8x16_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + ret x14 +endfunc + +function inv_txfm_add_16x16_neon + mov x15, x30 + sub sp, sp, #512 + ldrh w12, [x13], #2 +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*16*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + ret x15 +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + adr x4, inv_\txfm1\()_4s_x16_neon + movrel x5, X(inv_\txfm2\()_8h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_16x16 +.else + movrel x13, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_16x16_identity +.else + movrel x13, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + mov x15, x30 + movi v4.4s, #0 + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], #16 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + 
sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + sqrshrn v16.4h, v24.4s, #1 + sqrshrn v17.4h, v25.4s, #1 + sqrshrn v18.4h, v26.4s, #1 + sqrshrn v19.4h, v27.4s, #1 + sqrshrn2 v16.8h, v28.4s, #1 + sqrshrn2 v17.8h, v29.4s, #1 + sqrshrn2 v18.8h, v30.4s, #1 + sqrshrn2 v19.8h, v31.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + add x6, x0, #16 + load_add_store_8x4 x6, x7 + + ret x15 +endfunc + +function inv_txfm_add_4x16_neon + ldrh w12, [x13, #4] + mov x15, x30 + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 + transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 + + b 2f +1: +.irp i, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 + + b 2f +1: +.irp i, v24.4h, v25.4h, v26.4h, v27.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + sqrshrn v20.4h, v16.4s, #1 + sqrshrn v21.4h, v17.4s, #1 + sqrshrn v22.4h, v18.4s, #1 + sqrshrn v23.4h, v19.4s, #1 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f +1: +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + movi \i, #0 +.endr +2: + + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v2.4s}, [x2], x11 +.endr + blr x4 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x16 x0, x6 + + ret x15 +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_4x16 +.else + movrel x13, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_4x16_identity2 +.else + movrel x13, eob_4x16 +.endif +.endif +.else + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct +def_fn_416 \w, \h, identity, identity +def_fn_416 \w, \h, dct, adst +def_fn_416 \w, \h, dct, flipadst +def_fn_416 \w, \h, dct, identity +def_fn_416 \w, \h, adst, dct +def_fn_416 \w, \h, adst, adst +def_fn_416 \w, \h, adst, flipadst +def_fn_416 \w, \h, flipadst, dct +def_fn_416 \w, \h, flipadst, adst 
+def_fn_416 \w, \h, flipadst, flipadst +def_fn_416 \w, \h, identity, dct +def_fn_416 \w, \h, adst, identity +def_fn_416 \w, \h, flipadst, identity +def_fn_416 \w, \h, identity, adst +def_fn_416 \w, \h, identity, flipadst +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +function inv_txfm_add_16x8_neon + mov x15, x30 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + sqrshrn v12.4h, v24.4s, #1 + sqrshrn v13.4h, v25.4s, #1 + sqrshrn v14.4h, v26.4s, #1 + sqrshrn v15.4h, v27.4s, #1 + sqrshrn2 v12.8h, v28.4s, #1 + sqrshrn2 v13.8h, v29.4s, #1 + sqrshrn2 v14.8h, v30.4s, #1 + sqrshrn2 v15.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 + + b 2f +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h + movi \i, #0 +.endr +2: + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + movi v4.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + sqrshrn v8.4h, v24.4s, #1 + sqrshrn v9.4h, v25.4s, #1 + sqrshrn v10.4h, v26.4s, #1 + sqrshrn v11.4h, v27.4s, #1 + sqrshrn2 v8.8h, v28.4s, #1 + sqrshrn2 v9.8h, v29.4s, #1 + sqrshrn2 v10.8h, v30.4s, #1 + sqrshrn2 v11.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + + mov v16.16b, v8.16b + mov v17.16b, v9.16b + mov v18.16b, v10.16b + mov v19.16b, v11.16b + mov v20.16b, v12.16b + mov v21.16b, v13.16b + mov v22.16b, v14.16b + mov v23.16b, v15.16b + + blr x5 + + add x0, x0, #16 + load_add_store_8x8 x0, x7 + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret x15 +endfunc + +function inv_txfm_add_8x16_neon + mov x15, x30 + stp d8, d9, [sp, #-0x20]! 
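+ // v8-v11 stash one block of narrowed first-pass rows until the vertical
+ // pass; AAPCS64 requires the low 64 bits of v8-v15 to be preserved, so
+ // spill d8-d11 around this function.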
+ stp d10, d11, [sp, #0x10] + ldrh w12, [x13, #4] + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 + sqrshrn2 v28.8h, v20.4s, #1 + sqrshrn2 v29.8h, v21.4s, #1 + sqrshrn2 v30.8h, v22.4s, #1 + sqrshrn2 v31.8h, v23.4s, #1 + transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h + movi \i, #0 +.endr + +2: + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + blr x5 + + load_add_store_8x16 x0, x6 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x20 + + ret x15 +endfunc + +const eob_8x16 + .short 10, 43, 75, 128 +endconst + +const eob_8x16_identity1 + .short 4, 64, 96, 128 +endconst + +const eob_8x16_identity2 + .short 4, 8, 12, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_8x16 +.else + movrel x13, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_8x16_identity2 +.else + movrel x13, eob_8x16 +.endif +.endif 
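+ // The \h == 8 case (16x8) compares the eob against a single scalar
+ // threshold in w13, so dereference the first table entry here; the 8x16
+ // case keeps x13 as a pointer and reads the thresholds itself.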
+.if \h == 8 + ldrh w13, [x13] +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4s_x16_neon + movrel x16, idct_coeffs, 4*16 + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a + mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a + mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a + srshr v16.4s, v2.4s, #12 // t16a + srshr v31.4s, v4.4s, #12 // t31a + mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a + mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a + srshr v24.4s, v6.4s, #12 // t17a + srshr v23.4s, v2.4s, #12 // t30a + mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a + mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a + srshr v20.4s, v4.4s, #12 // t18a + srshr v27.4s, v6.4s, #12 // t29a + mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #4*24 + mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a + srshr v28.4s, v2.4s, #12 // t19a + srshr v19.4s, v4.4s, #12 // t28a + mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a + mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a + srshr v18.4s, v6.4s, #12 // t20a + srshr v29.4s, v2.4s, #12 // t27a + mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a + mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a + srshr v26.4s, v4.4s, #12 // t21a + srshr v21.4s, v6.4s, #12 // t26a + mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a + mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a + srshr v22.4s, v2.4s, #12 // t22a + srshr v25.4s, v4.4s, #12 // t25a + mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a + srshr v30.4s, v6.4s, #12 // t23a + srshr v17.4s, v2.4s, #12 // t24a + + ld1 {v0.4s, v1.4s}, [x16] + + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + + sqsub v2.4s, v16.4s, v24.4s // t17 + sqadd v16.4s, v16.4s, v24.4s // t16 + sqsub v3.4s, v31.4s, v23.4s // t30 + sqadd v31.4s, v31.4s, v23.4s // t31 + sqsub v24.4s, v28.4s, v20.4s // t18 + sqadd v28.4s, v28.4s, v20.4s // t19 + sqadd v23.4s, v18.4s, v26.4s // t20 + sqsub v18.4s, v18.4s, v26.4s // t21 + sqsub v20.4s, v30.4s, v22.4s // t22 + sqadd v30.4s, v30.4s, v22.4s // t23 + sqadd v26.4s, v17.4s, v25.4s // t24 + sqsub v17.4s, v17.4s, v25.4s // t25 + sqsub v22.4s, v29.4s, v21.4s // t26 + sqadd v29.4s, v29.4s, v21.4s // t27 + sqadd v25.4s, v19.4s, v27.4s // t28 + sqsub v19.4s, v19.4s, v27.4s // t29 + +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a + mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a + mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a + srshr v21.4s, v7.4s, #12 // t17a + srshr v27.4s, v6.4s, #12 // t30a + neg v2.4s, v2.4s // -> t18a + 
mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a + srshr v19.4s, v2.4s, #12 // t18a + srshr v24.4s, v7.4s, #12 // t29a + mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a + mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a + srshr v22.4s, v6.4s, #12 // t21a + srshr v18.4s, v2.4s, #12 // t26a + neg v7.4s, v7.4s // -> t22a + mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a + srshr v17.4s, v7.4s, #12 // t22a + srshr v20.4s, v6.4s, #12 // t25a + + sqsub v2.4s, v27.4s, v24.4s // t29 + sqadd v27.4s, v27.4s, v24.4s // t30 + sqsub v3.4s, v21.4s, v19.4s // t18 + sqadd v21.4s, v21.4s, v19.4s // t17 + sqsub v24.4s, v16.4s, v28.4s // t19a + sqadd v16.4s, v16.4s, v28.4s // t16a + sqsub v19.4s, v30.4s, v23.4s // t20a + sqadd v30.4s, v30.4s, v23.4s // t23a + sqsub v28.4s, v17.4s, v22.4s // t21 + sqadd v17.4s, v17.4s, v22.4s // t22 + sqadd v23.4s, v26.4s, v29.4s // t24a + sqsub v26.4s, v26.4s, v29.4s // t27a + sqadd v22.4s, v20.4s, v18.4s // t25 + sqsub v20.4s, v20.4s, v18.4s // t26 + sqsub v29.4s, v31.4s, v25.4s // t28a + sqadd v31.4s, v31.4s, v25.4s // t31a + +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a + mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a + mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 + srshr v18.4s, v7.4s, #12 // t18a + srshr v25.4s, v6.4s, #12 // t29a + mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 + srshr v29.4s, v2.4s, #12 // t19 + srshr v24.4s, v7.4s, #12 // t28 + neg v6.4s, v6.4s // -> t20 + mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 + mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a + srshr v26.4s, v6.4s, #12 // t20 + srshr v19.4s, v2.4s, #12 // t27 + neg v7.4s, v7.4s // -> t21a + mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a + srshr v20.4s, v7.4s, #12 // t21a + srshr v28.4s, v6.4s, #12 // t26a + + sqsub v2.4s, v16.4s, v30.4s // t23 + sqadd v16.4s, v16.4s, v30.4s // t16 = out16 + sqsub v3.4s, v31.4s, v23.4s // t24 + sqadd v31.4s, v31.4s, v23.4s // t31 = out31 + sqsub v23.4s, v21.4s, v17.4s // t22a + sqadd v17.4s, v21.4s, v17.4s // t17a = out17 + sqadd v30.4s, v27.4s, v22.4s // t30a = out30 + sqsub v21.4s, v27.4s, v22.4s // t25a + sqsub v27.4s, v18.4s, v20.4s // t21 + sqadd v18.4s, v18.4s, v20.4s // t18 = out18 + sqadd v7.4s, v29.4s, v26.4s // t19a = out19 + sqsub v26.4s, v29.4s, v26.4s // t20a + sqadd v29.4s, v25.4s, v28.4s // t29 = out29 + sqsub v25.4s, v25.4s, v28.4s // t26 + sqadd v28.4s, v24.4s, v19.4s // t28a = out28 + sqsub v24.4s, v24.4s, v19.4s // t27a + mov v19.16b, v7.16b // out19 + +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 + mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 + srshr v20.4s, v7.4s, #12 // t20 + srshr v22.4s, v6.4s, #12 // t27 + + mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a + mov v27.16b, v22.16b // t27 + srshr v26.4s, v7.4s, #12 // t26a + + mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 + mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 + srshr v21.4s, v6.4s, #12 // 
t21a + srshr v22.4s, v24.4s, #12 // t22 + srshr v25.4s, v7.4s, #12 // t25 + + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a + srshr v23.4s, v7.4s, #12 // t23a + srshr v24.4s, v6.4s, #12 // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + mov x14, x30 + movi v7.4s, #0 + lsl x8, x8, #1 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_4s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 + transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 + +.macro store1 r0, r1, r2, r3 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + st1 {\r2}, [x6], #16 + st1 {\r3}, [x6], #16 +.endm + store1 v16.4s, v20.4s, v24.4s, v28.4s + store1 v17.4s, v21.4s, v25.4s, v29.4s + store1 v18.4s, v22.4s, v26.4s, v30.4s + store1 v19.4s, v23.4s, v27.4s, v31.4s +.purgem store1 + sub x6, x6, #64*4 + + movi v7.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in v0.s[1] + scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_4s_x16_neon + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 + transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 +.macro store2 r0, r1, r2, r3, shift + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] + sqsub v4.4s, v0.4s, \r0 + sqadd v0.4s, v0.4s, \r0 + sqsub v5.4s, v1.4s, \r1 + sqadd v1.4s, v1.4s, \r1 + sqsub v6.4s, v2.4s, \r2 + sqadd v2.4s, v2.4s, \r2 + sqsub v7.4s, v3.4s, \r3 + sqadd v3.4s, v3.4s, \r3 + sqrshrn v0.4h, v0.4s, #\shift + sqrshrn2 v0.8h, v1.4s, #\shift + sqrshrn v1.4h, v2.4s, #\shift + sqrshrn2 v1.8h, v3.4s, #\shift + sqrshrn v2.4h, v7.4s, #\shift + sqrshrn2 v2.8h, v6.4s, #\shift + sqrshrn v3.4h, v5.4s, #\shift + sqrshrn2 v3.8h, v4.4s, #\shift + st1 {v0.8h, v1.8h}, [x6], #32 + rev64 v2.8h, v2.8h + rev64 v3.8h, v3.8h + st1 {v2.8h, v3.8h}, [x6], #32 +.endm + + store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift + store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift + store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift + store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift +.purgem store2 + ret x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 
+.endr + sub x7, x7, x8, lsl #4 + + bl X(inv_dct_8h_x16_neon) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl X(inv_dct32_odd_8h_x16_neon) + + neg x9, x8 + mov x10, x6 + mvni v1.8h, #0xfc, lsl #8 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8h}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8h}, [x10], x1 + \op v5.8h, v5.8h, \r0 + ld1 {v7.8h}, [x7], \stride + ld1 {v4.8h}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + usqadd v2.8h, v5.8h + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + ld1 {v5.8h}, [x7], \stride + usqadd v3.8h, v6.8h + smin v2.8h, v2.8h, v1.8h + srshr v7.8h, v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8h}, [x6], x1 + ld1 {v2.8h}, [x10], x1 + usqadd v4.8h, v7.8h + smin v3.8h, v3.8h, v1.8h + srshr v5.8h, v5.8h, #4 + st1 {v3.8h}, [x6], x1 + usqadd v2.8h, v5.8h + smin v4.8h, v4.8h, v1.8h + st1 {v4.8h}, [x6], x1 + smin v2.8h, v2.8h, v1.8h + st1 {v2.8h}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + ret x14 +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + .short 10, 43, 75, 107, 139, 171, 203, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + movi v0.8h, #0 + movi v1.8h, #0 + movrel x13, eob_32x32, 2 + + mov x8, #4*32 +1: + mov w9, #0 + movrel x12, eob_32x32, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc + +.macro shift_16_regs op, shift +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, 
v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movz w16, #2896*8, lsl #16 + movz w17, #2*(5793-4096)*8, lsl #16 + movi v0.4s, #0 + movi v1.4s, #0 + movrel x13, eob_16x32\hshort, 2 + + mov x8, #4*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + dup v2.2s, w16 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + mov v2.s[1], w17 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + +.if \w == 16 + // 16x32 + identity_4x16_shift1 v2.s[1] +.else + // 32x16 + shift_16_regs sqshl, 1 + identity_4x16 v2.s[1] +.endif + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #16 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movi v0.4s, #0 + movi v1.4s, #0 + // Working on 8x8 blocks, read every other entry from eob_8x32 + movrel x13, eob_8x32, 2 + + mov w8, #4*\h +1: + // Working on 8x8 blocks, read every other entry from eob_8x32 + ldrh w12, [x13], #4 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + +.if \w == 8 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn2 v16.8h, v17.4s, #1 + sqrshrn v17.4h, v18.4s, #1 + sqrshrn2 v17.8h, v19.4s, #1 + sqrshrn v18.4h, v20.4s, #1 + sqrshrn2 v18.8h, v21.4s, #1 + sqrshrn v19.4h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + sqrshrn v20.4h, v24.4s, #1 + sqrshrn2 v20.8h, v25.4s, #1 + sqrshrn v21.4h, v26.4s, #1 + sqrshrn2 v21.8h, v27.4s, #1 + sqrshrn v22.4h, v28.4s, #1 + sqrshrn2 v22.8h, v29.4s, #1 + sqrshrn v23.4h, v30.4s, #1 + sqrshrn2 v23.8h, v31.4s, #1 +.else + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 
v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #4*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_4s_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #4*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + movrel x13, eob_16x32 + movrel x5, X(inv_dct_8h_x16_neon) + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #4*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.4s, #0 + mov x8, #4*32 + mov w9, #32 + mov x6, sp + mov x7, x2 +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().4s}, [x7] + st1 {v28.4s}, [x7], x8 +.endr + ldrh w12, [x13], #2 + sub w9, w9, #4 + sub x7, x7, x8, lsl #3 + add x7, x7, #4*4 + + bl inv_dct_4s_x8_neon + + sqrshrn v16.4h, v16.4s, #2 + sqrshrn v17.4h, v17.4s, #2 + sqrshrn v18.4h, v18.4s, #2 + sqrshrn v19.4h, v19.4s, #2 + sqrshrn2 
v16.8h, v20.4s, #2 + sqrshrn2 v17.8h, v21.4s, #2 + sqrshrn2 v18.8h, v22.4s, #2 + sqrshrn2 v19.8h, v23.4s, #2 + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + cmp w3, w12 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #4 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + +.irp i, 0, 4 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + cmp w3, #10 + b.lt 1f +.endif + mov x8, #8*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + +2: + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9, lsl #1 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr + add w9, w9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + ret x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.4s, v1.4s}, [x17], #32 + + sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a + sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a + sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a + sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a + sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a + sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a + sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a + sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a + + ld1 {v0.4s}, [x17], #16 + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t33 + sqsub v26.4s, v19.4s, v18.4s // t34 + sqadd v27.4s, v19.4s, v18.4s // t35 + sqadd v28.4s, v20.4s, v21.4s // t60 + sqsub v29.4s, v20.4s, v21.4s // t61 + sqsub v30.4s, v23.4s, v22.4s // t62 + sqadd v31.4s, v23.4s, v22.4s // t63 + +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a + mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a + neg v2.4s, v2.4s // t34a + mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a + srshr v26.4s, v2.4s, #12 // t34a + mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a + srshr v29.4s, v7.4s, #12 // t61a + srshr v25.4s, v6.4s, #12 // t33a + srshr v30.4s, v2.4s, #12 // t62a + + sqadd v16.4s, v24.4s, v27.4s // t32a + sqsub v19.4s, v24.4s, v27.4s // t35a + sqadd v17.4s, v25.4s, v26.4s // t33 + sqsub v18.4s, v25.4s, v26.4s // t34 + sqsub v20.4s, v31.4s, v28.4s // t60a + sqadd v23.4s, v31.4s, v28.4s // t63a + sqsub v21.4s, v30.4s, v29.4s // t61 + sqadd v22.4s, v30.4s, v29.4s // t62 + +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a + mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 + srshr v21.4s, v2.4s, #12 // t61a + srshr v18.4s, v7.4s, #12 // t34a + mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 + srshr v20.4s, v6.4s, #12 // t60 + srshr v19.4s, v2.4s, #12 // t35 + + st1 
{v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #4*4*0] // t32a + ldr q17, [x9, #4*4*8] // t39a + ldr q18, [x9, #4*4*0] // t63a + ldr q19, [x6, #4*4*8] // t56a + ldr q20, [x6, #4*4*16] // t40a + ldr q21, [x9, #4*4*24] // t47a + ldr q22, [x9, #4*4*16] // t55a + ldr q23, [x6, #4*4*24] // t48a + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t39 + sqadd v26.4s, v18.4s, v19.4s // t63 + sqsub v27.4s, v18.4s, v19.4s // t56 + sqsub v28.4s, v21.4s, v20.4s // t40 + sqadd v29.4s, v21.4s, v20.4s // t47 + sqadd v30.4s, v23.4s, v22.4s // t48 + sqsub v31.4s, v23.4s, v22.4s // t55 + +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a + mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a + srshr v25.4s, v2.4s, #12 // t56a + srshr v27.4s, v7.4s, #12 // t39a + neg v6.4s, v6.4s // t40a + mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a + srshr v31.4s, v6.4s, #12 // t40a + srshr v28.4s, v2.4s, #12 // t55a + + sqadd v16.4s, v24.4s, v29.4s // t32a + sqsub v19.4s, v24.4s, v29.4s // t47a + sqadd v17.4s, v27.4s, v31.4s // t39 + sqsub v18.4s, v27.4s, v31.4s // t40 + sqsub v20.4s, v26.4s, v30.4s // t48a + sqadd v23.4s, v26.4s, v30.4s // t63a + sqsub v21.4s, v25.4s, v28.4s // t55 + sqadd v22.4s, v25.4s, v28.4s // t56 + +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a + mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 + srshr v18.4s, v2.4s, #12 // t40a + srshr v21.4s, v7.4s, #12 // t55a + mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 + srshr v19.4s, v6.4s, #12 // t47 + srshr v20.4s, v2.4s, #12 // t48 + + str q16, [x6, #4*4*0] // t32a + str q17, [x9, #4*4*0] // t39 + str q18, [x6, #4*4*8] // t40a + str q19, [x9, #4*4*8] // t47 + str q20, [x6, #4*4*16] // t48 + str q21, [x9, #4*4*16] // t55a + str q22, [x6, #4*4*24] // t56 + str q23, [x9, #4*4*24] // t63a + + add x6, x6, #4*4 + sub x9, x9, #4*4 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movz16dup_if reg, gpr, val, cond +.if \cond + movz \gpr, \val, lsl #16 + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif 
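+// Note: the *_if helpers above expand to their instruction only when the
+// trailing `cond` macro argument is nonzero; def_dct64_func below relies on
+// them so that a single macro body assembles into the plain, coefficient-
+// clearing (\clear) and input-scaling (\scale) variants of the 64-point DCT
+// without any runtime branching. As a rough, illustrative C-style sketch of
+// what one \clear/\scale coefficient load amounts to (identifiers are
+// hypothetical, not dav1d's reference code):
+//
+//   int32_t c = coef[i];             // ld1 {vN.4s}, [x7]
+//   if (CLEAR)                       // st1_if/str_if/stroff_if write back
+//       coef[i] = 0;                 //   the zeroed v7 register
+//   if (SCALE)                       // scale_if -> sqrdmulh by 2896*8 << 16,
+//       c = (c * 2896 + 2048) >> 12; //   i.e. roughly c * 1/sqrt(2)
+//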
+.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4s_x64_neon + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_4s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + + store16 x6 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_4s_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.4s}, [x6], #16 + ld1 {v3.4s}, [x6], #16 + sqadd v6.4s, v2.4s, \r0 + sqsub \r0, v2.4s, \r0 + ld1 {v4.4s}, [x6], #16 + sqadd v7.4s, v3.4s, \r1 + sqsub \r1, v3.4s, \r1 + smin v6.4s, v6.4s, v1.4s + smin \r0, \r0, v1.4s + ld1 {v5.4s}, [x6], #16 + sqadd v2.4s, v4.4s, \r2 + sub x6, x6, #16*4 + smax v6.4s, v6.4s, v0.4s + smax \r0, \r0, v0.4s + sqsub \r2, v4.4s, \r2 + smin v7.4s, v7.4s, v1.4s + smin \r1, \r1, v1.4s + st1 {v6.4s}, [x6], #16 + st1 {\r0}, [x10], x9 + smin v2.4s, v2.4s, v1.4s + smin \r2, \r2, v1.4s + smax v7.4s, v7.4s, v0.4s + smax \r1, \r1, v0.4s + sqadd v3.4s, v5.4s, \r3 + sqsub \r3, v5.4s, \r3 + smax v2.4s, v2.4s, v0.4s + smax \r2, \r2, v0.4s + smin v3.4s, v3.4s, v1.4s + smin \r3, \r3, v1.4s + st1 {v7.4s}, [x6], #16 + st1 {\r1}, [x10], x9 + smax v3.4s, v3.4s, v0.4s + smax \r3, \r3, v0.4s + st1 {v2.4s}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.4s}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.4s, v30.4s, v29.4s, v28.4s + store_addsub v27.4s, v26.4s, v25.4s, v24.4s + store_addsub v23.4s, v22.4s, v21.4s, v20.4s + store_addsub v19.4s, v18.4s, v17.4s, v16.4s +.purgem store_addsub + + add x6, x6, #4*4*16 + + movrel x17, idct64_coeffs + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.4s}, [x7] // in1 (offset 0) + ld1 {v17.4s}, [x9] // in31 (offset 15) + ld1 {v18.4s}, [x10] // in17 (offset 8) + ld1 {v19.4s}, [x11] // in15 (offset 7) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.4s}, [x10] // in7 (offset 3) + ld1 {v17.4s}, [x11] // 
in25 (offset 12) + ld1 {v18.4s}, [x9] // in23 (offset 11) + ld1 {v19.4s}, [x7] // in9 (offset 4) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, [x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #4*4*32 + add x9, x6, #4*4*7 + + bl inv_dct64_step2_neon + + ret x14 +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x4_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #4*4*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-4*4*4 + + dup v7.4s, w12 +1: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.4s, \src0, \src1 + sqadd v0.4s, \src0, \src1 + sqsub v3.4s, \src2, \src3 + srshl v1.4s, v1.4s, v7.4s + sqadd v2.4s, \src2, \src3 + srshl v3.4s, v3.4s, v7.4s + srshl v0.4s, v0.4s, v7.4s + srshl v2.4s, v2.4s, v7.4s + sqxtn v3.4h, v3.4s + sqxtn2 v3.8h, v1.4s + sqxtn v0.4h, v0.4s + sqxtn2 v0.8h, v2.4s + rev64 v3.8h, v3.8h + st1 {v0.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.4s, v31.4s, v20.4s, v27.4s + store_addsub v17.4s, v30.4s, v21.4s, v26.4s + store_addsub v18.4s, v29.4s, v22.4s, v25.4s + store_addsub v19.4s, v28.4s, v23.4s, v24.4s +.purgem store_addsub + sub x6, x6, x10, lsl #2 + sub x9, x9, x10, lsl #2 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + ret x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + + mvni v7.8h, #0xfc, lsl #8 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8h}, [x6], x1 + ld1 {v1.8h}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8h}, [x6] + sqsub \src0, \src0, \src1 + ld1 {v3.8h}, [x9] + sqadd v5.8h, \src2, \src3 + sqsub \src2, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + 
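+// The remainder of add_dest_addsub is the usual 16bpc output stage: each
+// residual is rounded down by 4 bits (srshr), accumulated onto the
+// destination pixel with an unsigned saturating add (usqadd, which also
+// clips negative results to 0), then clamped against the pixel maximum held
+// in v7. A rough C-style sketch of one lane (identifiers are illustrative,
+// not dav1d's reference code):
+//
+//   int r   = (resid + 8) >> 4;       // srshr #4
+//   int out = dst[x] + r;             // usqadd (saturates at 0)
+//   if (out > pix_max) out = pix_max; // smin with v7
+//   dst[x] = (uint16_t)out;
+//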
srshr v5.8h, v5.8h, #4 + srshr \src0, \src0, #4 + usqadd v0.8h, v4.8h + srshr \src2, \src2, #4 + usqadd v1.8h, \src0 + usqadd v2.8h, v5.8h + smin v0.8h, v0.8h, v7.8h + usqadd v3.8h, \src2 + smin v1.8h, v1.8h, v7.8h + st1 {v0.8h}, [x6], x1 + smin v2.8h, v2.8h, v7.8h + st1 {v1.8h}, [x9], x10 + smin v3.8h, v3.8h, v7.8h + st1 {v2.8h}, [x6], x1 + st1 {v3.8h}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + ret x14 +endfunc + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*4*4 + add x4, sp, #64*4*4 + + movrel x13, eob_16x32 + +.irp i, 0, 4, 8, 12 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, 
#16*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + movrel x5, X(inv_dct_8h_x16_neon) +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + ret x15 +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_4s_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + ret x15 +endfunc diff --git a/third_party/dav1d/src/arm/64/loopfilter.S b/third_party/dav1d/src/arm/64/loopfilter.S new file mode 100644 index 0000000000..63d5de10ad --- /dev/null +++ b/third_party/dav1d/src/arm/64/loopfilter.S @@ -0,0 +1,1129 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels +.macro loop_filter wd +function lpf_16_wd\wd\()_neon + uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) + uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0) + uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0) + uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1) +.if \wd >= 6 + uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1) + uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1) +.endif +.if \wd >= 8 + uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2) + uabd v7.16b, v27.16b, v26.16b // abs(q3 - q3) +.endif +.if \wd >= 6 + umax v4.16b, v4.16b, v5.16b +.endif + uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2 +.if \wd >= 8 + umax v6.16b, v6.16b, v7.16b +.endif + ushr v3.16b, v3.16b, #1 +.if \wd >= 8 + umax v4.16b, v4.16b, v6.16b +.endif +.if \wd >= 6 + and v4.16b, v4.16b, v14.16b +.endif + umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0)) + uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + umax v4.16b, v0.16b, v4.16b + cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + and v1.16b, v1.16b, v2.16b // fm + and v1.16b, v1.16b, v13.16b // fm && wd >= 4 +.if \wd >= 6 + and v14.16b, v14.16b, v1.16b // fm && wd > 4 +.endif +.if \wd >= 16 + and v15.16b, v15.16b, v1.16b // fm && wd == 16 +.endif + + mov x16, v1.d[0] + mov x17, v1.d[1] + adds x16, x16, x17 + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: +.if \wd >= 6 + movi v10.16b, #1 + uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) + uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0) + uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0) + uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0) +.if \wd >= 8 + uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0) + uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0) +.endif + umax v2.16b, v2.16b, v3.16b + umax v4.16b, v4.16b, v5.16b +.if \wd >= 8 + umax v6.16b, v6.16b, v7.16b +.endif + umax v2.16b, v2.16b, v4.16b +.if \wd >= 8 + umax v2.16b, v2.16b, v6.16b +.endif + +.if \wd == 16 + uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0) + uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0) + uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0) +.endif + cmhs v2.16b, v10.16b, v2.16b // flat8in +.if \wd == 16 + uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0) + uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0) + uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0) +.endif + and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 + bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in +.if \wd == 16 + umax v3.16b, v3.16b, v4.16b + umax v5.16b, v5.16b, v6.16b +.endif + mov x16, v1.d[0] + mov x17, v1.d[1] +.if \wd == 16 + umax v7.16b, v7.16b, v8.16b + umax v3.16b, v3.16b, v5.16b + umax v3.16b, v3.16b, v7.16b + cmhs v3.16b, v10.16b, v3.16b // flat8out +.endif + adds x16, x16, x17 +.if \wd == 16 + and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 + and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 + bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out +.endif + b.eq 1f // skip wd == 4 case +.endif + movi v3.16b, #128 + eor v2.16b, v22.16b, v3.16b // p1 - 128 + eor v3.16b, v25.16b, v3.16b // q1 - 128 + cmhi v0.16b, v0.16b, v12.16b // hev + sqsub v2.16b, 
v2.16b, v3.16b // iclip_diff(p1 - q1) + and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) + bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) + usubl v2.8h, v24.8b, v23.8b + movi v5.8h, #3 + usubl2 v3.8h, v24.16b, v23.16b + mul v2.8h, v2.8h, v5.8h + mul v3.8h, v3.8h, v5.8h + movi v6.16b, #4 + saddw v2.8h, v2.8h, v4.8b + saddw2 v3.8h, v3.8h, v4.16b + movi v7.16b, #3 + sqxtn v2.8b, v2.8h // f + sqxtn2 v2.16b, v3.8h + sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127) + sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127) + sshr v4.16b, v4.16b, #3 // f1 + sshr v5.16b, v5.16b, #3 // f2 + mov v2.16b, v23.16b // p0 + mov v3.16b, v24.16b // q0 + neg v6.16b, v4.16b // -f1 + srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1 + // p0 + f2, q0 - f1 + usqadd v2.16b, v5.16b // out p0 + usqadd v3.16b, v6.16b // out q0 + neg v6.16b, v4.16b // -((f1 + 1) >> 1) + bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) + bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4) + mov v2.16b, v22.16b // p1 + mov v3.16b, v25.16b // q1 + // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1) + usqadd v2.16b, v4.16b // out p1 + usqadd v3.16b, v6.16b // out q1 + bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 2f // skip if there's no flat8in + + uaddl v0.8h, v21.8b, v21.8b // p2 * 2 + uaddl2 v1.8h, v21.16b, v21.16b + uaddl v2.8h, v21.8b, v22.8b // p2 + p1 + uaddl2 v3.8h, v21.16b, v22.16b + uaddl v4.8h, v22.8b, v23.8b // p1 + p0 + uaddl2 v5.8h, v22.16b, v23.16b + uaddl v6.8h, v23.8b, v24.8b // p0 + q0 + uaddl2 v7.8h, v23.16b, v24.16b + add v8.8h, v0.8h, v2.8h + add v9.8h, v1.8h, v3.8h + add v10.8h, v4.8h, v6.8h + add v11.8h, v5.8h, v7.8h + uaddl v12.8h, v24.8b, v25.8b // q0 + q1 + uaddl2 v13.8h, v24.16b, v25.16b + add v8.8h, v8.8h, v10.8h + add v9.8h, v9.8h, v11.8h + sub v12.8h, v12.8h, v0.8h + sub v13.8h, v13.8h, v1.8h + uaddl v10.8h, v25.8b, v26.8b // q1 + q2 + uaddl2 v11.8h, v25.16b, v26.16b + rshrn v0.8b, v8.8h, #3 // out p1 + rshrn2 v0.16b, v9.8h, #3 + + add v8.8h, v8.8h, v12.8h + add v9.8h, v9.8h, v13.8h + sub v10.8h, v10.8h, v2.8h + sub v11.8h, v11.8h, v3.8h + uaddl v12.8h, v26.8b, v26.8b // q2 + q2 + uaddl2 v13.8h, v26.16b, v26.16b + rshrn v1.8b, v8.8h, #3 // out p0 + rshrn2 v1.16b, v9.8h, #3 + + add v8.8h, v8.8h, v10.8h + add v9.8h, v9.8h, v11.8h + sub v12.8h, v12.8h, v4.8h + sub v13.8h, v13.8h, v5.8h + rshrn v2.8b, v8.8h, #3 // out q0 + rshrn2 v2.16b, v9.8h, #3 + + bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) + add v8.8h, v8.8h, v12.8h + add v9.8h, v9.8h, v13.8h + bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) + rshrn v3.8b, v8.8h, #3 // out q1 + rshrn2 v3.16b, v9.8h, #3 + bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) + bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) +.elseif \wd >= 8 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 +.if \wd == 8 + b.eq 8f // skip if there's no flat8in +.else + b.eq 2f // skip if there's no flat8in +.endif + + uaddl v0.8h, v20.8b, v21.8b // p3 + p2 + uaddl2 v1.8h, v20.16b, v21.16b + uaddl v2.8h, v22.8b, v25.8b // p1 + q1 + uaddl2 v3.8h, v22.16b, v25.16b + uaddl v4.8h, v20.8b, v22.8b // p3 + p1 + uaddl2 v5.8h, v20.16b, v22.16b + uaddl v6.8h, v23.8b, v26.8b // p0 + q2 + uaddl2 v7.8h, v23.16b, v26.16b + add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) + add v9.8h, v1.8h, v1.8h + uaddw v8.8h, v8.8h, v23.8b // + p0 + uaddw2 v9.8h, v9.8h, v23.16b + uaddw v8.8h, v8.8h, v24.8b // + q0 + uaddw2 v9.8h, v9.8h, v24.16b + add v8.8h, v8.8h, 
v4.8h + add v9.8h, v9.8h, v5.8h // + p3 + p1 + sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 + sub v3.8h, v3.8h, v1.8h + sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 + sub v7.8h, v7.8h, v5.8h + rshrn v10.8b, v8.8h, #3 // out p2 + rshrn2 v10.16b, v9.8h, #3 + + add v8.8h, v8.8h, v2.8h + add v9.8h, v9.8h, v3.8h + uaddl v0.8h, v20.8b, v23.8b // p3 + p0 + uaddl2 v1.8h, v20.16b, v23.16b + uaddl v2.8h, v24.8b, v27.8b // q0 + q3 + uaddl2 v3.8h, v24.16b, v27.16b + rshrn v11.8b, v8.8h, #3 // out p1 + rshrn2 v11.16b, v9.8h, #3 + + add v8.8h, v8.8h, v6.8h + add v9.8h, v9.8h, v7.8h + sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 + sub v3.8h, v3.8h, v1.8h + uaddl v4.8h, v21.8b, v24.8b // p2 + q0 + uaddl2 v5.8h, v21.16b, v24.16b + uaddl v6.8h, v25.8b, v27.8b // q1 + q3 + uaddl2 v7.8h, v25.16b, v27.16b + rshrn v12.8b, v8.8h, #3 // out p0 + rshrn2 v12.16b, v9.8h, #3 + + add v8.8h, v8.8h, v2.8h + add v9.8h, v9.8h, v3.8h + sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 + sub v7.8h, v7.8h, v5.8h + uaddl v0.8h, v22.8b, v25.8b // p1 + q1 + uaddl2 v1.8h, v22.16b, v25.16b + uaddl v2.8h, v26.8b, v27.8b // q2 + q3 + uaddl2 v3.8h, v26.16b, v27.16b + rshrn v13.8b, v8.8h, #3 // out q0 + rshrn2 v13.16b, v9.8h, #3 + + add v8.8h, v8.8h, v6.8h + add v9.8h, v9.8h, v7.8h + sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 + sub v3.8h, v3.8h, v1.8h + rshrn v0.8b, v8.8h, #3 // out q1 + rshrn2 v0.16b, v9.8h, #3 + + add v8.8h, v8.8h, v2.8h + add v9.8h , v9.8h, v3.8h + + bit v21.16b, v10.16b, v14.16b + bit v22.16b, v11.16b, v14.16b + bit v23.16b, v12.16b, v14.16b + rshrn v1.8b, v8.8h, #3 // out q2 + rshrn2 v1.16b, v9.8h, #3 + bit v24.16b, v13.16b, v14.16b + bit v25.16b, v0.16b, v14.16b + bit v26.16b, v1.16b, v14.16b +.endif +2: +.if \wd == 16 + mov x16, v15.d[0] + mov x17, v15.d[1] + adds x16, x16, x17 + b.ne 1f // check if flat8out is needed + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + uaddl v2.8h, v17.8b, v17.8b // p6 + p6 + uaddl2 v3.8h, v17.16b, v17.16b + uaddl v4.8h, v17.8b, v18.8b // p6 + p5 + uaddl2 v5.8h, v17.16b, v18.16b + uaddl v6.8h, v17.8b, v19.8b // p6 + p4 + uaddl2 v7.8h, v17.16b, v19.16b + uaddl v8.8h, v17.8b, v20.8b // p6 + p3 + uaddl2 v9.8h, v17.16b, v20.16b + add v12.8h, v2.8h, v4.8h + add v13.8h, v3.8h, v5.8h + add v10.8h, v6.8h, v8.8h + add v11.8h, v7.8h, v9.8h + uaddl v6.8h, v17.8b, v21.8b // p6 + p2 + uaddl2 v7.8h, v17.16b, v21.16b + add v12.8h, v12.8h, v10.8h + add v13.8h, v13.8h, v11.8h + uaddl v8.8h, v17.8b, v22.8b // p6 + p1 + uaddl2 v9.8h, v17.16b, v22.16b + uaddl v10.8h, v18.8b, v23.8b // p5 + p0 + uaddl2 v11.8h, v18.16b, v23.16b + add v6.8h, v6.8h, v8.8h + add v7.8h, v7.8h, v9.8h + uaddl v8.8h, v19.8b, v24.8b // p4 + q0 + uaddl2 v9.8h, v19.16b, v24.16b + add v12.8h, v12.8h, v6.8h + add v13.8h, v13.8h, v7.8h + add v10.8h, v10.8h, v8.8h + add v11.8h, v11.8h, v9.8h + uaddl v6.8h, v20.8b, v25.8b // p3 + q1 + uaddl2 v7.8h, v20.16b, v25.16b + add v12.8h, v12.8h, v10.8h + add v13.8h, v13.8h, v11.8h + sub v6.8h, v6.8h, v2.8h + sub v7.8h, v7.8h, v3.8h + uaddl v2.8h, v21.8b, v26.8b // p2 + q2 + uaddl2 v3.8h, v21.16b, v26.16b + rshrn v0.8b, v12.8h, #4 // out p5 + rshrn2 v0.16b, v13.8h, #4 + add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) + add v13.8h, v13.8h, v7.8h + sub v2.8h, v2.8h, v4.8h + sub v3.8h, v3.8h, v5.8h + uaddl v4.8h, v22.8b, v27.8b // p1 + q3 + uaddl2 v5.8h, v22.16b, v27.16b + uaddl v6.8h, v17.8b, v19.8b // p6 + p4 + uaddl2 v7.8h, v17.16b, v19.16b + rshrn 
v1.8b, v12.8h, #4 // out p4 + rshrn2 v1.16b, v13.8h, #4 + add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) + add v13.8h, v13.8h, v3.8h + sub v4.8h, v4.8h, v6.8h + sub v5.8h, v5.8h, v7.8h + uaddl v6.8h, v23.8b, v28.8b // p0 + q4 + uaddl2 v7.8h, v23.16b, v28.16b + uaddl v8.8h, v17.8b, v20.8b // p6 + p3 + uaddl2 v9.8h, v17.16b, v20.16b + rshrn v2.8b, v12.8h, #4 // out p3 + rshrn2 v2.16b, v13.8h, #4 + add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) + add v13.8h, v13.8h, v5.8h + sub v6.8h, v6.8h, v8.8h + sub v7.8h, v7.8h, v9.8h + uaddl v8.8h, v24.8b, v29.8b // q0 + q5 + uaddl2 v9.8h, v24.16b, v29.16b + uaddl v4.8h, v17.8b, v21.8b // p6 + p2 + uaddl2 v5.8h, v17.16b, v21.16b + rshrn v3.8b, v12.8h, #4 // out p2 + rshrn2 v3.16b, v13.8h, #4 + add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) + add v13.8h, v13.8h, v7.8h + sub v8.8h, v8.8h, v4.8h + sub v9.8h, v9.8h, v5.8h + uaddl v6.8h, v25.8b, v30.8b // q1 + q6 + uaddl2 v7.8h, v25.16b, v30.16b + uaddl v10.8h, v17.8b, v22.8b // p6 + p1 + uaddl2 v11.8h, v17.16b, v22.16b + rshrn v4.8b, v12.8h, #4 // out p1 + rshrn2 v4.16b, v13.8h, #4 + add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) + add v13.8h, v13.8h, v9.8h + sub v6.8h, v6.8h, v10.8h + sub v7.8h, v7.8h, v11.8h + uaddl v8.8h, v26.8b, v30.8b // q2 + q6 + uaddl2 v9.8h, v26.16b, v30.16b + bif v0.16b, v18.16b, v15.16b // out p5 + uaddl v10.8h, v18.8b, v23.8b // p5 + p0 + uaddl2 v11.8h, v18.16b, v23.16b + rshrn v5.8b, v12.8h, #4 // out p0 + rshrn2 v5.16b, v13.8h, #4 + add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) + add v13.8h, v13.8h, v7.8h + sub v8.8h, v8.8h, v10.8h + sub v9.8h, v9.8h, v11.8h + uaddl v10.8h, v27.8b, v30.8b // q3 + q6 + uaddl2 v11.8h, v27.16b, v30.16b + bif v1.16b, v19.16b, v15.16b // out p4 + uaddl v18.8h, v19.8b, v24.8b // p4 + q0 + uaddl2 v19.8h, v19.16b, v24.16b + rshrn v6.8b, v12.8h, #4 // out q0 + rshrn2 v6.16b, v13.8h, #4 + add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) + add v13.8h, v13.8h, v9.8h + sub v10.8h, v10.8h, v18.8h + sub v11.8h, v11.8h, v19.8h + uaddl v8.8h, v28.8b, v30.8b // q4 + q6 + uaddl2 v9.8h, v28.16b, v30.16b + bif v2.16b, v20.16b, v15.16b // out p3 + uaddl v18.8h, v20.8b, v25.8b // p3 + q1 + uaddl2 v19.8h, v20.16b, v25.16b + rshrn v7.8b, v12.8h, #4 // out q1 + rshrn2 v7.16b, v13.8h, #4 + add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) + add v13.8h, v13.8h, v11.8h + sub v18.8h, v8.8h, v18.8h + sub v19.8h, v9.8h, v19.8h + uaddl v10.8h, v29.8b, v30.8b // q5 + q6 + uaddl2 v11.8h, v29.16b, v30.16b + bif v3.16b, v21.16b, v15.16b // out p2 + uaddl v20.8h, v21.8b, v26.8b // p2 + q2 + uaddl2 v21.8h, v21.16b, v26.16b + rshrn v8.8b, v12.8h, #4 // out q2 + rshrn2 v8.16b, v13.8h, #4 + add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) + add v13.8h, v13.8h, v19.8h + sub v10.8h, v10.8h, v20.8h + sub v11.8h, v11.8h, v21.8h + uaddl v18.8h, v30.8b, v30.8b // q6 + q6 + uaddl2 v19.8h, v30.16b, v30.16b + bif v4.16b, v22.16b, v15.16b // out p1 + uaddl v20.8h, v22.8b, v27.8b // p1 + q3 + uaddl2 v21.8h, v22.16b, v27.16b + rshrn v9.8b, v12.8h, #4 // out q3 + rshrn2 v9.16b, v13.8h, #4 + add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) + add v13.8h, v13.8h, v11.8h + sub v18.8h, v18.8h, v20.8h + sub v19.8h, v19.8h, v21.8h + bif v5.16b, v23.16b, v15.16b // out p0 + rshrn v10.8b, v12.8h, #4 // out q4 + rshrn2 v10.16b, v13.8h, #4 + add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) + add v13.8h, v13.8h, v19.8h + rshrn v11.8b, v12.8h, #4 // out q5 + rshrn2 v11.16b, v13.8h, #4 + bif v6.16b, v24.16b, v15.16b // out q0 + bif v7.16b, v25.16b, v15.16b // 
out q1 + bif v8.16b, v26.16b, v15.16b // out q2 + bif v9.16b, v27.16b, v15.16b // out q3 + bif v10.16b, v28.16b, v15.16b // out q4 + bif v11.16b, v29.16b, v15.16b // out q5 +.endif + + mov x14, #0 + ret +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + mov x14, #(1 << 6) + ret +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + mov x14, #(1 << 4) + ret +.endif +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_16_wd16 + bl lpf_16_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: +.endm + +.macro lpf_16_wd8 + bl lpf_16_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: +.endm + +.macro lpf_16_wd6 + bl lpf_16_wd6_neon + cbz x14, 1f + ret x15 +1: +.endm + +.macro lpf_16_wd4 + bl lpf_16_wd4_neon + cbz x14, 1f + ret x15 +1: +.endm + +function lpf_v_4_16_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + + lpf_16_wd4 + + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_4_16_neon + mov x15, x30 + sub x16, x0, #2 + add x0, x16, x1, lsl #3 + ld1 {v22.s}[0], [x16], x1 + ld1 {v22.s}[2], [x0], x1 + ld1 {v23.s}[0], [x16], x1 + ld1 {v23.s}[2], [x0], x1 + ld1 {v24.s}[0], [x16], x1 + ld1 {v24.s}[2], [x0], x1 + ld1 {v25.s}[0], [x16], x1 + ld1 {v25.s}[2], [x0], x1 + ld1 {v22.s}[1], [x16], x1 + ld1 {v22.s}[3], [x0], x1 + ld1 {v23.s}[1], [x16], x1 + ld1 {v23.s}[3], [x0], x1 + ld1 {v24.s}[1], [x16], x1 + ld1 {v24.s}[3], [x0], x1 + ld1 {v25.s}[1], [x16], x1 + ld1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_16_wd4 + + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + ret x15 +endfunc + +function lpf_v_6_16_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + ld1 {v21.16b}, [x16], x1 // p2 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v26.16b}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + + lpf_16_wd6 + + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_6_16_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #3 + ld1 {v20.d}[0], [x16], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v21.d}[0], [x16], x1 + ld1 {v21.d}[1], [x0], x1 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v26.d}[0], [x16], x1 + ld1 
{v26.d}[1], [x0], x1 + ld1 {v27.d}[0], [x16], x1 + ld1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_16_wd6 + + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + ret x15 +endfunc + +function lpf_v_8_16_neon + mov x15, x30 + sub x16, x0, x1, lsl #2 + ld1 {v20.16b}, [x16], x1 // p3 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v21.16b}, [x16], x1 // p2 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v27.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + + lpf_16_wd8 + + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + st1 {v21.16b}, [x16], x1 // p2 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v25.16b}, [x0], x1 // q1 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v26.16b}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + ret x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_8_16_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #3 + ld1 {v20.d}[0], [x16], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v21.d}[0], [x16], x1 + ld1 {v21.d}[1], [x0], x1 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v26.d}[0], [x16], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v27.d}[0], [x16], x1 + ld1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_16_wd8 + + sub x16, x0, x1, lsl #4 + sub x16, x16, #4 + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v20.d}[0], [x16], x1 + st1 {v20.d}[1], [x0], x1 + st1 {v21.d}[0], [x16], x1 + st1 {v21.d}[1], [x0], x1 + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + st1 {v26.d}[0], [x16], x1 + st1 {v26.d}[1], [x0], x1 + st1 {v27.d}[0], [x16], x1 + st1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + ret x15 +8: + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + ret x15 +endfunc + +function lpf_v_16_16_neon + mov x15, x30 + + sub 
x16, x0, x1, lsl #3 + add x16, x16, x1 + ld1 {v17.16b}, [x16], x1 // p6 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v18.16b}, [x16], x1 // p5 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v19.16b}, [x16], x1 // p4 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v20.16b}, [x16], x1 // p3 + ld1 {v27.16b}, [x0], x1 // q3 + ld1 {v21.16b}, [x16], x1 // p2 + ld1 {v28.16b}, [x0], x1 // q4 + ld1 {v22.16b}, [x16], x1 // p1 + ld1 {v29.16b}, [x0], x1 // q5 + ld1 {v23.16b}, [x16], x1 // p0 + ld1 {v30.16b}, [x0], x1 // q6 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + lpf_16_wd16 + + sub x16, x0, x1, lsl #2 + sub x16, x16, x1, lsl #1 + st1 {v0.16b}, [x16], x1 // p5 + st1 {v6.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x16], x1 // p4 + st1 {v7.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x16], x1 // p3 + st1 {v8.16b}, [x0], x1 // q2 + st1 {v3.16b}, [x16], x1 // p2 + st1 {v9.16b}, [x0], x1 // q3 + st1 {v4.16b}, [x16], x1 // p1 + st1 {v10.16b}, [x0], x1 // q4 + st1 {v5.16b}, [x16], x1 // p0 + st1 {v11.16b}, [x0], x1 // q5 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + ret x15 +7: + sub x16, x0, x1 + sub x16, x16, x1, lsl #1 + st1 {v21.16b}, [x16], x1 // p2 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v25.16b}, [x0], x1 // q1 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v26.16b}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + ret x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.16b}, [x16], x1 // p1 + st1 {v24.16b}, [x0], x1 // q0 + st1 {v23.16b}, [x16], x1 // p0 + st1 {v25.16b}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_16_16_neon + mov x15, x30 + sub x16, x0, #8 + ld1 {v16.d}[0], [x16], x1 + ld1 {v24.d}[0], [x0], x1 + ld1 {v17.d}[0], [x16], x1 + ld1 {v25.d}[0], [x0], x1 + ld1 {v18.d}[0], [x16], x1 + ld1 {v26.d}[0], [x0], x1 + ld1 {v19.d}[0], [x16], x1 + ld1 {v27.d}[0], [x0], x1 + ld1 {v20.d}[0], [x16], x1 + ld1 {v28.d}[0], [x0], x1 + ld1 {v21.d}[0], [x16], x1 + ld1 {v29.d}[0], [x0], x1 + ld1 {v22.d}[0], [x16], x1 + ld1 {v30.d}[0], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v31.d}[0], [x0], x1 + ld1 {v16.d}[1], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v17.d}[1], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v18.d}[1], [x16], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v19.d}[1], [x16], x1 + ld1 {v27.d}[1], [x0], x1 + ld1 {v20.d}[1], [x16], x1 + ld1 {v28.d}[1], [x0], x1 + ld1 {v21.d}[1], [x16], x1 + ld1 {v29.d}[1], [x0], x1 + ld1 {v22.d}[1], [x16], x1 + ld1 {v30.d}[1], [x0], x1 + ld1 {v23.d}[1], [x16], x1 + ld1 {v31.d}[1], [x0], x1 + + transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + lpf_16_wd16 + + sub x0, x0, x1, lsl #4 + sub x16, x0, #8 + + transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 + transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 + + st1 {v16.d}[0], [x16], x1 + st1 {v6.d}[0], [x0], x1 + st1 {v17.d}[0], [x16], x1 + st1 {v7.d}[0], [x0], x1 + st1 {v0.d}[0], [x16], x1 + st1 {v8.d}[0], [x0], x1 + st1 {v1.d}[0], [x16], x1 + st1 {v9.d}[0], [x0], x1 + st1 {v2.d}[0], [x16], x1 + st1 {v10.d}[0], [x0], x1 + st1 {v3.d}[0], [x16], x1 + st1 {v11.d}[0], [x0], x1 + st1 {v4.d}[0], [x16], x1 + st1 {v30.d}[0], [x0], x1 + st1 {v5.d}[0], [x16], x1 + st1 {v31.d}[0], [x0], x1 + st1 {v16.d}[1], [x16], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v17.d}[1], [x16], x1 + st1 {v7.d}[1], [x0], x1 + st1 {v0.d}[1], [x16], x1 + st1 {v8.d}[1], [x0], x1 + st1 {v1.d}[1], [x16], x1 + st1 {v9.d}[1], [x0], x1 + st1 {v2.d}[1], [x16], x1 + st1 {v10.d}[1], [x0], x1 + st1 {v3.d}[1], [x16], x1 + st1 {v11.d}[1], 
[x0], x1 + st1 {v4.d}[1], [x16], x1 + st1 {v30.d}[1], [x0], x1 + st1 {v5.d}[1], [x16], x1 + st1 {v31.d}[1], [x0], x1 + ret x15 + +7: + sub x16, x0, x1, lsl #4 + sub x16, x16, #4 + transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v20.d}[0], [x16], x1 + st1 {v20.d}[1], [x0], x1 + st1 {v21.d}[0], [x16], x1 + st1 {v21.d}[1], [x0], x1 + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + st1 {v26.d}[0], [x16], x1 + st1 {v26.d}[1], [x0], x1 + st1 {v27.d}[0], [x16], x1 + st1 {v27.d}[1], [x0], x1 + add x0, x0, #4 + ret x15 +8: + sub x16, x0, x1, lsl #4 + sub x16, x16, #2 + transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #3 + + st1 {v22.s}[0], [x16], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x16], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x16], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x16], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x16], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x16], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x16], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x16], x1 + st1 {v25.s}[3], [x0], x1 + add x0, x0, #2 + ret x15 +endfunc + +// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 + mov x11, x30 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp w6, w7, [x2] // vmask[0], vmask[1] +.ifc \type, y + ldr w2, [x2, #8] // vmask[2] +.endif + add x5, x5, #128 // Move to sharp part of lut +.ifc \type, y + orr w7, w7, w2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub x4, x3, x4, lsl #2 +.else + sub x3, x3, #4 + lsl x4, x4, #2 +.endif + orr w6, w6, w7 // vmask[0] |= vmask[1] + +1: + tst w6, #0x0f +.ifc \dir, v + ld1 {v0.16b}, [x4], #16 + ld1 {v1.16b}, [x3], #16 +.else + ld2 {v0.s,v1.s}[0], [x3], x4 + ld2 {v0.s,v1.s}[1], [x3], x4 + ld2 {v0.s,v1.s}[2], [x3], x4 + ld2 {v0.s,v1.s}[3], [x3], x4 +.endif + b.eq 7f // if (!(vm & bits)) continue; + + ld1r {v5.16b}, [x5] // sharp[0] + add x5, x5, #8 + movi v2.4s, #0xff + dup v13.4s, w6 // vmask[0] + + and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word + and v1.16b, v1.16b, v2.16b + cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0] + movi v4.16b, #1 + ld1r {v6.16b}, [x5] // sharp[1] + sub x5, x5, #8 + bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0] + cmtst v2.4s, v1.4s, v2.4s // L != 0 + mul v1.4s, v1.4s, v4.4s // L +.ifc \type, y + dup v15.4s, w2 // vmask[2] +.endif + dup v14.4s, w7 // vmask[1] + mov x16, v2.d[0] + mov x17, v2.d[1] + adds x16, x16, x17 + b.eq 7f // if (!L) continue; + neg v5.16b, v5.16b // -sharp[0] + movrel x16, word_1248 + ushr v12.16b, v1.16b, #4 // H + ld1 {v16.4s}, [x16] + sshl v3.16b, v1.16b, v5.16b // L >> sharp[0] +.ifc \type, y + cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits) +.endif + movi v7.16b, #2 + umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1]) + add v0.16b, v1.16b, v7.16b // L + 2 + umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I + add v0.16b, v0.16b, v0.16b // 2*(L + 2) + cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits) + add v10.16b, v0.16b, 
v11.16b // 2*(L + 2) + limit = E + cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits) + and v13.16b, v13.16b, v2.16b // vmask[0] &= L != 0 + +.ifc \type, y + tst w2, #0x0f + b.eq 2f + // wd16 + bl lpf_\dir\()_16_16_neon + b 8f +2: +.endif + tst w7, #0x0f + b.eq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_16_neon +.else + // wd6 + bl lpf_\dir\()_6_16_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_16_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment x0. + // If the whole function is skipped, increment it here instead. + add x0, x0, x1, lsl #4 +.else +7: +.endif +8: + lsr w6, w6, #4 // vmask[0] >>= 4 + lsr w7, w7, #4 // vmask[1] >>= 4 +.ifc \type, y + lsr w2, w2, #4 // vmask[2] >>= 4 +.endif +.ifc \dir, v + add x0, x0, #16 +.else + // For dir h, x0 is returned incremented +.endif + cbnz w6, 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret x11 +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_1248 + .word 1, 2, 4, 8 +endconst diff --git a/third_party/dav1d/src/arm/64/loopfilter16.S b/third_party/dav1d/src/arm/64/loopfilter16.S new file mode 100644 index 0000000000..d181a3e623 --- /dev/null +++ b/third_party/dav1d/src/arm/64/loopfilter16.S @@ -0,0 +1,925 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels +.macro loop_filter wd +function lpf_8_wd\wd\()_neon + uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) + uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0) + uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1) +.if \wd >= 6 + uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1) + uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1) +.endif +.if \wd >= 8 + uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2) + uabd v7.8h, v27.8h, v26.8h // abs(q3 - q3) +.endif +.if \wd >= 6 + umax v4.8h, v4.8h, v5.8h +.endif + uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2 +.if \wd >= 8 + umax v6.8h, v6.8h, v7.8h +.endif + ushr v3.8h, v3.8h, #1 +.if \wd >= 8 + umax v4.8h, v4.8h, v6.8h +.endif +.if \wd >= 6 + and v4.16b, v4.16b, v14.16b +.endif + umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) + uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + umax v4.8h, v0.8h, v4.8h + cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + and v1.16b, v1.16b, v2.16b // fm + and v1.16b, v1.16b, v13.16b // fm && wd >= 4 +.if \wd >= 6 + and v14.16b, v14.16b, v1.16b // fm && wd > 4 +.endif +.if \wd >= 16 + and v15.16b, v15.16b, v1.16b // fm && wd == 16 +.endif + + mov x16, v1.d[0] + mov x17, v1.d[1] + adds x16, x16, x17 + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: +.if \wd >= 6 + movi v10.8h, #1 + uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) + uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0) + uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0) + dup v9.8h, w9 // bitdepth_min_8 +.if \wd >= 8 + uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) + uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0) +.endif + umax v2.8h, v2.8h, v3.8h + umax v4.8h, v4.8h, v5.8h +.if \wd >= 8 + umax v6.8h, v6.8h, v7.8h +.endif + umax v2.8h, v2.8h, v4.8h + ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8 +.if \wd >= 8 + umax v2.8h, v2.8h, v6.8h +.endif + +.if \wd == 16 + uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0) + uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0) + uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0) +.endif + cmhs v2.8h, v10.8h, v2.8h // flat8in +.if \wd == 16 + uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0) + uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0) + uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0) +.endif + and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 + bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in +.if \wd == 16 + umax v3.8h, v3.8h, v4.8h + umax v5.8h, v5.8h, v6.8h +.endif + mov x16, v1.d[0] + mov x17, v1.d[1] +.if \wd == 16 + umax v7.8h, v7.8h, v8.8h + umax v3.8h, v3.8h, v5.8h + umax v3.8h, v3.8h, v7.8h + cmhs v3.8h, v10.8h, v3.8h // flat8out +.endif + adds x16, x16, x17 +.if \wd == 16 + and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 + and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 + bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out +.endif + b.eq 1f // skip wd == 4 case +.endif + + dup v3.8h, w8 // bitdepth_max + sub v2.8h, v22.8h, v25.8h // p1 - q1 + ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1 + cmhi v0.8h, v0.8h, v12.8h // hev + not v9.16b, v3.16b // - 128 * (1 << 
bitdepth_min_8) + smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1) + smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1) + and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) + sub v2.8h, v24.8h, v23.8h + movi v5.8h, #3 + bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) + mul v2.8h, v2.8h, v5.8h + movi v6.8h, #4 + add v2.8h, v2.8h, v4.8h + smin v2.8h, v2.8h, v3.8h // f = iclip_diff() + smax v2.8h, v2.8h, v9.8h // f = iclip_diff() + sqadd v4.8h, v6.8h, v2.8h // f + 4 + sqadd v5.8h, v5.8h, v2.8h // f + 3 + smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) + smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) + sshr v4.8h, v4.8h, #3 // f1 + sshr v5.8h, v5.8h, #3 // f2 + movi v9.8h, #0 + dup v3.8h, w8 // bitdepth_max + sqadd v2.8h, v23.8h, v5.8h // p0 + f2 + sqsub v6.8h, v24.8h, v4.8h // q0 - f1 + srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1 + smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel() + smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel() + smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel() + smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel() + bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) + bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) + sqadd v2.8h, v22.8h, v4.8h // p1 + f + sqsub v6.8h, v25.8h, v4.8h // q1 - f + smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel() + smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel() + smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel() + smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel() + bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 2f // skip if there's no flat8in + + add v0.8h, v21.8h, v21.8h // p2 * 2 + add v2.8h, v21.8h, v22.8h // p2 + p1 + add v4.8h, v22.8h, v23.8h // p1 + p0 + add v6.8h, v23.8h, v24.8h // p0 + q0 + add v8.8h, v0.8h, v2.8h + add v10.8h, v4.8h, v6.8h + add v12.8h, v24.8h, v25.8h // q0 + q1 + add v8.8h, v8.8h, v10.8h + sub v12.8h, v12.8h, v0.8h + add v10.8h, v25.8h, v26.8h // q1 + q2 + urshr v0.8h, v8.8h, #3 // out p1 + + add v8.8h, v8.8h, v12.8h + sub v10.8h, v10.8h, v2.8h + add v12.8h, v26.8h, v26.8h // q2 + q2 + urshr v1.8h, v8.8h, #3 // out p0 + + add v8.8h, v8.8h, v10.8h + sub v12.8h, v12.8h, v4.8h + urshr v2.8h, v8.8h, #3 // out q0 + + bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) + add v8.8h, v8.8h, v12.8h + bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) + urshr v3.8h, v8.8h, #3 // out q1 + bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) + bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) +.elseif \wd >= 8 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 +.if \wd == 8 + b.eq 8f // skip if there's no flat8in +.else + b.eq 2f // skip if there's no flat8in +.endif + + add v0.8h, v20.8h, v21.8h // p3 + p2 + add v2.8h, v22.8h, v25.8h // p1 + q1 + add v4.8h, v20.8h, v22.8h // p3 + p1 + add v6.8h, v23.8h, v26.8h // p0 + q2 + add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) + add v9.8h, v23.8h, v24.8h // p0 + q0 + add v8.8h, v8.8h, v4.8h // + p3 + p1 + sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 + add v8.8h, v8.8h, v9.8h // + p0 + q0 + sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 + urshr v10.8h, v8.8h, #3 // out p2 + + add v8.8h, v8.8h, v2.8h + add v0.8h, v20.8h, v23.8h // p3 + p0 + add v2.8h, v24.8h, v27.8h // q0 + q3 + urshr v11.8h, v8.8h, #3 // out p1 + + add v8.8h, v8.8h, v6.8h + sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 + add v4.8h, v21.8h, v24.8h // p2 + q0 + add v6.8h, v25.8h, v27.8h // q1 + q3 + urshr v12.8h, v8.8h, 
#3 // out p0 + + add v8.8h, v8.8h, v2.8h + sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 + add v0.8h, v22.8h, v25.8h // p1 + q1 + add v2.8h, v26.8h, v27.8h // q2 + q3 + urshr v13.8h, v8.8h, #3 // out q0 + + add v8.8h, v8.8h, v6.8h + sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 + urshr v0.8h, v8.8h, #3 // out q1 + + add v8.8h, v8.8h, v2.8h + + bit v21.16b, v10.16b, v14.16b + bit v22.16b, v11.16b, v14.16b + bit v23.16b, v12.16b, v14.16b + urshr v1.8h, v8.8h, #3 // out q2 + bit v24.16b, v13.16b, v14.16b + bit v25.16b, v0.16b, v14.16b + bit v26.16b, v1.16b, v14.16b +.endif +2: +.if \wd == 16 + mov x16, v15.d[0] + mov x17, v15.d[1] + adds x16, x16, x17 + b.ne 1f // check if flat8out is needed + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + add v2.8h, v17.8h, v17.8h // p6 + p6 + add v4.8h, v17.8h, v18.8h // p6 + p5 + add v6.8h, v17.8h, v19.8h // p6 + p4 + add v8.8h, v17.8h, v20.8h // p6 + p3 + add v12.8h, v2.8h, v4.8h + add v10.8h, v6.8h, v8.8h + add v6.8h, v17.8h, v21.8h // p6 + p2 + add v12.8h, v12.8h, v10.8h + add v8.8h, v17.8h, v22.8h // p6 + p1 + add v10.8h, v18.8h, v23.8h // p5 + p0 + add v6.8h, v6.8h, v8.8h + add v8.8h, v19.8h, v24.8h // p4 + q0 + add v12.8h, v12.8h, v6.8h + add v10.8h, v10.8h, v8.8h + add v6.8h, v20.8h, v25.8h // p3 + q1 + add v12.8h, v12.8h, v10.8h + sub v6.8h, v6.8h, v2.8h + add v2.8h, v21.8h, v26.8h // p2 + q2 + urshr v0.8h, v12.8h, #4 // out p5 + add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) + sub v2.8h, v2.8h, v4.8h + add v4.8h, v22.8h, v27.8h // p1 + q3 + add v6.8h, v17.8h, v19.8h // p6 + p4 + urshr v1.8h, v12.8h, #4 // out p4 + add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) + sub v4.8h, v4.8h, v6.8h + add v6.8h, v23.8h, v28.8h // p0 + q4 + add v8.8h, v17.8h, v20.8h // p6 + p3 + urshr v2.8h, v12.8h, #4 // out p3 + add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) + sub v6.8h, v6.8h, v8.8h + add v8.8h, v24.8h, v29.8h // q0 + q5 + add v4.8h, v17.8h, v21.8h // p6 + p2 + urshr v3.8h, v12.8h, #4 // out p2 + add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) + sub v8.8h, v8.8h, v4.8h + add v6.8h, v25.8h, v30.8h // q1 + q6 + add v10.8h, v17.8h, v22.8h // p6 + p1 + urshr v4.8h, v12.8h, #4 // out p1 + add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) + sub v6.8h, v6.8h, v10.8h + add v8.8h, v26.8h, v30.8h // q2 + q6 + bif v0.16b, v18.16b, v15.16b // out p5 + add v10.8h, v18.8h, v23.8h // p5 + p0 + urshr v5.8h, v12.8h, #4 // out p0 + add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) + sub v8.8h, v8.8h, v10.8h + add v10.8h, v27.8h, v30.8h // q3 + q6 + bif v1.16b, v19.16b, v15.16b // out p4 + add v18.8h, v19.8h, v24.8h // p4 + q0 + urshr v6.8h, v12.8h, #4 // out q0 + add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) + sub v10.8h, v10.8h, v18.8h + add v8.8h, v28.8h, v30.8h // q4 + q6 + bif v2.16b, v20.16b, v15.16b // out p3 + add v18.8h, v20.8h, v25.8h // p3 + q1 + urshr v7.8h, v12.8h, #4 // out q1 + add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) + sub v18.8h, v8.8h, v18.8h + add v10.8h, v29.8h, v30.8h // q5 + q6 + bif v3.16b, v21.16b, v15.16b // out p2 + add v20.8h, v21.8h, v26.8h // p2 + q2 + urshr v8.8h, v12.8h, #4 // out q2 + add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) + sub v10.8h, v10.8h, v20.8h + add v18.8h, v30.8h, v30.8h // q6 + q6 + bif v4.16b, v22.16b, v15.16b // out p1 + add v20.8h, v22.8h, v27.8h // p1 + q3 + urshr v9.8h, v12.8h, #4 // out q3 + add v12.8h, v12.8h, v10.8h // - (p2 + 
q2) + (q5 + q6) + sub v18.8h, v18.8h, v20.8h + bif v5.16b, v23.16b, v15.16b // out p0 + urshr v10.8h, v12.8h, #4 // out q4 + add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) + urshr v11.8h, v12.8h, #4 // out q5 + bif v6.16b, v24.16b, v15.16b // out q0 + bif v7.16b, v25.16b, v15.16b // out q1 + bif v8.16b, v26.16b, v15.16b // out q2 + bif v9.16b, v27.16b, v15.16b // out q3 + bif v10.16b, v28.16b, v15.16b // out q4 + bif v11.16b, v29.16b, v15.16b // out q5 +.endif + + mov x14, #0 + ret +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + mov x14, #(1 << 6) + ret +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + mov x14, #(1 << 4) + ret +.endif +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_8_wd16 + bl lpf_8_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: +.endm + +.macro lpf_8_wd8 + bl lpf_8_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: +.endm + +.macro lpf_8_wd6 + bl lpf_8_wd6_neon + cbz x14, 1f + ret x15 +1: +.endm + +.macro lpf_8_wd4 + bl lpf_8_wd4_neon + cbz x14, 1f + ret x15 +1: +.endm + +function lpf_v_4_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + + lpf_8_wd4 + + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_4_8_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #2 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd4 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + ret x15 +endfunc + +function lpf_v_6_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + + lpf_8_wd6 + + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_6_8_neon + mov x15, x30 + sub x16, x0, #8 + add x0, x16, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + add x0, x0, #8 + + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd6 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 
{v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + ret x15 +endfunc + +function lpf_v_8_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + + lpf_8_wd8 + + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + st1 {v21.8h}, [x16], x1 // p2 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v25.8h}, [x0], x1 // q1 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + ret x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_8_8_neon + mov x15, x30 + sub x16, x0, #8 + add x0, x16, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + add x0, x0, #8 + + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd8 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #8 + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v20.8h}, [x16], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x16], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x16], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x16], x1 + st1 {v27.8h}, [x0], x1 + add x0, x0, #8 + ret x15 +8: + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + ret x15 +endfunc + +function lpf_v_16_8_neon + mov x15, x30 + + sub x16, x0, x1, lsl #3 + add x16, x16, x1 + ld1 {v17.8h}, [x16], x1 // p6 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v18.8h}, [x16], x1 // p5 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v19.8h}, [x16], x1 // p4 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v20.8h}, [x16], x1 // p3 + ld1 {v27.8h}, [x0], x1 // q3 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v28.8h}, [x0], x1 // q4 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v29.8h}, [x0], x1 // q5 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v30.8h}, [x0], x1 // q6 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + lpf_8_wd16 + + sub x16, x0, x1, lsl #2 + sub x16, x16, x1, lsl #1 + st1 {v0.8h}, [x16], x1 // p5 + st1 {v6.8h}, [x0], x1 // q0 + st1 {v1.8h}, [x16], x1 // p4 + st1 {v7.8h}, [x0], x1 // q1 + st1 {v2.8h}, [x16], x1 // p3 + st1 {v8.8h}, [x0], x1 // q2 + st1 {v3.8h}, [x16], x1 // p2 + st1 {v9.8h}, [x0], x1 // q3 + st1 {v4.8h}, [x16], x1 // p1 + st1 {v10.8h}, [x0], x1 // q4 + st1 {v5.8h}, [x16], x1 // p0 + st1 {v11.8h}, [x0], x1 // q5 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + ret x15 +7: + sub x16, x0, x1 + sub x16, x16, x1, lsl #1 + st1 {v21.8h}, [x16], x1 // p2 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v25.8h}, [x0], x1 // q1 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + ret x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], 
x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + ret x15 +endfunc + +function lpf_h_16_8_neon + mov x15, x30 + sub x16, x0, #16 + ld1 {v16.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v17.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v18.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v19.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + ld1 {v20.8h}, [x16], x1 + ld1 {v28.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v29.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v30.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v31.8h}, [x0], x1 + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + lpf_8_wd16 + + sub x0, x0, x1, lsl #3 + sub x16, x0, #16 + + transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 + transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 + + st1 {v16.8h}, [x16], x1 + st1 {v6.8h}, [x0], x1 + st1 {v17.8h}, [x16], x1 + st1 {v7.8h}, [x0], x1 + st1 {v0.8h}, [x16], x1 + st1 {v8.8h}, [x0], x1 + st1 {v1.8h}, [x16], x1 + st1 {v9.8h}, [x0], x1 + st1 {v2.8h}, [x16], x1 + st1 {v10.8h}, [x0], x1 + st1 {v3.8h}, [x16], x1 + st1 {v11.8h}, [x0], x1 + st1 {v4.8h}, [x16], x1 + st1 {v30.8h}, [x0], x1 + st1 {v5.8h}, [x16], x1 + st1 {v31.8h}, [x0], x1 + ret x15 + +7: + sub x16, x0, x1, lsl #3 + sub x16, x16, #8 + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v20.8h}, [x16], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x16], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x16], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x16], x1 + st1 {v27.8h}, [x0], x1 + add x0, x0, #8 + ret x15 +8: + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + ret x15 +endfunc + +// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w, +// const int bitdepth_max) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 + mov x11, x30 + mov w8, w7 // bitdepth_max + clz w9, w8 + mov w10, #24 + sub w9, w10, w9 // bitdepth_min_8 + stp d8, d9, [sp, #-0x40]! 
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp w6, w7, [x2] // vmask[0], vmask[1] +.ifc \type, y + ldr w2, [x2, #8] // vmask[2] +.endif + add x5, x5, #128 // Move to sharp part of lut +.ifc \type, y + orr w7, w7, w2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub x4, x3, x4, lsl #2 +.else + sub x3, x3, #4 + lsl x4, x4, #2 +.endif + orr w6, w6, w7 // vmask[0] |= vmask[1] + +1: + tst w6, #0x03 +.ifc \dir, v + ld1 {v0.8b}, [x4], #8 + ld1 {v1.8b}, [x3], #8 +.else + ld2 {v0.s,v1.s}[0], [x3], x4 + ld2 {v0.s,v1.s}[1], [x3], x4 +.endif + b.eq 7f // if (!(vm & bits)) continue; + + ld1r {v5.8b}, [x5] // sharp[0] + add x5, x5, #8 + movi v2.2s, #0xff + dup v13.2s, w6 // vmask[0] + dup v31.8h, w9 // bitdepth_min_8 + + and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word + and v1.8b, v1.8b, v2.8b + cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0] + movi v4.8b, #1 + ld1r {v6.8b}, [x5] // sharp[1] + sub x5, x5, #8 + bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0] + cmtst v2.2s, v1.2s, v2.2s // L != 0 + mul v1.2s, v1.2s, v4.2s // L +.ifc \type, y + dup v15.2s, w2 // vmask[2] +.endif + dup v14.2s, w7 // vmask[1] + mov x16, v2.d[0] + cmp x16, #0 + b.eq 7f // if (!L) continue; + neg v5.8b, v5.8b // -sharp[0] + movrel x16, word_12 + ushr v12.8b, v1.8b, #4 // H + ld1 {v16.2s}, [x16] + sshl v3.8b, v1.8b, v5.8b // L >> sharp[0] +.ifc \type, y + cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits) +.endif + movi v7.8b, #2 + umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1]) + add v0.8b, v1.8b, v7.8b // L + 2 + umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I + add v0.8b, v0.8b, v0.8b // 2*(L + 2) + cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits) + uxtl v12.8h, v12.8b + add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E + cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits) + uxtl v11.8h, v11.8b + uxtl v10.8h, v10.8b + and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0 + sxtl v14.8h, v14.8b + sxtl v13.8h, v13.8b +.ifc \type, y + sxtl v15.8h, v15.8b +.endif + ushl v12.8h, v12.8h, v31.8h + ushl v11.8h, v11.8h, v31.8h + ushl v10.8h, v10.8h, v31.8h + +.ifc \type, y + tst w2, #0x03 + b.eq 2f + // wd16 + bl lpf_\dir\()_16_8_neon + b 8f +2: +.endif + tst w7, #0x03 + b.eq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_8_neon +.else + // wd6 + bl lpf_\dir\()_6_8_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_8_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment x0. + // If the whole function is skipped, increment it here instead. + add x0, x0, x1, lsl #3 +.else +7: +.endif +8: + lsr w6, w6, #2 // vmask[0] >>= 2 + lsr w7, w7, #2 // vmask[1] >>= 2 +.ifc \type, y + lsr w2, w2, #2 // vmask[2] >>= 2 +.endif +.ifc \dir, v + add x0, x0, #16 +.else + // For dir h, x0 is returned incremented +.endif + cbnz w6, 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret x11 +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_12 + .word 1, 2 +endconst diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S new file mode 100644 index 0000000000..a598b72b03 --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration.S @@ -0,0 +1,1336 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride, +// const pixel (*left)[4], const pixel *lpf, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter7_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + mov w17, #(1 << 14) - (1 << 2) + dup v30.8h, w17 + movi v31.8h, #8, lsl #8 + + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_8bpc_neon + add x3, x3, x1 // lpf += stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_8bpc_neon + subs w5, w5, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + mov x13, x14 // t2 + subs w5, w5, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_8bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_loop_7) + tst w7, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + bl wiener_filter7_hv_8bpc_neon + bl wiener_filter7_hv_8bpc_neon +L(v1_7): + bl wiener_filter7_v_8bpc_neon + + mov sp, x29 + ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(no_top_7): + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_8bpc_neon + subs w5, w5, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w5, w5, #1 // h-- + mov x13, x14 // t2 + b.eq L(v2_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_8bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_8bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_8bpc_neon +L(v2_7): + bl wiener_filter7_v_8bpc_neon + b L(v1_7) +endfunc + + +function wiener_filter7_h_8bpc_neon + stp x3, x4, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #3 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + b 2f + +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. 
+ sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w4, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -6 + ldr b28, [x3, w17, sxtw] + sub x6, x6, w4, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + shl v22.8h, v18.8h, #7 + mul v6.8h, v18.8h, v0.h[3] + mla v6.8h, v19.8h, v0.h[4] + mla v6.8h, v20.8h, v0.h[5] + mla v6.8h, v21.8h, v0.h[6] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + shl v23.8h, v18.8h, #7 + mul v7.8h, v18.8h, v0.h[3] + mla v7.8h, v19.8h, v0.h[4] + mla v7.8h, v20.8h, v0.h[5] + mla v7.8h, v21.8h, v0.h[6] + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + subs w4, w4, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w7, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldr x14, [sp, #16] + ldp x3, x4, [sp], #32 + ret +endfunc + +function wiener_filter7_v_8bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! 
+ stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x4, [sp, #48] +1: + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + + ld1 {v18.8h, v19.8h}, [x10], #32 + add v24.8h, v24.8h, v20.8h + ld1 {v26.8h, v27.8h}, [x14], #32 + + ld1 {v16.8h, v17.8h}, [x9], #32 + add v28.8h, v26.8h, v18.8h + ld1 {v22.8h, v23.8h}, [x12], #32 + + add v16.8h, v26.8h, v16.8h + add v25.8h, v25.8h, v21.8h + + smull v2.4s, v22.4h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal v2.4s, v28.4h, v1.h[5] + smlal v2.4s, v16.4h, v1.h[6] + add v29.8h, v27.8h, v19.8h + smull2 v3.4s, v22.8h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal2 v3.4s, v28.8h, v1.h[5] + smlal2 v3.4s, v16.8h, v1.h[6] + add v17.8h, v27.8h, v17.8h + smull v4.4s, v23.4h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[4] + smlal v4.4s, v29.4h, v1.h[5] + smlal v4.4s, v17.4h, v1.h[6] + smull2 v5.4s, v23.8h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal2 v5.4s, v29.8h, v1.h[5] + smlal2 v5.4s, v17.8h, v1.h[6] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v3.4h, v4.4s, #11 + sqrshrun2 v3.8h, v5.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun2 v2.16b, v3.8h + subs w4, w4, #16 + st1 {v2.16b}, [x0], #16 + b.gt 1b + + ldp x0, x4, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_8bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x4, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #3 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + b 2f +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w4, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
+ movrel x6, right_ext_mask, -6 + ldr b28, [x3, w17, sxtw] + sub x6, x6, w4, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + shl v22.8h, v18.8h, #7 + mul v6.8h, v18.8h, v0.h[3] + mla v6.8h, v19.8h, v0.h[4] + mla v6.8h, v20.8h, v0.h[5] + mla v6.8h, v21.8h, v0.h[6] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + shl v23.8h, v18.8h, #7 + mul v7.8h, v18.8h, v0.h[3] + mla v7.8h, v19.8h, v0.h[4] + mla v7.8h, v20.8h, v0.h[5] + mla v7.8h, v21.8h, v0.h[6] + + ld1 {v20.8h, v21.8h}, [x11], #32 + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + ld1 {v26.8h, v27.8h}, [x13], #32 + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + ld1 {v18.8h, v19.8h}, [x10], #32 + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + ld1 {v28.8h, v29.8h}, [x14], #32 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + ld1 {v16.8h, v17.8h}, [x9], #32 + add v26.8h, v20.8h, v26.8h + + ld1 {v24.8h, v25.8h}, [x12], #32 + add v28.8h, v18.8h, v28.8h + + add v16.8h, v16.8h, v6.8h + add v27.8h, v21.8h, v27.8h + + smull v18.4s, v24.4h, v1.h[3] + smlal v18.4s, v26.4h, v1.h[4] + smlal v18.4s, v28.4h, v1.h[5] + smlal v18.4s, v16.4h, v1.h[6] + add v29.8h, v19.8h, v29.8h + smull2 v19.4s, v24.8h, v1.h[3] + smlal2 v19.4s, v26.8h, v1.h[4] + smlal2 v19.4s, v28.8h, v1.h[5] + smlal2 v19.4s, v16.8h, v1.h[6] + add v17.8h, v17.8h, v7.8h + smull v20.4s, v25.4h, v1.h[3] + smlal v20.4s, v27.4h, v1.h[4] + smlal v20.4s, v29.4h, v1.h[5] + smlal v20.4s, v17.4h, v1.h[6] + smull2 v21.4s, v25.8h, v1.h[3] + smlal2 v21.4s, v27.8h, v1.h[4] + smlal2 v21.4s, v29.8h, v1.h[5] + smlal2 v21.4s, v17.8h, v1.h[6] + sqrshrun v18.4h, v18.4s, #11 + sqrshrun2 v18.8h, v19.4s, #11 + sqrshrun v19.4h, v20.4s, #11 + sqrshrun2 v19.8h, v21.4s, #11 + st1 {v6.8h, v7.8h}, [x15], #32 + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h + subs w4, w4, #16 + + st1 {v18.16b}, [x0], #16 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w7, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x4, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride, +// const pixel (*left)[4], const pixel *lpf, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter5_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + mov w17, #(1 << 14) - (1 << 2) + dup v30.8h, w17 + movi v31.8h, #8, lsl #8 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_8bpc_neon + add x3, x3, x1 // lpf += stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_8bpc_neon + subs w5, w5, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_8bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_loop_5) + tst w7, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + bl wiener_filter5_hv_8bpc_neon + bl wiener_filter5_hv_8bpc_neon +L(end_5): + + mov sp, x29 + ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(no_top_5): + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_8bpc_neon + subs w5, w5, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_8bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_8bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_8bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_8bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_8bpc_neon + stp x3, x4, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + b 2f + +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. 
+ + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w4, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -4 + ldr b28, [x3, w17, sxtw] + sub x6, x6, w4, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + shl v22.8h, v17.8h, #7 + mul v6.8h, v17.8h, v0.h[3] + mla v6.8h, v18.8h, v0.h[4] + mla v6.8h, v19.8h, v0.h[5] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + shl v23.8h, v17.8h, #7 + mul v7.8h, v17.8h, v0.h[3] + mla v7.8h, v18.8h, v0.h[4] + mla v7.8h, v19.8h, v0.h[5] + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + subs w4, w4, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w7, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldr x14, [sp, #16] + ldp x3, x4, [sp], #32 + ret +endfunc + +function wiener_filter5_v_8bpc_neon + stp x11, x12, [sp, #-48]! + stp x13, x14, [sp, #16] + stp x0, x4, [sp, #32] +1: + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + ld1 {v16.8h, v17.8h}, [x11], #32 + + add v24.8h, v22.8h, v18.8h + ld1 {v20.8h, v21.8h}, [x13], #32 + add v16.8h, v22.8h, v16.8h + add v25.8h, v23.8h, v19.8h + + smull v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal v2.4s, v16.4h, v1.h[5] + add v17.8h, v23.8h, v17.8h + smull2 v3.4s, v20.8h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal2 v3.4s, v16.8h, v1.h[5] + smull v4.4s, v21.4h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[4] + smlal v4.4s, v17.4h, v1.h[5] + smull2 v5.4s, v21.8h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal2 v5.4s, v17.8h, v1.h[5] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v3.4h, v4.4s, #11 + sqrshrun2 v3.8h, v5.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun2 v2.16b, v3.8h + subs w4, w4, #16 + st1 {v2.16b}, [x0], #16 + b.gt 1b + + ldp x0, x4, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + + ret +endfunc + +function wiener_filter5_hv_8bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! 
+ stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x4, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + b 2f +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 2x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w4, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -4 + ldr b28, [x3, w17, sxtw] + sub x6, x6, w4, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + shl v22.8h, v17.8h, #7 + mul v6.8h, v17.8h, v0.h[3] + mla v6.8h, v18.8h, v0.h[4] + mla v6.8h, v19.8h, v0.h[5] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + shl v23.8h, v17.8h, #7 + mul v7.8h, v17.8h, v0.h[3] + mla v7.8h, v18.8h, v0.h[4] + mla v7.8h, v19.8h, v0.h[5] + + ld1 {v18.8h, v19.8h}, [x12], #32 + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + ld1 {v24.8h, v25.8h}, [x14], #32 + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + ld1 {v16.8h, v17.8h}, [x11], #32 + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + ld1 {v20.8h, v21.8h}, [x13], #32 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + add v24.8h, v24.8h, v18.8h + add v16.8h, v16.8h, v6.8h + + smull v18.4s, v20.4h, v1.h[3] + smlal v18.4s, v24.4h, v1.h[4] + smlal v18.4s, v16.4h, v1.h[5] + add v25.8h, v25.8h, v19.8h + smull2 v19.4s, v20.8h, v1.h[3] + smlal2 v19.4s, v24.8h, v1.h[4] + smlal2 v19.4s, v16.8h, v1.h[5] + add v17.8h, v17.8h, v7.8h + smull v20.4s, v21.4h, v1.h[3] + smlal v20.4s, v25.4h, v1.h[4] + smlal v20.4s, v17.4h, v1.h[5] + smull2 v21.4s, v21.8h, v1.h[3] + smlal2 v21.4s, v25.8h, v1.h[4] + smlal2 v21.4s, v17.8h, v1.h[5] + sqrshrun v18.4h, v18.4s, #11 + sqrshrun2 v18.8h, v19.4s, #11 + sqrshrun v19.4h, v20.4s, #11 + sqrshrun2 v19.8h, v21.4s, #11 + st1 {v6.8h, v7.8h}, [x15], #32 + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h 
+ subs w4, w4, #16 + + st1 {v18.16b}, [x0], #16 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w7, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x4, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_8bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add w13, w5, #7 + bic w13, w13, #7 + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w13, #8 + sub x4, x4, w13, uxtw + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + sub x12, x12, #2 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #2 + + +1: // Loop vertically + ld1 {v0.16b}, [x3], #16 + ld1 {v4.16b}, [x12], #16 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v1.s}[3], [x2], #4 + // Move x3/x12 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + sub x12, x12, #2 + ld1 {v5.s}[3], [x2], #4 + ext v0.16b, v1.16b, v0.16b, #14 + ext v4.16b, v5.16b, v4.16b, #14 + b 2f +0: + // !LR_HAVE_LEFT, fill v1 with the leftmost byte + // and shift v0 to have 2x the first byte at the front. + dup v1.16b, v0.b[0] + dup v5.16b, v4.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + sub x12, x12, #2 + ext v0.16b, v1.16b, v0.16b, #14 + ext v4.16b, v5.16b, v4.16b, #14 + +2: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr b30, [x3, w13, sxtw] + ldr b31, [x12, w13, sxtw] + // Fill v30/v31 with the right padding pixel + dup v30.16b, v30.b[0] + dup v31.16b, v31.b[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + + // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in v0/4.b[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + bit v4.16b, v31.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + +4: // Loop horizontally + ext v16.16b, v0.16b, v0.16b, #1 + ext v17.16b, v0.16b, v0.16b, #2 + ext v18.16b, v4.16b, v4.16b, #1 + ext v19.16b, v4.16b, v4.16b, #2 + uaddl v3.8h, v0.8b, v16.8b + uaddw v3.8h, v3.8h, v17.8b + uaddl v7.8h, v4.8b, v18.8b + uaddw v7.8h, v7.8h, v19.8b + + ext v20.16b, v1.16b, v2.16b, #2 + ext v21.16b, v1.16b, v2.16b, #4 + ext v22.16b, v5.16b, v6.16b, #2 + ext v23.16b, v5.16b, v6.16b, #4 + + uaddl v26.4s, v1.4h, v20.4h + uaddl2 v27.4s, v1.8h, v20.8h + uaddw v26.4s, v26.4s, v21.4h + uaddw2 v27.4s, v27.4s, v21.8h + + uaddl v28.4s, v5.4h, v22.4h + uaddl2 v29.4s, v5.8h, v22.8h + uaddw v28.4s, v28.4s, v23.4h + uaddw2 v29.4s, v29.4s, v23.8h + + subs w5, w5, #8 + + st1 {v3.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v26.4s,v27.4s}, [x0], #32 + st1 {v28.4s,v29.4s}, [x10], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8b}, [x3], #8 + ld1 {v7.8b}, [x12], #8 + mov v1.16b, v2.16b + mov v5.16b, v6.16b + ext v0.16b, v0.16b, v3.16b, #8 + ext v4.16b, v4.16b, v7.16b, #8 + umull v2.8h, v3.8b, v3.8b + umull v6.8h, v7.8b, v7.8b + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_8bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add w13, w5, #7 + bic w13, w13, #7 + sub x9, x9, w13, uxtw #1 + add w13, w13, #8 + sub x4, x4, w13, uxtw + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #3 + sub x12, x12, #3 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #3 + +1: // Loop vertically + ld1 {v0.16b}, [x3], #16 + ld1 {v4.16b}, [x12], #16 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v1.s}[3], [x2], #4 + // Move x3/x12 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #3 + sub x12, x12, #3 + ld1 {v5.s}[3], [x2], #4 + ext v0.16b, v1.16b, v0.16b, #13 + ext v4.16b, v5.16b, v4.16b, #13 + b 2f +0: + // !LR_HAVE_LEFT, fill v1 with the leftmost byte + // and shift v0 to have 3x the first byte at the front. 
+ dup v1.16b, v0.b[0] + dup v5.16b, v4.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x3, x3, #3 + sub x12, x12, #3 + ext v0.16b, v1.16b, v0.16b, #13 + ext v4.16b, v5.16b, v4.16b, #13 + +2: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr b30, [x3, w13, sxtw] + ldr b31, [x12, w13, sxtw] + // Fill v30/v31 with the right padding pixel + dup v30.16b, v30.b[0] + dup v31.16b, v31.b[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -1 + sub x13, x13, w5, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + bit v4.16b, v31.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + +4: // Loop horizontally + ext v16.16b, v0.16b, v0.16b, #1 + ext v17.16b, v0.16b, v0.16b, #2 + ext v18.16b, v0.16b, v0.16b, #3 + ext v19.16b, v0.16b, v0.16b, #4 + ext v20.16b, v4.16b, v4.16b, #1 + ext v21.16b, v4.16b, v4.16b, #2 + ext v22.16b, v4.16b, v4.16b, #3 + ext v23.16b, v4.16b, v4.16b, #4 + uaddl v3.8h, v0.8b, v16.8b + uaddl v24.8h, v17.8b, v18.8b + uaddl v7.8h, v4.8b, v20.8b + uaddw v3.8h, v3.8h, v19.8b + uaddl v25.8h, v21.8b, v22.8b + uaddw v7.8h, v7.8h, v23.8b + add v3.8h, v3.8h, v24.8h + add v7.8h, v7.8h, v25.8h + + ext v16.16b, v1.16b, v2.16b, #2 + ext v17.16b, v1.16b, v2.16b, #4 + ext v18.16b, v1.16b, v2.16b, #6 + ext v19.16b, v1.16b, v2.16b, #8 + ext v20.16b, v5.16b, v6.16b, #2 + ext v21.16b, v5.16b, v6.16b, #4 + ext v22.16b, v5.16b, v6.16b, #6 + ext v23.16b, v5.16b, v6.16b, #8 + + uaddl v26.4s, v1.4h, v16.4h + uaddl2 v27.4s, v1.8h, v16.8h + uaddl v16.4s, v17.4h, v18.4h + uaddl2 v17.4s, v17.8h, v18.8h + uaddl v28.4s, v5.4h, v20.4h + uaddl2 v29.4s, v5.8h, v20.8h + uaddw v26.4s, v26.4s, v19.4h + uaddw2 v27.4s, v27.4s, v19.8h + uaddl v20.4s, v21.4h, v22.4h + uaddl2 v21.4s, v21.8h, v22.8h + uaddw v28.4s, v28.4s, v23.4h + uaddw2 v29.4s, v29.4s, v23.8h + add v26.4s, v26.4s, v16.4s + add v27.4s, v27.4s, v17.4s + add v28.4s, v28.4s, v20.4s + add v29.4s, v29.4s, v21.4s + + subs w5, w5, #8 + + st1 {v3.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v26.4s,v27.4s}, [x0], #32 + st1 {v28.4s,v29.4s}, [x10], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8b}, [x3], #8 + ld1 {v7.8b}, [x12], #8 + mov v1.16b, v2.16b + mov v5.16b, v6.16b + ext v0.16b, v0.16b, v3.16b, #8 + ext v4.16b, v4.16b, v7.16b, #8 + umull v2.8h, v3.8b, v3.8b + umull v6.8h, v7.8b, v7.8b + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +endfunc + +sgr_funcs 8 diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S new file mode 100644 index 0000000000..8954e604cf --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration16.S @@ -0,0 +1,1419 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], const pixel *lpf, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter7_16bpc_neon, export=1 + ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-32]! 
+ stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + dup v28.8h, w8 // bitdepth_max + clz w8, w8 + movi v30.4s, #1 + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w8 // -round_bits_h + dup v27.4s, w11 // -round_bits_v + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x1 // lpf += stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + mov x13, x14 // t2 + subs w5, w5, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_loop_7) + tst w7, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + bl wiener_filter7_hv_16bpc_neon + bl wiener_filter7_hv_16bpc_neon +L(v1_7): + bl wiener_filter7_v_16bpc_neon + + mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(no_top_7): + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x13, x14 // t2 + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_16bpc_neon +L(v2_7): + bl wiener_filter7_v_16bpc_neon + b L(v1_7) +endfunc + + +function wiener_filter7_h_16bpc_neon + stp x3, x4, [sp, #-32]! 
+ str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + b 2f + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w4, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. 
+ ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v16.4s, v18.4h, v0.h[3] + smlal v16.4s, v19.4h, v0.h[2] + smlal v16.4s, v20.4h, v0.h[1] + smlal v16.4s, v21.4h, v0.h[0] + smull2 v17.4s, v18.8h, v0.h[3] + smlal2 v17.4s, v19.8h, v0.h[2] + smlal2 v17.4s, v20.8h, v0.h[1] + smlal2 v17.4s, v21.8h, v0.h[0] + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w4, w4, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldr x14, [sp, #16] + ldp x3, x4, [sp], #32 + ret +endfunc + +function wiener_filter7_v_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! 
+ stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x4, [sp, #48] +1: + ld1 {v16.8h, v17.8h}, [x9], #32 + ld1 {v18.8h, v19.8h}, [x10], #32 + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v22.8h, v23.8h}, [x12], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + ld1 {v6.8h, v7.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[4] + smlal v2.4s, v18.4h, v0.h[5] + smlal v2.4s, v20.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[7] + smlal v2.4s, v24.4h, v0.h[6] + smlal v2.4s, v6.4h, v0.h[5] + smlal v2.4s, v6.4h, v0.h[4] + smull2 v3.4s, v16.8h, v0.h[4] + smlal2 v3.4s, v18.8h, v0.h[5] + smlal2 v3.4s, v20.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[7] + smlal2 v3.4s, v24.8h, v0.h[6] + smlal2 v3.4s, v6.8h, v0.h[5] + smlal2 v3.4s, v6.8h, v0.h[4] + smull v4.4s, v17.4h, v0.h[4] + smlal v4.4s, v19.4h, v0.h[5] + smlal v4.4s, v21.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[7] + smlal v4.4s, v25.4h, v0.h[6] + smlal v4.4s, v7.4h, v0.h[5] + smlal v4.4s, v7.4h, v0.h[4] + smull2 v5.4s, v17.8h, v0.h[4] + smlal2 v5.4s, v19.8h, v0.h[5] + smlal2 v5.4s, v21.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[7] + smlal2 v5.4s, v25.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + smlal2 v5.4s, v7.8h, v0.h[4] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + subs w4, w4, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x4, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x4, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w4, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
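+        // v26 is filled with the last valid pixel; the masks loaded from
+        // right_ext_mask then make the bit (bitwise insert if true)
+        // instructions overwrite only the lanes from h[w+3] onwards, leaving
+        // the valid pixels in v2-v4 untouched.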
+ movrel x6, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v24.4s, v18.4h, v0.h[3] + smlal v24.4s, v19.4h, v0.h[2] + smlal v24.4s, v20.4h, v0.h[1] + smlal v24.4s, v21.4h, v0.h[0] + smull2 v25.4s, v18.8h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[2] + smlal2 v25.4s, v20.8h, v0.h[1] + smlal2 v25.4s, v21.8h, v0.h[0] + + ld1 {v16.8h, v17.8h}, [x9], #32 + + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x10], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x11], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x12], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + ld1 {v24.8h, v25.8h}, [x13], #32 + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + ld1 {v8.8h, v9.8h}, [x14], #32 + + smull v1.4s, v16.4h, v0.h[4] + smlal v1.4s, v18.4h, v0.h[5] + smlal v1.4s, v20.4h, v0.h[6] + smlal v1.4s, v22.4h, v0.h[7] + smlal v1.4s, v24.4h, v0.h[6] + smlal v1.4s, v8.4h, v0.h[5] + smlal v1.4s, v6.4h, v0.h[4] + smull2 v5.4s, v16.8h, v0.h[4] + smlal2 v5.4s, v18.8h, v0.h[5] + smlal2 v5.4s, v20.8h, v0.h[6] + smlal2 v5.4s, v22.8h, v0.h[7] + smlal2 v5.4s, v24.8h, v0.h[6] + smlal2 v5.4s, v8.8h, v0.h[5] + smlal2 v5.4s, v6.8h, v0.h[4] + smull v26.4s, v17.4h, v0.h[4] + smlal v26.4s, v19.4h, v0.h[5] + smlal v26.4s, v21.4h, v0.h[6] + smlal v26.4s, v23.4h, v0.h[7] + smlal v26.4s, v25.4h, v0.h[6] + smlal v26.4s, v9.4h, v0.h[5] + smlal v26.4s, v7.4h, v0.h[4] + smull2 v16.4s, v17.8h, v0.h[4] + smlal2 v16.4s, v19.8h, v0.h[5] + smlal2 v16.4s, v21.8h, v0.h[6] + smlal2 v16.4s, v23.8h, v0.h[7] + smlal2 v16.4s, v25.8h, v0.h[6] + smlal2 v16.4s, v9.8h, v0.h[5] + smlal2 v16.4s, v7.8h, v0.h[4] + srshl v1.4s, v1.4s, v27.4s // -round_bits_v + srshl v5.4s, v5.4s, v27.4s + srshl v26.4s, v26.4s, v27.4s + srshl v16.4s, v16.4s, v27.4s + sqxtun v18.4h, v1.4s + sqxtun2 v18.8h, v5.4s + sqxtun v19.4h, v26.4s + sqxtun2 v19.8h, v16.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v18.8h, v18.8h, v28.8h // bitdepth_max + umin v19.8h, v19.8h, v28.8h + subs w4, w4, #16 + + st1 {v18.8h, v19.8h}, [x0], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. 
+ b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x4, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], const pixel *lpf, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter5_16bpc_neon, export=1 + ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-32]! + stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + dup v28.8h, w8 // bitdepth_max + clz w8, w8 + movi v30.4s, #1 + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w8 // -round_bits_h + dup v27.4s, w11 // -round_bits_v + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x1 // lpf += stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_loop_5) + tst w7, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + bl wiener_filter5_hv_16bpc_neon + bl wiener_filter5_hv_16bpc_neon +L(end_5): + + mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(no_top_5): + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_16bpc_neon + subs w5, w5, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_16bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_16bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_16bpc_neon + stp x3, x4, [sp, #-32]! 
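+        // In effect this helper computes, for each x, the symmetric 5-tap sum
+        //   h[1]*(p[x-2]+p[x+2]) + h[2]*(p[x-1]+p[x+1]) + h[3]*p[x]
+        // into the intermediate row at x14. x3 (src), x4 (w) and x14 are all
+        // advanced while filtering, so the caller's values are saved here.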
+ str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w4, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x6, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v16.4s, v17.4h, v0.h[3] + smlal v16.4s, v18.4h, v0.h[2] + smlal v16.4s, v19.4h, v0.h[1] + smull2 v17.4s, v17.8h, v0.h[3] + smlal2 v17.4s, v18.8h, v0.h[2] + smlal2 v17.4s, v19.8h, v0.h[1] + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w4, w4, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
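+        // Row done; restore x14/x3/x4 below so the caller's row pointer,
+        // src pointer and width are left unmodified.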
+ +0: + ldr x14, [sp, #16] + ldp x3, x4, [sp], #32 + ret +endfunc + +function wiener_filter5_v_16bpc_neon + stp x11, x12, [sp, #-48]! + stp x13, x14, [sp, #16] + stp x0, x4, [sp, #32] +1: + ld1 {v16.8h, v17.8h}, [x11], #32 + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v20.8h, v21.8h}, [x13], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[5] + smlal v2.4s, v18.4h, v0.h[6] + smlal v2.4s, v20.4h, v0.h[7] + smlal v2.4s, v22.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[5] + smull2 v3.4s, v16.8h, v0.h[5] + smlal2 v3.4s, v18.8h, v0.h[6] + smlal2 v3.4s, v20.8h, v0.h[7] + smlal2 v3.4s, v22.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[5] + smull v4.4s, v17.4h, v0.h[5] + smlal v4.4s, v19.4h, v0.h[6] + smlal v4.4s, v21.4h, v0.h[7] + smlal v4.4s, v23.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[5] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + + subs w4, w4, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x4, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + + ret +endfunc + +function wiener_filter5_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! + stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x4, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 2x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w4, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w4, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. 
+ movrel x6, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x6, x6, w4, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v24.4s, v17.4h, v0.h[3] + smlal v24.4s, v18.4h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[1] + smull2 v25.4s, v17.8h, v0.h[3] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal2 v25.4s, v19.8h, v0.h[1] + + ld1 {v16.8h, v17.8h}, [x11], #32 + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x12], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x13], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x14], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + smull v8.4s, v16.4h, v0.h[5] + smlal v8.4s, v18.4h, v0.h[6] + smlal v8.4s, v20.4h, v0.h[7] + smlal v8.4s, v22.4h, v0.h[6] + smlal v8.4s, v6.4h, v0.h[5] + smull2 v9.4s, v16.8h, v0.h[5] + smlal2 v9.4s, v18.8h, v0.h[6] + smlal2 v9.4s, v20.8h, v0.h[7] + smlal2 v9.4s, v22.8h, v0.h[6] + smlal2 v9.4s, v6.8h, v0.h[5] + smull v1.4s, v17.4h, v0.h[5] + smlal v1.4s, v19.4h, v0.h[6] + smlal v1.4s, v21.4h, v0.h[7] + smlal v1.4s, v23.4h, v0.h[6] + smlal v1.4s, v7.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + srshl v8.4s, v8.4s, v27.4s // -round_bits_v + srshl v9.4s, v9.4s, v27.4s + srshl v1.4s, v1.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v8.4h, v8.4s + sqxtun2 v8.8h, v9.4s + sqxtun v9.4h, v1.4s + sqxtun2 v9.8h, v5.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v8.8h, v8.8h, v28.8h // bitdepth_max + umin v9.8h, v9.8h, v28.8h + + subs w4, w4, #16 + + st1 {v8.8h, v9.8h}, [x0], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
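+        // The epilogue below reloads the shifted register backup (rotating the
+        // t-row pointers as described at the top of this function) and steps
+        // x3 (src) and x0 (dst) down one row for the caller.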
+ +0: + ldp x3, x4, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add w13, w5, #7 + bic w13, w13, #7 + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w13, #8 + sub x4, x4, w13, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + sub x12, x12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #4 + + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + sub x12, x12, #4 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 2x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + sub x12, x12, #4 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + +2: + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + + // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
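+        // v30/v31 (the replicated right-edge pixel for each of the two rows)
+        // were already set up before the 3: label; only the lane-select masks
+        // are built here. The loop at 4: below then produces a sliding
+        // 3-pixel sum and sum of squares per column, for both rows at once.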
+ + // Insert padding in v0/1.h[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b + +4: // Loop horizontally + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6.8h, v0.8h, v26.8h + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v16.8h, v28.8h + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + subs w5, w5, #8 + + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add w13, w5, #7 + bic w13, w13, #7 + sub x9, x9, w13, uxtw #1 + add w13, w13, #8 + sub x4, x4, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + sub x12, x12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #6 + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + sub x12, x12, #6 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. 
+ sub x3, x3, #6 + sub x12, x12, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + +2: + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -2 + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b + +4: // Loop horizontally + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6.8h, v0.8h, v26.8h + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v16.8h, v28.8h + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + ext v26.16b, v0.16b, v1.16b, #6 + ext v28.16b, v16.16b, v17.16b, #6 + ext v27.16b, v0.16b, v1.16b, #8 + ext v29.16b, v16.16b, v17.16b, #8 + + add v6.8h, v6.8h, v26.8h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v7.8h, v28.8h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + subs w5, w5, #8 + + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +endfunc + +sgr_funcs 16 diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S new file mode 100644 index 0000000000..200eb63189 --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration_common.S @@ -0,0 +1,432 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #2 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 1f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Sum all h+2 lines with the main loop + add w11, w11, #2 +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v21 and v24-v26 taking top + // padding into consideration. 
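+        // Each output row is simply the sum of three consecutive input rows of
+        // sumsq (v16/v17 + v18/v19 + v20/v21) and sum (v24 + v25 + v26),
+        // 8 columns per slice; after each store the register window slides
+        // down one row and a single new row is loaded.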
+ tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v24.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.4s, v19.4s}, [x5], x7 + ld1 {v25.8h}, [x6], x8 + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v25.16b, v24.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v26.16b, v24.16b + +3: + subs w3, w3, #1 +.macro add3 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v24.8h, v24.8h, v25.8h + add v16.4s, v16.4s, v20.4s + add v17.4s, v17.4s, v21.4s + add v24.8h, v24.8h, v26.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v24.8h}, [x1], x8 +.endm + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v25.16b, v26.16b + b.le 4f + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3b + +4: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + add3 + +5: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #8 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 0f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Handle h+2 lines with the main loop + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub w3, w3, #1 // Handle h-1 lines with the main loop +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v25 and v26-v30 taking top + // padding into consideration. 
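+        // Same idea as sgr_box3_v above, but summing five rows per output row.
+        // The 5x5 box sums are only needed for every other row, so each
+        // iteration consumes two new input rows (w3 -= 2) and the output
+        // pointers advance by two row strides per stored row.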
+ tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v28.8h}, [x6], x8 + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v28.16b, v26.16b + mov v22.16b, v16.16b + mov v23.16b, v17.16b + mov v29.16b, v26.16b + +3: + cbz w3, 4f + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + +3: + // Start of vertical loop + subs w3, w3, #2 +.macro add5 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v26.8h, v26.8h, v27.8h + add v0.4s, v20.4s, v22.4s + add v1.4s, v21.4s, v23.4s + add v2.8h, v28.8h, v29.8h + add v16.4s, v16.4s, v24.4s + add v17.4s, v17.4s, v25.4s + add v26.8h, v26.8h, v30.8h + add v16.4s, v16.4s, v0.4s + add v17.4s, v17.4s, v1.4s + add v26.8h, v26.8h, v2.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v26.8h}, [x1], x8 +.endm + add5 +.macro shift2 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v26.16b, v28.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b + mov v27.16b, v29.16b + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v28.16b, v30.16b +.endm + shift2 + add x0, x0, x7 + add x1, x1, x8 + b.le 5f + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + add5 + b 6f + +5: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 6f + // !LR_HAVE_BOTTOM + cbnz w3, 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + // Pad the past-edge row from the last content row. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // w3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v29.16b, v28.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + mov v30.16b, v28.16b + add5 + add x0, x0, x7 + add x1, x1, x8 + b 6f + +6: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. 
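+        // (msub computes ptr -= stride * rows_processed; the adds below then
+        // step 8 columns to the right: 32 bytes of int32 sumsq, 16 bytes of
+        // int16 sum.)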
+ // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +function sgr_calc_ab1_neon, export=1 + clz w9, w5 + add x3, x3, #2 // h += 2 + movi v31.4s, #9 // n + mov x5, #455 + mov x8, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + clz w9, w5 + add x3, x3, #3 // h += 3 + asr x3, x3, #1 // h /= 2 + movi v31.4s, #25 // n + mov x5, #164 + mov x8, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + sub w9, w9, #24 // -bitdepth_min_8 + movrel x12, X(sgr_x_by_x) + ld1 {v16.16b, v17.16b, v18.16b}, [x12] + dup v6.8h, w9 // -bitdepth_min_8 + movi v19.16b, #5 + movi v20.8b, #55 // idx of last 5 + movi v21.8b, #72 // idx of last 4 + movi v22.8b, #101 // idx of last 3 + movi v23.8b, #169 // idx of last 2 + movi v24.8b, #254 // idx of last 1 + saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 + add x2, x2, #2 // w += 2 + add x7, x2, #7 + bic x7, x7, #7 // aligned w + sub x7, x8, x7 // increment between rows + movi v29.8h, #1, lsl #8 + dup v28.4s, w4 + dup v30.4s, w5 // one_by_x + sub x0, x0, #(4*(SUM_STRIDE)) + sub x1, x1, #(2*(SUM_STRIDE)) + mov x6, x2 // backup of w + sub v16.16b, v16.16b, v19.16b + sub v17.16b, v17.16b, v19.16b + sub v18.16b, v18.16b, v19.16b +1: + subs x2, x2, #8 + ld1 {v0.4s, v1.4s}, [x0] // a + ld1 {v2.8h}, [x1] // b + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + add v1.8b, v1.8b, v5.8b + add v1.8b, v1.8b, v25.8b + uxtl v1.8h, v1.8b // x + + umull v3.4s, v1.4h, v2.4h // x * BB[i] + umull2 v4.4s, v1.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v2.8h, v29.8h, v1.8h // 256 - x + + st1 {v3.4s, v4.4s}, [x0], #32 + st1 {v2.8h}, [x1], #16 + b.gt 1b + + subs x3, x3, #1 + b.le 0f + add x0, x0, x7, lsl #2 + add x1, x1, x7, lsl #1 + mov x2, x6 + b 1b +0: + ret +endfunc diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S new file mode 100644 index 0000000000..7cdfd6f3f7 --- /dev/null +++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S @@ -0,0 +1,597 @@ 
+/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" + +#define FILTER_OUT_STRIDE 384 + +.macro sgr_funcs bpc +// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_\bpc\()bpc_neon, export=1 + sub x7, x3, #(4*SUM_STRIDE) + add x8, x3, #(4*SUM_STRIDE) + sub x9, x4, #(2*SUM_STRIDE) + add x10, x4, #(2*SUM_STRIDE) + mov x11, #SUM_STRIDE + mov x12, #FILTER_OUT_STRIDE + add x13, x5, #7 + bic x13, x13, #7 // Aligned width +.if \bpc == 8 + sub x2, x2, x13 +.else + sub x2, x2, x13, lsl #1 +.endif + sub x12, x12, x13 + sub x11, x11, x13 + sub x11, x11, #4 // We read 4 extra elements from a + sub x14, x11, #4 // We read 8 extra elements from b + mov x13, x5 + movi v6.8h, #3 + movi v7.4s, #3 +1: + ld1 {v0.8h, v1.8h}, [x9], #32 + ld1 {v2.8h, v3.8h}, [x4], #32 + ld1 {v4.8h, v5.8h}, [x10], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 + +2: + subs x5, x5, #8 + ext v25.16b, v0.16b, v1.16b, #2 // -stride + ext v26.16b, v2.16b, v3.16b, #2 // 0 + ext v27.16b, v4.16b, v5.16b, #2 // +stride + ext v28.16b, v0.16b, v1.16b, #4 // +1-stride + ext v29.16b, v2.16b, v3.16b, #4 // +1 + ext v30.16b, v4.16b, v5.16b, #4 // +1+stride + add v2.8h, v2.8h, v25.8h // -1, -stride + add v26.8h, v26.8h, v27.8h // 0, +stride + add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride + add v2.8h, v2.8h, v26.8h + add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride + add v2.8h, v2.8h, v29.8h // +1 + add v0.8h, v0.8h, v4.8h + + ext v25.16b, v16.16b, v17.16b, #4 // -stride + ext v26.16b, v17.16b, v18.16b, #4 + shl v2.8h, v2.8h, #2 + ext v27.16b, v16.16b, v17.16b, #8 // +1-stride + ext v28.16b, v17.16b, v18.16b, #8 + ext v29.16b, v19.16b, v20.16b, #4 // 0 + ext v30.16b, v20.16b, v21.16b, #4 + mla v2.8h, v0.8h, v6.8h // * 3 -> a + add v25.4s, v25.4s, v19.4s // -stride, -1 + add v26.4s, v26.4s, v20.4s + add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v28.4s + ext v27.16b, v19.16b, v20.16b, #8 // +1 + ext 
v28.16b, v20.16b, v21.16b, #8 + add v16.4s, v16.4s, v22.4s // -1+stride + add v17.4s, v17.4s, v23.4s + add v29.4s, v29.4s, v27.4s // 0, +1 + add v30.4s, v30.4s, v28.4s + add v25.4s, v25.4s, v29.4s + add v26.4s, v26.4s, v30.4s + ext v27.16b, v22.16b, v23.16b, #4 // +stride + ext v28.16b, v23.16b, v24.16b, #4 + ext v29.16b, v22.16b, v23.16b, #8 // +1+stride + ext v30.16b, v23.16b, v24.16b, #8 +.if \bpc == 8 + ld1 {v19.8b}, [x1], #8 // src +.else + ld1 {v19.8h}, [x1], #16 // src +.endif + add v25.4s, v25.4s, v27.4s // +stride + add v26.4s, v26.4s, v28.4s + add v16.4s, v16.4s, v29.4s // +1+stride + add v17.4s, v17.4s, v30.4s + shl v25.4s, v25.4s, #2 + shl v26.4s, v26.4s, #2 + mla v25.4s, v16.4s, v7.4s // * 3 -> b + mla v26.4s, v17.4s, v7.4s +.if \bpc == 8 + uxtl v19.8h, v19.8b // src +.endif + mov v0.16b, v1.16b + umlal v25.4s, v2.4h, v19.4h // b + a * src + umlal2 v26.4s, v2.8h, v19.8h + mov v2.16b, v3.16b + rshrn v25.4h, v25.4s, #9 + rshrn2 v25.8h, v26.4s, #9 + mov v4.16b, v5.16b + st1 {v25.8h}, [x0], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + mov v22.16b, v24.16b + ld1 {v1.8h}, [x9], #16 + ld1 {v3.8h}, [x4], #16 + ld1 {v5.8h}, [x10], #16 + ld1 {v17.4s, v18.4s}, [x7], #32 + ld1 {v20.4s, v21.4s}, [x3], #32 + ld1 {v23.4s, v24.4s}, [x8], #32 + b 2b + +3: + subs x6, x6, #1 + b.le 0f + mov x5, x13 + add x0, x0, x12, lsl #1 + add x1, x1, x2 + add x3, x3, x11, lsl #2 + add x7, x7, x11, lsl #2 + add x8, x8, x11, lsl #2 + add x4, x4, x14, lsl #1 + add x9, x9, x14, lsl #1 + add x10, x10, x14, lsl #1 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_\bpc\()bpc_neon, export=1 + add x7, x3, #(4*(SUM_STRIDE)) + sub x3, x3, #(4*(SUM_STRIDE)) + add x8, x4, #(2*(SUM_STRIDE)) + sub x4, x4, #(2*(SUM_STRIDE)) + mov x9, #(2*SUM_STRIDE) + mov x10, #FILTER_OUT_STRIDE + add x11, x5, #7 + bic x11, x11, #7 // Aligned width +.if \bpc == 8 + sub x2, x2, x11 +.else + sub x2, x2, x11, lsl #1 +.endif + sub x10, x10, x11 + sub x9, x9, x11 + sub x9, x9, #4 // We read 4 extra elements from a + sub x12, x9, #4 // We read 8 extra elements from b + mov x11, x5 + movi v4.8h, #5 + movi v5.4s, #5 + movi v6.8h, #6 + movi v7.4s, #6 +1: + ld1 {v0.8h, v1.8h}, [x4], #32 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 + +2: + subs x5, x5, #8 + ext v24.16b, v0.16b, v1.16b, #4 // +1-stride + ext v25.16b, v2.16b, v3.16b, #4 // +1+stride + ext v22.16b, v0.16b, v1.16b, #2 // -stride + ext v23.16b, v2.16b, v3.16b, #2 // +stride + add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride + add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride + add v2.8h, v22.8h, v23.8h // -stride, +stride + add v0.8h, v0.8h, v25.8h + + ext v22.16b, v16.16b, v17.16b, #4 // -stride + ext v23.16b, v17.16b, v18.16b, #4 + ext v24.16b, v19.16b, v20.16b, #4 // +stride + ext v25.16b, v20.16b, v21.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1-stride + ext v27.16b, v17.16b, v18.16b, #8 + ext v28.16b, v19.16b, v20.16b, #8 // +1+stride + ext v29.16b, v20.16b, v21.16b, #8 + mul v0.8h, v0.8h, v4.8h // * 5 + mla v0.8h, v2.8h, v6.8h // * 6 +.if \bpc == 8 + ld1 {v31.8b}, [x1], #8 +.else + ld1 {v31.8h}, [x1], #16 +.endif + add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v27.4s + add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride + add v20.4s, v20.4s, v29.4s + add v16.4s, v16.4s, v19.4s + add 
v17.4s, v17.4s, v20.4s + + add v22.4s, v22.4s, v24.4s // -stride, +stride + add v23.4s, v23.4s, v25.4s + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. + mul v16.4s, v16.4s, v5.4s // * 5 + mla v16.4s, v22.4s, v7.4s // * 6 + mul v17.4s, v17.4s, v5.4s // * 5 + mla v17.4s, v23.4s, v7.4s // * 6 + +.if \bpc == 8 + uxtl v31.8h, v31.8b +.endif + umlal v16.4s, v0.4h, v31.4h // b + a * src + umlal2 v17.4s, v0.8h, v31.8h + mov v0.16b, v1.16b + rshrn v16.4h, v16.4s, #9 + rshrn2 v16.8h, v17.4s, #9 + mov v2.16b, v3.16b + st1 {v16.8h}, [x0], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + ld1 {v1.8h}, [x4], #16 + ld1 {v3.8h}, [x8], #16 + ld1 {v17.4s, v18.4s}, [x3], #32 + ld1 {v20.4s, v21.4s}, [x7], #32 + b 2b + +3: + subs x6, x6, #1 + b.le 0f + mov x5, x11 + add x0, x0, x10, lsl #1 + add x1, x1, x2 + add x3, x3, x9, lsl #2 + add x7, x7, x9, lsl #2 + add x4, x4, x12, lsl #1 + add x8, x8, x12, lsl #1 + mov x13, x3 + mov x14, x4 + + ld1 {v0.8h, v1.8h}, [x4], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 + +4: + subs x5, x5, #8 + ext v23.16b, v0.16b, v1.16b, #4 // +1 + ext v22.16b, v0.16b, v1.16b, #2 // 0 + add v0.8h, v0.8h, v23.8h // -1, +1 + + ext v24.16b, v16.16b, v17.16b, #4 // 0 + ext v25.16b, v17.16b, v18.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1 + ext v27.16b, v17.16b, v18.16b, #8 + mul v2.8h, v22.8h, v6.8h // * 6 + mla v2.8h, v0.8h, v4.8h // * 5 -> a +.if \bpc == 8 + ld1 {v31.8b}, [x1], #8 +.else + ld1 {v31.8h}, [x1], #16 +.endif + add v16.4s, v16.4s, v26.4s // -1, +1 + add v17.4s, v17.4s, v27.4s +.if \bpc == 8 + uxtl v31.8h, v31.8b +.endif + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. + mul v24.4s, v24.4s, v7.4s // * 6 + mla v24.4s, v16.4s, v5.4s // * 5 -> b + mul v25.4s, v25.4s, v7.4s // * 6 + mla v25.4s, v17.4s, v5.4s // * 5 -> b + + umlal v24.4s, v2.4h, v31.4h // b + a * src + umlal2 v25.4s, v2.8h, v31.8h + mov v0.16b, v1.16b + rshrn v24.4h, v24.4s, #8 + rshrn2 v24.8h, v25.4s, #8 + mov v16.16b, v18.16b + st1 {v24.8h}, [x0], #16 + + b.le 5f + ld1 {v1.8h}, [x4], #16 + ld1 {v17.4s, v18.4s}, [x3], #32 + b 4b + +5: + subs x6, x6, #1 + b.le 0f + mov x5, x11 + add x0, x0, x10, lsl #1 + add x1, x1, x2 + mov x3, x13 // Rewind x3/x4 to where they started + mov x4, x14 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt, const int bitdepth_max); +function sgr_weighted1_\bpc\()bpc_neon, export=1 +.if \bpc == 16 + ldr w8, [sp] +.endif + dup v31.8h, w7 + cmp x6, #2 +.if \bpc == 16 + dup v30.8h, w8 +.endif + add x9, x0, x1 + add x10, x2, x3 + add x11, x4, #2*FILTER_OUT_STRIDE + mov x7, #(4*FILTER_OUT_STRIDE) + lsl x1, x1, #1 + lsl x3, x3, #1 + add x8, x5, #7 + bic x8, x8, #7 // Aligned width +.if \bpc == 8 + sub x1, x1, x8 + sub x3, x3, x8 +.else + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +.endif + sub x7, x7, x8, lsl #1 + mov x8, x5 + b.lt 2f +1: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 + ld1 {v4.8b}, [x10], #8 +.else + ld1 {v0.8h}, [x2], #16 + ld1 {v4.8h}, [x10], #16 +.endif + ld1 {v1.8h}, [x4], #16 + ld1 {v5.8h}, [x11], #16 + subs x5, x5, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u + ushll v4.8h, v4.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u + shl v4.8h, v4.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v5.8h, v5.8h, v4.8h // t1 - u + ushll v2.4s, 
v0.4h, #7 // u << 7 + ushll2 v3.4s, v0.8h, #7 // u << 7 + ushll v6.4s, v4.4h, #7 // u << 7 + ushll2 v7.4s, v4.8h, #7 // u << 7 + smlal v2.4s, v1.4h, v31.4h // v + smlal2 v3.4s, v1.8h, v31.8h // v + smlal v6.4s, v5.4h, v31.4h // v + smlal2 v7.4s, v5.8h, v31.8h // v +.if \bpc == 8 + rshrn v2.4h, v2.4s, #11 + rshrn2 v2.8h, v3.4s, #11 + rshrn v6.4h, v6.4s, #11 + rshrn2 v6.8h, v7.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun v6.8b, v6.8h + st1 {v2.8b}, [x0], #8 + st1 {v6.8b}, [x9], #8 +.else + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v6.4h, v6.4s, #11 + sqrshrun2 v6.8h, v7.4s, #11 + umin v2.8h, v2.8h, v30.8h + umin v6.8h, v6.8h, v30.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x9], #16 +.endif + b.gt 1b + + sub x6, x6, #2 + cmp x6, #1 + b.lt 0f + mov x5, x8 + add x0, x0, x1 + add x9, x9, x1 + add x2, x2, x3 + add x10, x10, x3 + add x4, x4, x7 + add x11, x11, x7 + b.eq 2f + b 1b + +2: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 +.else + ld1 {v0.8h}, [x2], #16 +.endif + ld1 {v1.8h}, [x4], #16 + subs x5, x5, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + ushll v2.4s, v0.4h, #7 // u << 7 + ushll2 v3.4s, v0.8h, #7 // u << 7 + smlal v2.4s, v1.4h, v31.4h // v + smlal2 v3.4s, v1.8h, v31.8h // v +.if \bpc == 8 + rshrn v2.4h, v2.4s, #11 + rshrn2 v2.8h, v3.4s, #11 + sqxtun v2.8b, v2.8h + st1 {v2.8b}, [x0], #8 +.else + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + umin v2.8h, v2.8h, v30.8h + st1 {v2.8h}, [x0], #16 +.endif + b.gt 2b +0: + ret +endfunc + +// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2], const int bitdepth_max); +function sgr_weighted2_\bpc\()bpc_neon, export=1 +.if \bpc == 8 + ldr x8, [sp] +.else + ldp x8, x9, [sp] +.endif + cmp x7, #2 + add x10, x0, x1 + add x11, x2, x3 + add x12, x4, #2*FILTER_OUT_STRIDE + add x13, x5, #2*FILTER_OUT_STRIDE + ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1] +.if \bpc == 16 + dup v29.8h, w9 +.endif + mov x8, #4*FILTER_OUT_STRIDE + lsl x1, x1, #1 + lsl x3, x3, #1 + add x9, x6, #7 + bic x9, x9, #7 // Aligned width +.if \bpc == 8 + sub x1, x1, x9 + sub x3, x3, x9 +.else + sub x1, x1, x9, lsl #1 + sub x3, x3, x9, lsl #1 +.endif + sub x8, x8, x9, lsl #1 + mov x9, x6 + b.lt 2f +1: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 + ld1 {v16.8b}, [x11], #8 +.else + ld1 {v0.8h}, [x2], #16 + ld1 {v16.8h}, [x11], #16 +.endif + ld1 {v1.8h}, [x4], #16 + ld1 {v17.8h}, [x12], #16 + ld1 {v2.8h}, [x5], #16 + ld1 {v18.8h}, [x13], #16 + subs x6, x6, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u + ushll v16.8h, v16.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u + shl v16.8h, v16.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v2.8h, v2.8h, v0.8h // t2 - u + sub v17.8h, v17.8h, v16.8h // t1 - u + sub v18.8h, v18.8h, v16.8h // t2 - u + ushll v3.4s, v0.4h, #7 // u << 7 + ushll2 v4.4s, v0.8h, #7 // u << 7 + ushll v19.4s, v16.4h, #7 // u << 7 + ushll2 v20.4s, v16.8h, #7 // u << 7 + smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) + smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) + smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u) + smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u) +.if \bpc == 8 + rshrn v3.4h, 
v3.4s, #11 + rshrn2 v3.8h, v4.4s, #11 + rshrn v19.4h, v19.4s, #11 + rshrn2 v19.8h, v20.4s, #11 + sqxtun v3.8b, v3.8h + sqxtun v19.8b, v19.8h + st1 {v3.8b}, [x0], #8 + st1 {v19.8b}, [x10], #8 +.else + sqrshrun v3.4h, v3.4s, #11 + sqrshrun2 v3.8h, v4.4s, #11 + sqrshrun v19.4h, v19.4s, #11 + sqrshrun2 v19.8h, v20.4s, #11 + umin v3.8h, v3.8h, v29.8h + umin v19.8h, v19.8h, v29.8h + st1 {v3.8h}, [x0], #16 + st1 {v19.8h}, [x10], #16 +.endif + b.gt 1b + + subs x7, x7, #2 + cmp x7, #1 + b.lt 0f + mov x6, x9 + add x0, x0, x1 + add x10, x10, x1 + add x2, x2, x3 + add x11, x11, x3 + add x4, x4, x8 + add x12, x12, x8 + add x5, x5, x8 + add x13, x13, x8 + b.eq 2f + b 1b + +2: +.if \bpc == 8 + ld1 {v0.8b}, [x2], #8 +.else + ld1 {v0.8h}, [x2], #16 +.endif + ld1 {v1.8h}, [x4], #16 + ld1 {v2.8h}, [x5], #16 + subs x6, x6, #8 +.if \bpc == 8 + ushll v0.8h, v0.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u +.endif + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v2.8h, v2.8h, v0.8h // t2 - u + ushll v3.4s, v0.4h, #7 // u << 7 + ushll2 v4.4s, v0.8h, #7 // u << 7 + smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) + smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) +.if \bpc == 8 + rshrn v3.4h, v3.4s, #11 + rshrn2 v3.8h, v4.4s, #11 + sqxtun v3.8b, v3.8h + st1 {v3.8b}, [x0], #8 +.else + sqrshrun v3.4h, v3.4s, #11 + sqrshrun2 v3.8h, v4.4s, #11 + umin v3.8h, v3.8h, v29.8h + st1 {v3.8h}, [x0], #16 +.endif + b.gt 1b +0: + ret +endfunc +.endm diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S new file mode 100644 index 0000000000..9f7b4e7a89 --- /dev/null +++ b/third_party/dav1d/src/arm/64/mc.S @@ -0,0 +1,3310 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro avg dst, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + add \t0\().8h, \t0\().8h, \t2\().8h + add \t1\().8h, \t1\().8h, \t3\().8h + sqrshrun \dst\().8b, \t0\().8h, #5 + sqrshrun2 \dst\().16b, \t1\().8h, #5 +.endm + +.macro w_avg dst, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + sub \t0\().8h, \t2\().8h, \t0\().8h + sub \t1\().8h, \t3\().8h, \t1\().8h + sqdmulh \t0\().8h, \t0\().8h, v30.8h + sqdmulh \t1\().8h, \t1\().8h, v30.8h + add \t0\().8h, \t2\().8h, \t0\().8h + add \t1\().8h, \t3\().8h, \t1\().8h + sqrshrun \dst\().8b, \t0\().8h, #4 + sqrshrun2 \dst\().16b, \t1\().8h, #4 +.endm + +.macro mask dst, t0, t1, t2, t3 + ld1 {v30.16b}, [x6], 16 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + mul v30.16b, v30.16b, v31.16b + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + shll v28.8h, v30.8b, #8 + shll2 v29.8h, v30.16b, #8 + sub \t0\().8h, \t2\().8h, \t0\().8h + sub \t1\().8h, \t3\().8h, \t1\().8h + sqdmulh \t0\().8h, \t0\().8h, v28.8h + sqdmulh \t1\().8h, \t1\().8h, v29.8h + add \t0\().8h, \t2\().8h, \t0\().8h + add \t1\().8h, \t3\().8h, \t1\().8h + sqrshrun \dst\().8b, \t0\().8h, #4 + sqrshrun2 \dst\().16b, \t1\().8h, #4 +.endm + +.macro bidir_fn type +function \type\()_8bpc_neon, export=1 + clz w4, w4 +.ifc \type, w_avg + dup v30.8h, w6 + neg v30.8h, v30.8h + shl v30.8h, v30.8h, #11 +.endif +.ifc \type, mask + movi v31.16b, #256-2 +.endif + adr x7, L(\type\()_tbl) + sub w4, w4, #24 + ldrh w4, [x7, x4, lsl #1] + \type v4, v0, v1, v2, v3 + sub x7, x7, w4, uxtw + br x7 +40: + AARCH64_VALID_JUMP_TARGET + add x7, x0, x1 + lsl x1, x1, #1 +4: + cmp w5, #4 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x7], x1 + st1 {v4.s}[2], [x0], x1 + st1 {v4.s}[3], [x7], x1 + b.eq 0f + \type v5, v0, v1, v2, v3 + cmp w5, #8 + st1 {v5.s}[0], [x0], x1 + st1 {v5.s}[1], [x7], x1 + st1 {v5.s}[2], [x0], x1 + st1 {v5.s}[3], [x7], x1 + b.eq 0f + \type v4, v0, v1, v2, v3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x7], x1 + \type v5, v0, v1, v2, v3 + st1 {v4.s}[2], [x0], x1 + st1 {v4.s}[3], [x7], x1 + st1 {v5.s}[0], [x0], x1 + st1 {v5.s}[1], [x7], x1 + st1 {v5.s}[2], [x0], x1 + st1 {v5.s}[3], [x7], x1 + ret +80: + AARCH64_VALID_JUMP_TARGET + add x7, x0, x1 + lsl x1, x1, #1 +8: + st1 {v4.d}[0], [x0], x1 + \type v5, v0, v1, v2, v3 + st1 {v4.d}[1], [x7], x1 + st1 {v5.d}[0], [x0], x1 + subs w5, w5, #4 + st1 {v5.d}[1], [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 8b +16: + AARCH64_VALID_JUMP_TARGET + \type v5, v0, v1, v2, v3 + st1 {v4.16b}, [x0], x1 + \type v6, v0, v1, v2, v3 + st1 {v5.16b}, [x0], x1 + \type v7, v0, v1, v2, v3 + st1 {v6.16b}, [x0], x1 + subs w5, w5, #4 + st1 {v7.16b}, [x0], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 16b +320: + AARCH64_VALID_JUMP_TARGET + add x7, x0, x1 + lsl x1, x1, #1 +32: + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + st1 {v4.16b,v5.16b}, [x0], x1 + \type v7, v0, v1, v2, v3 + subs w5, w5, #2 + st1 {v6.16b,v7.16b}, [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 32b +640: + AARCH64_VALID_JUMP_TARGET + add x7, x0, x1 + lsl x1, x1, #1 +64: + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 + \type v16, v0, v1, v2, v3 + \type v17, v0, v1, v2, v3 + st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 + \type v18, v0, v1, v2, v3 + \type v19, v0, v1, v2, v3 + subs w5, w5, #2 + st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 64b +1280: + AARCH64_VALID_JUMP_TARGET + add x7, x0, #64 +128: 
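+        // 128 pixels wide: each row is written as two 64-byte halves, the
+        // left half through x0 and the right half through x7 = x0 + 64;
+        // both pointers step by the full stride, one row per iteration.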
+ \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 + \type v16, v0, v1, v2, v3 + \type v17, v0, v1, v2, v3 + st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 + \type v18, v0, v1, v2, v3 + \type v19, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 + b.le 0f + \type v4, v0, v1, v2, v3 + b 128b +0: + ret +L(\type\()_tbl): + .hword L(\type\()_tbl) - 1280b + .hword L(\type\()_tbl) - 640b + .hword L(\type\()_tbl) - 320b + .hword L(\type\()_tbl) - 16b + .hword L(\type\()_tbl) - 80b + .hword L(\type\()_tbl) - 40b +endfunc +.endm + +bidir_fn avg +bidir_fn w_avg +bidir_fn mask + + +.macro w_mask_fn type +function w_mask_\type\()_8bpc_neon, export=1 + clz w8, w4 + adr x9, L(w_mask_\type\()_tbl) + sub w8, w8, #24 + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + mov w10, #6903 + dup v0.8h, w10 +.if \type == 444 + movi v1.16b, #64 +.elseif \type == 422 + dup v2.8b, w7 + movi v3.8b, #129 + sub v3.8b, v3.8b, v2.8b +.elseif \type == 420 + dup v2.8h, w7 + movi v3.8h, #1, lsl #8 + sub v3.8h, v3.8h, v2.8h +.endif + add x12, x0, x1 + lsl x1, x1, #1 + br x9 +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) + ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) + subs w5, w5, #4 + sub v16.8h, v6.8h, v4.8h + sub v17.8h, v7.8h, v5.8h + sabd v18.8h, v4.8h, v6.8h + sabd v19.8h, v5.8h, v7.8h + uqsub v18.8h, v0.8h, v18.8h + uqsub v19.8h, v0.8h, v19.8h + ushr v18.8h, v18.8h, #8 + ushr v19.8h, v19.8h, #8 + shl v20.8h, v18.8h, #9 + shl v21.8h, v19.8h, #9 + sqdmulh v20.8h, v20.8h, v16.8h + sqdmulh v21.8h, v21.8h, v17.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v5.8h + sqrshrun v22.8b, v20.8h, #4 + sqrshrun v23.8b, v21.8h, #4 +.if \type == 444 + uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 + sub v18.16b, v1.16b, v18.16b + st1 {v18.16b}, [x6], #16 +.elseif \type == 422 + addp v18.8h, v18.8h, v19.8h + xtn v18.8b, v18.8h + uhsub v18.8b, v3.8b, v18.8b + st1 {v18.8b}, [x6], #8 +.elseif \type == 420 + trn1 v24.2d, v18.2d, v19.2d + trn2 v25.2d, v18.2d, v19.2d + add v24.8h, v24.8h, v25.8h + addp v18.8h, v24.8h, v24.8h + sub v18.4h, v3.4h, v18.4h + rshrn v18.8b, v18.8h, #2 + st1 {v18.s}[0], [x6], #4 +.endif + st1 {v22.s}[0], [x0], x1 + st1 {v22.s}[1], [x12], x1 + st1 {v23.s}[0], [x0], x1 + st1 {v23.s}[1], [x12], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x2], #32 + ld1 {v6.8h, v7.8h}, [x3], #32 + subs w5, w5, #2 + sub v16.8h, v6.8h, v4.8h + sub v17.8h, v7.8h, v5.8h + sabd v18.8h, v4.8h, v6.8h + sabd v19.8h, v5.8h, v7.8h + uqsub v18.8h, v0.8h, v18.8h + uqsub v19.8h, v0.8h, v19.8h + ushr v18.8h, v18.8h, #8 + ushr v19.8h, v19.8h, #8 + shl v20.8h, v18.8h, #9 + shl v21.8h, v19.8h, #9 + sqdmulh v20.8h, v20.8h, v16.8h + sqdmulh v21.8h, v21.8h, v17.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v5.8h + sqrshrun v22.8b, v20.8h, #4 + sqrshrun v23.8b, v21.8h, #4 +.if \type == 444 + uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 + sub v18.16b, v1.16b, v18.16b + st1 {v18.16b}, [x6], #16 +.elseif \type == 422 + addp v18.8h, v18.8h, v19.8h + xtn v18.8b, v18.8h + uhsub v18.8b, v3.8b, v18.8b + st1 {v18.8b}, [x6], #8 +.elseif \type == 420 + add v18.8h, v18.8h, v19.8h + addp v18.8h, v18.8h, v18.8h + sub v18.4h, v3.4h, v18.4h + rshrn v18.8b, v18.8h, #2 + st1 {v18.s}[0], [x6], #4 +.endif + st1 {v22.8b}, [x0], x1 + st1 {v23.8b}, [x12], x1 + b.gt 8b + ret +1280: +640: +320: +160: + AARCH64_VALID_JUMP_TARGET + mov w11, w4 + sub x1, x1, w4, uxtw +.if \type == 444 + add x10, x6, w4, uxtw 
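+        // 444 stores one mask byte per pixel, so the mask pointer for the
+        // second row is x6 + w (the 422 case below uses w/2 bytes per row).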
+.elseif \type == 422 + add x10, x6, x11, lsr #1 +.endif + add x9, x3, w4, uxtw #1 + add x7, x2, w4, uxtw #1 +161: + mov w8, w4 +16: + ld1 {v4.8h, v5.8h}, [x2], #32 + ld1 {v6.8h, v7.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x7], #32 + ld1 {v18.8h, v19.8h}, [x9], #32 + subs w8, w8, #16 + sub v6.8h, v6.8h, v4.8h + sub v7.8h, v7.8h, v5.8h + sub v18.8h, v18.8h, v16.8h + sub v19.8h, v19.8h, v17.8h + abs v20.8h, v6.8h + abs v21.8h, v7.8h + abs v22.8h, v18.8h + abs v23.8h, v19.8h + uqsub v20.8h, v0.8h, v20.8h + uqsub v21.8h, v0.8h, v21.8h + uqsub v22.8h, v0.8h, v22.8h + uqsub v23.8h, v0.8h, v23.8h + ushr v20.8h, v20.8h, #8 + ushr v21.8h, v21.8h, #8 + ushr v22.8h, v22.8h, #8 + ushr v23.8h, v23.8h, #8 + shl v24.8h, v20.8h, #9 + shl v25.8h, v21.8h, #9 + shl v26.8h, v22.8h, #9 + shl v27.8h, v23.8h, #9 + sqdmulh v24.8h, v24.8h, v6.8h + sqdmulh v25.8h, v25.8h, v7.8h + sqdmulh v26.8h, v26.8h, v18.8h + sqdmulh v27.8h, v27.8h, v19.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v16.8h + add v27.8h, v27.8h, v17.8h + sqrshrun v24.8b, v24.8h, #4 + sqrshrun v25.8b, v25.8h, #4 + sqrshrun v26.8b, v26.8h, #4 + sqrshrun v27.8b, v27.8h, #4 +.if \type == 444 + uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 + uzp1 v21.16b, v22.16b, v23.16b // Ditto + sub v20.16b, v1.16b, v20.16b + sub v21.16b, v1.16b, v21.16b + st1 {v20.16b}, [x6], #16 + st1 {v21.16b}, [x10], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h + addp v21.8h, v22.8h, v23.8h + xtn v20.8b, v20.8h + xtn v21.8b, v21.8h + uhsub v20.8b, v3.8b, v20.8b + uhsub v21.8b, v3.8b, v21.8b + st1 {v20.8b}, [x6], #8 + st1 {v21.8b}, [x10], #8 +.elseif \type == 420 + add v20.8h, v20.8h, v22.8h + add v21.8h, v21.8h, v23.8h + addp v20.8h, v20.8h, v21.8h + sub v20.8h, v3.8h, v20.8h + rshrn v20.8b, v20.8h, #2 + st1 {v20.8b}, [x6], #8 +.endif + st1 {v24.8b, v25.8b}, [x0], #16 + st1 {v26.8b, v27.8b}, [x12], #16 + b.gt 16b + subs w5, w5, #2 + add x2, x2, w4, uxtw #1 + add x3, x3, w4, uxtw #1 + add x7, x7, w4, uxtw #1 + add x9, x9, w4, uxtw #1 +.if \type == 444 + add x6, x6, w4, uxtw + add x10, x10, w4, uxtw +.elseif \type == 422 + add x6, x6, x11, lsr #1 + add x10, x10, x11, lsr #1 +.endif + add x0, x0, x1 + add x12, x12, x1 + b.gt 161b + ret +L(w_mask_\type\()_tbl): + .hword L(w_mask_\type\()_tbl) - 1280b + .hword L(w_mask_\type\()_tbl) - 640b + .hword L(w_mask_\type\()_tbl) - 320b + .hword L(w_mask_\type\()_tbl) - 160b + .hword L(w_mask_\type\()_tbl) - 8b + .hword L(w_mask_\type\()_tbl) - 4b +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + + +function blend_8bpc_neon, export=1 + adr x6, L(blend_tbl) + clz w3, w3 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + movi v4.16b, #64 + add x8, x0, x1 + lsl x1, x1, #1 + br x6 +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8b}, [x5], #8 + ld1 {v1.d}[0], [x2], #8 + ld1 {v0.s}[0], [x0] + subs w4, w4, #2 + ld1 {v0.s}[1], [x8] + sub v3.8b, v4.8b, v2.8b + umull v5.8h, v1.8b, v2.8b + umlal v5.8h, v0.8b, v3.8b + rshrn v6.8b, v5.8h, #6 + st1 {v6.s}[0], [x0], x1 + st1 {v6.s}[1], [x8], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.16b}, [x5], #16 + ld1 {v1.16b}, [x2], #16 + ld1 {v0.d}[0], [x0] + ld1 {v0.d}[1], [x8] + sub v3.16b, v4.16b, v2.16b + subs w4, w4, #2 + umull v5.8h, v1.8b, v2.8b + umlal v5.8h, v0.8b, v3.8b + umull2 v6.8h, v1.16b, v2.16b + umlal2 v6.8h, v0.16b, v3.16b + rshrn v7.8b, v5.8h, #6 + rshrn2 v7.16b, v6.8h, #6 + st1 {v7.d}[0], [x0], x1 + st1 {v7.d}[1], [x8], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld1 {v1.16b, v2.16b}, [x5], 
#32 + ld1 {v5.16b, v6.16b}, [x2], #32 + ld1 {v0.16b}, [x0] + subs w4, w4, #2 + sub v7.16b, v4.16b, v1.16b + sub v20.16b, v4.16b, v2.16b + ld1 {v3.16b}, [x8] + umull v16.8h, v5.8b, v1.8b + umlal v16.8h, v0.8b, v7.8b + umull2 v17.8h, v5.16b, v1.16b + umlal2 v17.8h, v0.16b, v7.16b + umull v21.8h, v6.8b, v2.8b + umlal v21.8h, v3.8b, v20.8b + umull2 v22.8h, v6.16b, v2.16b + umlal2 v22.8h, v3.16b, v20.16b + rshrn v18.8b, v16.8h, #6 + rshrn2 v18.16b, v17.8h, #6 + rshrn v19.8b, v21.8h, #6 + rshrn2 v19.16b, v22.8h, #6 + st1 {v18.16b}, [x0], x1 + st1 {v19.16b}, [x8], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 + ld1 {v20.16b, v21.16b}, [x0] + subs w4, w4, #2 + ld1 {v22.16b, v23.16b}, [x8] + sub v5.16b, v4.16b, v0.16b + sub v6.16b, v4.16b, v1.16b + sub v30.16b, v4.16b, v2.16b + sub v31.16b, v4.16b, v3.16b + umull v24.8h, v16.8b, v0.8b + umlal v24.8h, v20.8b, v5.8b + umull2 v26.8h, v16.16b, v0.16b + umlal2 v26.8h, v20.16b, v5.16b + umull v28.8h, v17.8b, v1.8b + umlal v28.8h, v21.8b, v6.8b + umull2 v7.8h, v17.16b, v1.16b + umlal2 v7.8h, v21.16b, v6.16b + umull v27.8h, v18.8b, v2.8b + umlal v27.8h, v22.8b, v30.8b + umull2 v1.8h, v18.16b, v2.16b + umlal2 v1.8h, v22.16b, v30.16b + umull v29.8h, v19.8b, v3.8b + umlal v29.8h, v23.8b, v31.8b + umull2 v21.8h, v19.16b, v3.16b + umlal2 v21.8h, v23.16b, v31.16b + rshrn v24.8b, v24.8h, #6 + rshrn2 v24.16b, v26.8h, #6 + rshrn v25.8b, v28.8h, #6 + rshrn2 v25.16b, v7.8h, #6 + rshrn v27.8b, v27.8h, #6 + rshrn2 v27.16b, v1.8h, #6 + rshrn v28.8b, v29.8h, #6 + rshrn2 v28.16b, v21.8h, #6 + st1 {v24.16b, v25.16b}, [x0], x1 + st1 {v27.16b, v28.16b}, [x8], x1 + b.gt 32b + ret +L(blend_tbl): + .hword L(blend_tbl) - 32b + .hword L(blend_tbl) - 16b + .hword L(blend_tbl) - 8b + .hword L(blend_tbl) - 4b +endfunc + +function blend_h_8bpc_neon, export=1 + adr x6, L(blend_h_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w4, uxtw + sub w4, w4, w4, lsr #2 + clz w7, w3 + movi v4.16b, #64 + add x8, x0, x1 + lsl x1, x1, #1 + sub w7, w7, #24 + ldrh w7, [x6, x7, lsl #1] + sub x6, x6, w7, uxtw + br x6 +2: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.h}[0], [x5], #2 + ld1 {v1.s}[0], [x2], #4 + subs w4, w4, #2 + ld1 {v2.h}[0], [x0] + zip1 v0.8b, v0.8b, v0.8b + sub v3.8b, v4.8b, v0.8b + ld1 {v2.h}[1], [x8] + umull v5.8h, v1.8b, v0.8b + umlal v5.8h, v2.8b, v3.8b + rshrn v5.8b, v5.8h, #6 + st1 {v5.h}[0], [x0], x1 + st1 {v5.h}[1], [x8], x1 + b.gt 2b + ret +4: + AARCH64_VALID_JUMP_TARGET + ld2r {v0.8b, v1.8b}, [x5], #2 + ld1 {v2.8b}, [x2], #8 + subs w4, w4, #2 + ext v0.8b, v0.8b, v1.8b, #4 + ld1 {v3.s}[0], [x0] + sub v5.8b, v4.8b, v0.8b + ld1 {v3.s}[1], [x8] + umull v6.8h, v2.8b, v0.8b + umlal v6.8h, v3.8b, v5.8b + rshrn v6.8b, v6.8h, #6 + st1 {v6.s}[0], [x0], x1 + st1 {v6.s}[1], [x8], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld2r {v0.16b, v1.16b}, [x5], #2 + ld1 {v2.16b}, [x2], #16 + ld1 {v3.d}[0], [x0] + ext v0.16b, v0.16b, v1.16b, #8 + sub v5.16b, v4.16b, v0.16b + ld1 {v3.d}[1], [x8] + subs w4, w4, #2 + umull v6.8h, v0.8b, v2.8b + umlal v6.8h, v3.8b, v5.8b + umull2 v7.8h, v0.16b, v2.16b + umlal2 v7.8h, v3.16b, v5.16b + rshrn v16.8b, v6.8h, #6 + rshrn2 v16.16b, v7.8h, #6 + st1 {v16.d}[0], [x0], x1 + st1 {v16.d}[1], [x8], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld2r {v0.16b, v1.16b}, [x5], #2 + ld1 {v2.16b, v3.16b}, [x2], #32 + ld1 {v5.16b}, [x0] + sub v7.16b, v4.16b, v0.16b + sub v16.16b, v4.16b, v1.16b + ld1 {v6.16b}, [x8] + subs w4, w4, #2 + umull v17.8h, v0.8b, 
v2.8b + umlal v17.8h, v5.8b, v7.8b + umull2 v18.8h, v0.16b, v2.16b + umlal2 v18.8h, v5.16b, v7.16b + umull v19.8h, v1.8b, v3.8b + umlal v19.8h, v6.8b, v16.8b + umull2 v20.8h, v1.16b, v3.16b + umlal2 v20.8h, v6.16b, v16.16b + rshrn v21.8b, v17.8h, #6 + rshrn2 v21.16b, v18.8h, #6 + rshrn v22.8b, v19.8h, #6 + rshrn2 v22.16b, v20.8h, #6 + st1 {v21.16b}, [x0], x1 + st1 {v22.16b}, [x8], x1 + b.gt 16b + ret +1280: +640: +320: + AARCH64_VALID_JUMP_TARGET + sub x1, x1, w3, uxtw + add x7, x2, w3, uxtw +321: + ld2r {v0.16b, v1.16b}, [x5], #2 + mov w6, w3 + sub v20.16b, v4.16b, v0.16b + sub v21.16b, v4.16b, v1.16b +32: + ld1 {v16.16b, v17.16b}, [x2], #32 + ld1 {v2.16b, v3.16b}, [x0] + subs w6, w6, #32 + umull v23.8h, v0.8b, v16.8b + umlal v23.8h, v2.8b, v20.8b + ld1 {v18.16b, v19.16b}, [x7], #32 + umull2 v27.8h, v0.16b, v16.16b + umlal2 v27.8h, v2.16b, v20.16b + ld1 {v6.16b, v7.16b}, [x8] + umull v24.8h, v0.8b, v17.8b + umlal v24.8h, v3.8b, v20.8b + umull2 v28.8h, v0.16b, v17.16b + umlal2 v28.8h, v3.16b, v20.16b + umull v25.8h, v1.8b, v18.8b + umlal v25.8h, v6.8b, v21.8b + umull2 v5.8h, v1.16b, v18.16b + umlal2 v5.8h, v6.16b, v21.16b + rshrn v29.8b, v23.8h, #6 + rshrn2 v29.16b, v27.8h, #6 + umull v26.8h, v1.8b, v19.8b + umlal v26.8h, v7.8b, v21.8b + umull2 v31.8h, v1.16b, v19.16b + umlal2 v31.8h, v7.16b, v21.16b + rshrn v30.8b, v24.8h, #6 + rshrn2 v30.16b, v28.8h, #6 + rshrn v23.8b, v25.8h, #6 + rshrn2 v23.16b, v5.8h, #6 + rshrn v24.8b, v26.8h, #6 + st1 {v29.16b, v30.16b}, [x0], #32 + rshrn2 v24.16b, v31.8h, #6 + st1 {v23.16b, v24.16b}, [x8], #32 + b.gt 32b + subs w4, w4, #2 + add x0, x0, x1 + add x8, x8, x1 + add x2, x2, w3, uxtw + add x7, x7, w3, uxtw + b.gt 321b + ret +L(blend_h_tbl): + .hword L(blend_h_tbl) - 1280b + .hword L(blend_h_tbl) - 640b + .hword L(blend_h_tbl) - 320b + .hword L(blend_h_tbl) - 16b + .hword L(blend_h_tbl) - 8b + .hword L(blend_h_tbl) - 4b + .hword L(blend_h_tbl) - 2b +endfunc + +function blend_v_8bpc_neon, export=1 + adr x6, L(blend_v_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w3, uxtw + clz w3, w3 + movi v4.16b, #64 + add x8, x0, x1 + lsl x1, x1, #1 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + br x6 +20: + AARCH64_VALID_JUMP_TARGET + ld1r {v0.8b}, [x5] + sub v1.8b, v4.8b, v0.8b +2: + ld1 {v2.h}[0], [x2], #2 + ld1 {v3.b}[0], [x0] + subs w4, w4, #2 + ld1 {v2.b}[1], [x2] + ld1 {v3.b}[1], [x8] + umull v5.8h, v2.8b, v0.8b + umlal v5.8h, v3.8b, v1.8b + rshrn v5.8b, v5.8h, #6 + add x2, x2, #2 + st1 {v5.b}[0], [x0], x1 + st1 {v5.b}[1], [x8], x1 + b.gt 2b + ret +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v0.2s}, [x5] + sub x1, x1, #2 + sub v1.8b, v4.8b, v0.8b +4: + ld1 {v2.8b}, [x2], #8 + ld1 {v3.s}[0], [x0] + ld1 {v3.s}[1], [x8] + subs w4, w4, #2 + umull v5.8h, v2.8b, v0.8b + umlal v5.8h, v3.8b, v1.8b + rshrn v5.8b, v5.8h, #6 + st1 {v5.h}[0], [x0], #2 + st1 {v5.h}[2], [x8], #2 + st1 {v5.b}[2], [x0], x1 + st1 {v5.b}[6], [x8], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1r {v0.2d}, [x5] + sub x1, x1, #4 + sub v1.16b, v4.16b, v0.16b +8: + ld1 {v2.16b}, [x2], #16 + ld1 {v3.d}[0], [x0] + ld1 {v3.d}[1], [x8] + subs w4, w4, #2 + umull v5.8h, v0.8b, v2.8b + umlal v5.8h, v3.8b, v1.8b + umull2 v6.8h, v0.16b, v2.16b + umlal2 v6.8h, v3.16b, v1.16b + rshrn v7.8b, v5.8h, #6 + rshrn2 v7.16b, v6.8h, #6 + st1 {v7.s}[0], [x0], #4 + st1 {v7.s}[2], [x8], #4 + st1 {v7.h}[2], [x0], x1 + st1 {v7.h}[6], [x8], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b}, [x5] + sub x1, x1, #8 + sub v2.16b, v4.16b, v0.16b +16: + ld1 {v5.16b, v6.16b}, [x2], #32 + 
ld1 {v7.16b}, [x0] + subs w4, w4, #2 + ld1 {v16.16b}, [x8] + umull v17.8h, v5.8b, v0.8b + umlal v17.8h, v7.8b, v2.8b + umull2 v18.8h, v5.16b, v0.16b + umlal2 v18.8h, v7.16b, v2.16b + umull v20.8h, v6.8b, v0.8b + umlal v20.8h, v16.8b, v2.8b + umull2 v21.8h, v6.16b, v0.16b + umlal2 v21.8h, v16.16b, v2.16b + rshrn v19.8b, v17.8h, #6 + rshrn2 v19.16b, v18.8h, #6 + rshrn v22.8b, v20.8h, #6 + rshrn2 v22.16b, v21.8h, #6 + st1 {v19.8b}, [x0], #8 + st1 {v22.8b}, [x8], #8 + st1 {v19.s}[2], [x0], x1 + st1 {v22.s}[2], [x8], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.16b, v1.16b}, [x5] + sub x1, x1, #16 + sub v2.16b, v4.16b, v0.16b + sub v3.8b, v4.8b, v1.8b +32: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 + ld1 {v5.16b, v6.16b}, [x0] + subs w4, w4, #2 + ld1 {v20.16b, v21.16b}, [x8] + umull v22.8h, v16.8b, v0.8b + umlal v22.8h, v5.8b, v2.8b + umull2 v23.8h, v16.16b, v0.16b + umlal2 v23.8h, v5.16b, v2.16b + umull v28.8h, v17.8b, v1.8b + umlal v28.8h, v6.8b, v3.8b + umull v30.8h, v18.8b, v0.8b + umlal v30.8h, v20.8b, v2.8b + umull2 v31.8h, v18.16b, v0.16b + umlal2 v31.8h, v20.16b, v2.16b + umull v25.8h, v19.8b, v1.8b + umlal v25.8h, v21.8b, v3.8b + rshrn v24.8b, v22.8h, #6 + rshrn2 v24.16b, v23.8h, #6 + rshrn v28.8b, v28.8h, #6 + rshrn v30.8b, v30.8h, #6 + rshrn2 v30.16b, v31.8h, #6 + rshrn v27.8b, v25.8h, #6 + st1 {v24.16b}, [x0], #16 + st1 {v30.16b}, [x8], #16 + st1 {v28.8b}, [x0], x1 + st1 {v27.8b}, [x8], x1 + b.gt 32b + ret +L(blend_v_tbl): + .hword L(blend_v_tbl) - 320b + .hword L(blend_v_tbl) - 160b + .hword L(blend_v_tbl) - 80b + .hword L(blend_v_tbl) - 40b + .hword L(blend_v_tbl) - 20b +endfunc + + +// This has got the same signature as the put_8tap functions, +// and assumes that x8 is set to (clz(w)-24). +function put_neon + adr x9, L(put_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +2: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.h}[0], [x2], x3 + ld1 {v1.h}[0], [x2], x3 + subs w5, w5, #2 + st1 {v0.h}[0], [x0], x1 + st1 {v1.h}[0], [x0], x1 + b.gt 2b + ret +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + subs w5, w5, #2 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + subs w5, w5, #2 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + add x8, x0, x1 + lsl x1, x1, #1 + add x9, x2, x3 + lsl x3, x3, #1 +16: + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x9], x3 + subs w5, w5, #2 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x8], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x0] + subs w5, w5, #1 + stp x8, x9, [x0, #16] + add x2, x2, x3 + add x0, x0, x1 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x0] + ldp x10, x11, [x2, #32] + stp x8, x9, [x0, #16] + subs w5, w5, #1 + ldp x12, x13, [x2, #48] + stp x10, x11, [x0, #32] + stp x12, x13, [x0, #48] + add x2, x2, x3 + add x0, x0, x1 + b.gt 64b + ret +128: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x2] + ldp q2, q3, [x2, #32] + stp q0, q1, [x0] + ldp q4, q5, [x2, #64] + stp q2, q3, [x0, #32] + ldp q6, q7, [x2, #96] + subs w5, w5, #1 + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x2, x2, x3 + add x0, x0, x1 + b.gt 128b + ret + +L(put_tbl): + .hword L(put_tbl) - 128b + .hword L(put_tbl) - 64b + .hword L(put_tbl) - 32b + .hword L(put_tbl) - 160b + .hword L(put_tbl) - 8b + .hword L(put_tbl) - 4b + .hword 
L(put_tbl) - 2b +endfunc + + +// This has got the same signature as the prep_8tap functions, +// and assumes that x8 is set to (clz(w)-24), and x7 to w*2. +function prep_neon + adr x9, L(prep_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.s}[0], [x1], x2 + ld1 {v1.s}[0], [x1], x2 + subs w4, w4, #2 + ushll v0.8h, v0.8b, #4 + ushll v1.8h, v1.8b, #4 + st1 {v0.4h, v1.4h}, [x0], #16 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x1], x2 + subs w4, w4, #2 + ushll v0.8h, v0.8b, #4 + ushll v1.8h, v1.8b, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + add x9, x1, x2 + lsl x2, x2, #1 +16: + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x9], x2 + subs w4, w4, #2 + ushll v4.8h, v0.8b, #4 + ushll2 v5.8h, v0.16b, #4 + ushll v6.8h, v1.8b, #4 + ushll2 v7.8h, v1.16b, #4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + add x8, x0, w3, uxtw +32: + ld1 {v0.16b, v1.16b}, [x1], x2 + subs w4, w4, #2 + ushll v4.8h, v0.8b, #4 + ushll2 v5.8h, v0.16b, #4 + ld1 {v2.16b, v3.16b}, [x1], x2 + ushll v6.8h, v1.8b, #4 + ushll2 v7.8h, v1.16b, #4 + ushll v16.8h, v2.8b, #4 + st1 {v4.8h, v5.8h}, [x0], x7 + ushll2 v17.8h, v2.16b, #4 + st1 {v6.8h, v7.8h}, [x8], x7 + ushll v18.8h, v3.8b, #4 + st1 {v16.8h, v17.8h}, [x0], x7 + ushll2 v19.8h, v3.16b, #4 + st1 {v18.8h, v19.8h}, [x8], x7 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + add x8, x0, #32 + mov x6, #64 +64: + ldp q0, q1, [x1] + subs w4, w4, #1 + ushll v4.8h, v0.8b, #4 + ushll2 v5.8h, v0.16b, #4 + ldp q2, q3, [x1, #32] + ushll v6.8h, v1.8b, #4 + ushll2 v7.8h, v1.16b, #4 + add x1, x1, x2 + ushll v16.8h, v2.8b, #4 + st1 {v4.8h, v5.8h}, [x0], x6 + ushll2 v17.8h, v2.16b, #4 + ushll v18.8h, v3.8b, #4 + st1 {v6.8h, v7.8h}, [x8], x6 + ushll2 v19.8h, v3.16b, #4 + st1 {v16.8h, v17.8h}, [x0], x6 + st1 {v18.8h, v19.8h}, [x8], x6 + b.gt 64b + ret +1280: + AARCH64_VALID_JUMP_TARGET + add x8, x0, #64 + mov x6, #128 +128: + ldp q0, q1, [x1] + ldp q2, q3, [x1, #32] + ushll v16.8h, v0.8b, #4 + ushll2 v17.8h, v0.16b, #4 + ushll v18.8h, v1.8b, #4 + ushll2 v19.8h, v1.16b, #4 + ushll v20.8h, v2.8b, #4 + ushll2 v21.8h, v2.16b, #4 + ldp q4, q5, [x1, #64] + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 + ushll v22.8h, v3.8b, #4 + ushll2 v23.8h, v3.16b, #4 + ushll v24.8h, v4.8b, #4 + ushll2 v25.8h, v4.16b, #4 + ushll v26.8h, v5.8b, #4 + ushll2 v27.8h, v5.16b, #4 + ldp q6, q7, [x1, #96] + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 + ushll v28.8h, v6.8b, #4 + ushll2 v29.8h, v6.16b, #4 + ushll v30.8h, v7.8b, #4 + ushll2 v31.8h, v7.16b, #4 + subs w4, w4, #1 + add x1, x1, x2 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 + b.gt 128b + ret + +L(prep_tbl): + .hword L(prep_tbl) - 1280b + .hword L(prep_tbl) - 640b + .hword L(prep_tbl) - 320b + .hword L(prep_tbl) - 160b + .hword L(prep_tbl) - 8b + .hword L(prep_tbl) - 4b +endfunc + + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + ld1 {\d0\wd}[0], [\s0], \strd + ld1 {\d1\wd}[0], [\s1], \strd +.ifnb \d2 + ld1 {\d2\wd}[0], [\s0], \strd + ld1 {\d3\wd}[0], [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd}[0], [\s0], \strd +.endif +.ifnb \d5 + ld1 {\d5\wd}[0], [\s1], \strd +.endif +.ifnb \d6 + ld1 {\d6\wd}[0], [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + ld1 {\d0\wd}, [\s0], \strd + ld1 {\d1\wd}, [\s1], \strd +.ifnb \d2 + ld1 {\d2\wd}, [\s0], \strd + ld1 
{\d3\wd}, [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd}, [\s0], \strd +.endif +.ifnb \d5 + ld1 {\d5\wd}, [\s1], \strd +.endif +.ifnb \d6 + ld1 {\d6\wd}, [\s0], \strd +.endif +.endm +.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro interleave_1 wd, r0, r1, r2, r3, r4 + trn1 \r0\wd, \r0\wd, \r1\wd + trn1 \r1\wd, \r1\wd, \r2\wd +.ifnb \r3 + trn1 \r2\wd, \r2\wd, \r3\wd + trn1 \r3\wd, \r3\wd, \r4\wd +.endif +.endm +.macro interleave_1_h r0, r1, r2, r3, r4 + interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 +.endm +.macro interleave_1_s r0, r1, r2, r3, r4 + interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 +.endm +.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 + trn1 \r0\wd, \r0\wd, \r2\wd + trn1 \r1\wd, \r1\wd, \r3\wd + trn1 \r2\wd, \r2\wd, \r4\wd + trn1 \r3\wd, \r3\wd, \r5\wd +.endm +.macro interleave_2_s r0, r1, r2, r3, r4, r5 + interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 +.endm +.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 + uxtl \r0\().8h, \r0\().8b + uxtl \r1\().8h, \r1\().8b +.ifnb \r2 + uxtl \r2\().8h, \r2\().8b + uxtl \r3\().8h, \r3\().8b +.endif +.ifnb \r4 + uxtl \r4\().8h, \r4\().8b +.endif +.ifnb \r5 + uxtl \r5\().8h, \r5\().8b +.endif +.ifnb \r6 + uxtl \r6\().8h, \r6\().8b +.endif +.endm +.macro mul_mla_4 d, s0, s1, s2, s3, wd + mul \d\wd, \s0\wd, v0.h[0] + mla \d\wd, \s1\wd, v0.h[1] + mla \d\wd, \s2\wd, v0.h[2] + mla \d\wd, \s3\wd, v0.h[3] +.endm +// Interleaving the mul/mla chains actually hurts performance +// significantly on Cortex A53, thus keeping mul/mla tightly +// chained like this. 
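+// The mul_mla_8_* macros below compute the 8-tap FIR dot product
+//     d0[lane] = s0[lane]*c[0] + s1[lane]*c[1] + ... + s7[lane]*c[7]
+// per 16-bit lane, with the coefficients c[] held in v0.h[0..7].
+// The _1 and _2 variants additionally produce d1 from the same source
+// registers shifted by one or two positions, so two output rows can be
+// filtered from a single set of loaded inputs.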
+.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().4h, \s0\().4h, v0.h[0] + mla \d0\().4h, \s1\().4h, v0.h[1] + mla \d0\().4h, \s2\().4h, v0.h[2] + mla \d0\().4h, \s3\().4h, v0.h[3] + mla \d0\().4h, \s4\().4h, v0.h[4] + mla \d0\().4h, \s5\().4h, v0.h[5] + mla \d0\().4h, \s6\().4h, v0.h[6] + mla \d0\().4h, \s7\().4h, v0.h[7] +.endm +.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] +.endm +.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mul \d1\().8h, \s1\().8h, v0.h[0] + mla \d1\().8h, \s2\().8h, v0.h[1] + mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d1\().8h, \s7\().8h, v0.h[6] + mla \d1\().8h, \s8\().8h, v0.h[7] +.endm +.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mul \d1\().8h, \s2\().8h, v0.h[0] + mla \d1\().8h, \s3\().8h, v0.h[1] + mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d1\().8h, \s8\().8h, v0.h[6] + mla \d1\().8h, \s9\().8h, v0.h[7] +.endm +.macro sqrshrun_b shift, r0, r1, r2, r3 + sqrshrun \r0\().8b, \r0\().8h, #\shift +.ifnb \r1 + sqrshrun \r1\().8b, \r1\().8h, #\shift +.endif +.ifnb \r2 + sqrshrun \r2\().8b, \r2\().8h, #\shift + sqrshrun \r3\().8b, \r3\().8h, #\shift +.endif +.endm +.macro srshr_h shift, r0, r1, r2, r3 + srshr \r0\().8h, \r0\().8h, #\shift +.ifnb \r1 + srshr \r1\().8h, \r1\().8h, #\shift +.endif +.ifnb \r2 + srshr \r2\().8h, \r2\().8h, #\shift + srshr \r3\().8h, \r3\().8h, #\shift +.endif +.endm +.macro st_h strd, reg, lanes + st1 {\reg\().h}[0], [x0], \strd + st1 {\reg\().h}[1], [x8], \strd +.if \lanes > 2 + st1 {\reg\().h}[2], [x0], \strd + st1 {\reg\().h}[3], [x8], \strd +.endif +.endm +.macro st_s strd, r0, r1 + st1 {\r0\().s}[0], [x0], \strd + st1 {\r0\().s}[1], [x8], \strd +.ifnb \r1 + st1 {\r1\().s}[0], [x0], \strd + st1 {\r1\().s}[1], [x8], \strd +.endif +.endm +.macro st_d strd, r0, r1 + st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().d}[1], [x8], \strd +.ifnb \r1 + st1 {\r1\().d}[0], [x0], \strd + st1 {\r1\().d}[1], [x8], \strd +.endif +.endm +.macro shift_store_4 type, strd, r0, r1 +.ifc \type, put + sqrshrun_b 6, \r0, \r1 + st_s \strd, \r0, \r1 +.else + srshr_h 2, \r0, \r1 + st_d \strd, \r0, \r1 +.endif +.endm +.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 + st1 {\r0\wd}, [x0], \strd + st1 {\r1\wd}, [x8], \strd +.ifnb \r2 + st1 {\r2\wd}, [x0], \strd + st1 {\r3\wd}, [x8], \strd +.endif +.ifnb \r4 + st1 {\r4\wd}, [x0], \strd + st1 {\r5\wd}, [x8], \strd + st1 {\r6\wd}, [x0], \strd + st1 {\r7\wd}, [x8], \strd +.endif +.endm +.macro 
st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro shift_store_8 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_b 6, \r0, \r1, \r2, \r3 + st_8b \strd, \r0, \r1, \r2, \r3 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st_16b \strd, \r0, \r1, \r2, \r3 +.endif +.endm +.macro shift_store_16 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun \r0\().8b, \r0\().8h, #6 + sqrshrun2 \r0\().16b, \r1\().8h, #6 + sqrshrun \r2\().8b, \r2\().8h, #6 + sqrshrun2 \r2\().16b, \r3\().8h, #6 + st_16b \strd, \r0, \r2 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st1 {\r0\().8h, \r1\().8h}, [x0], \strd + st1 {\r2\().8h, \r3\().8h}, [x8], \strd +.endif +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_8bpc_neon, export=1 + mov x8, \type_h + mov x9, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, w10 + mul \my, \my, w10 + add \mx, \mx, w8 // mx, 8tap_h, 4tap_h + add \my, \my, w9 // my, 8tap_v, 4tap_v +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz w8, \w + tst \mx, #(0x7f << 14) + sub w8, w8, #24 + movrel x10, X(mc_subpel_filters), -8 + b.ne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + b.ne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7f + b.le 4f + mov \mx, w9 +4: + tst \my, #(0x7f << 14) + add \xmx, x10, \mx, uxtw #3 + b.ne L(\type\()_8tap_hv) + + adr x9, L(\type\()_8tap_h_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN h + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +2: + ld1 {v4.8b}, [\src], \s_strd + ld1 {v6.8b}, [\sr2], \s_strd + uxtl v4.8h, v4.8b + uxtl v6.8h, v6.8b + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + subs \h, \h, #2 + trn1 v3.2s, v4.2s, v6.2s + trn2 v6.2s, v4.2s, v6.2s + trn1 v4.2s, v5.2s, v7.2s + trn2 v7.2s, v5.2s, v7.2s + mul v3.4h, v3.4h, v0.h[0] + mla v3.4h, v4.4h, v0.h[1] + mla v3.4h, v6.4h, v0.h[2] + mla v3.4h, v7.4h, v0.h[3] + srshr v3.4h, v3.4h, #2 + sqrshrun v3.8b, v3.8h, #4 + st1 {v3.h}[0], [\dst], \d_strd + st1 {v3.h}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + AARCH64_VALID_JUMP_TARGET + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +4: + ld1 {v16.8b}, [\src], \s_strd + ld1 {v20.8b}, 
[\sr2], \s_strd + uxtl v16.8h, v16.8b + uxtl v20.8h, v20.8b + ext v17.16b, v16.16b, v16.16b, #2 + ext v18.16b, v16.16b, v16.16b, #4 + ext v19.16b, v16.16b, v16.16b, #6 + ext v21.16b, v20.16b, v20.16b, #2 + ext v22.16b, v20.16b, v20.16b, #4 + ext v23.16b, v20.16b, v20.16b, #6 + subs \h, \h, #2 + mul v16.4h, v16.4h, v0.h[0] + mla v16.4h, v17.4h, v0.h[1] + mla v16.4h, v18.4h, v0.h[2] + mla v16.4h, v19.4h, v0.h[3] + mul v20.4h, v20.4h, v0.h[0] + mla v20.4h, v21.4h, v0.h[1] + mla v20.4h, v22.4h, v0.h[2] + mla v20.4h, v23.4h, v0.h[3] + srshr v16.4h, v16.4h, #2 + srshr v20.4h, v20.4h, #2 +.ifc \type, put + sqrshrun v16.8b, v16.8h, #4 + sqrshrun v20.8b, v20.8h, #4 + st1 {v16.s}[0], [\dst], \d_strd + st1 {v20.s}[0], [\ds2], \d_strd +.else + st1 {v16.4h}, [\dst], \d_strd + st1 {v20.4h}, [\ds2], \d_strd +.endif + b.gt 4b + ret + +80: // 8xN h + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmx] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +8: + ld1 {v16.8b, v17.8b}, [\src], \s_strd + ld1 {v20.8b, v21.8b}, [\sr2], \s_strd + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + + mul v18.8h, v16.8h, v0.h[0] + mul v22.8h, v20.8h, v0.h[0] +.irpc i, 1234567 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr + subs \h, \h, #2 + srshr v18.8h, v18.8h, #2 + srshr v22.8h, v22.8h, #2 +.ifc \type, put + sqrshrun v18.8b, v18.8h, #4 + sqrshrun v22.8b, v22.8h, #4 + st1 {v18.8b}, [\dst], \d_strd + st1 {v22.8b}, [\ds2], \d_strd +.else + st1 {v18.8h}, [\dst], \d_strd + st1 {v22.8h}, [\ds2], \d_strd +.endif + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... 
h + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmx] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + sub \s_strd, \s_strd, \w, uxtw + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw +.endif +161: + ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 + ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 + mov \mx, \w + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + +16: + mul v24.8h, v16.8h, v0.h[0] + mul v25.8h, v17.8h, v0.h[0] + mul v26.8h, v20.8h, v0.h[0] + mul v27.8h, v21.8h, v0.h[0] +.irpc i, 1234567 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr + srshr v24.8h, v24.8h, #2 + srshr v25.8h, v25.8h, #2 + srshr v26.8h, v26.8h, #2 + srshr v27.8h, v27.8h, #2 + subs \mx, \mx, #16 +.ifc \type, put + sqrshrun v24.8b, v24.8h, #4 + sqrshrun2 v24.16b, v25.8h, #4 + sqrshrun v26.8b, v26.8h, #4 + sqrshrun2 v26.16b, v27.8h, #4 + st1 {v24.16b}, [\dst], #16 + st1 {v26.16b}, [\ds2], #16 +.else + st1 {v24.8h, v25.8h}, [\dst], #32 + st1 {v26.8h, v27.8h}, [\ds2], #32 +.endif + b.le 9f + + mov v16.16b, v18.16b + mov v20.16b, v22.16b + ld1 {v17.8b, v18.8b}, [\src], #16 + ld1 {v21.8b, v22.8b}, [\sr2], #16 + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_8tap_h_tbl): + .hword L(\type\()_8tap_h_tbl) - 1280b + .hword L(\type\()_8tap_h_tbl) - 640b + .hword L(\type\()_8tap_h_tbl) - 320b + .hword L(\type\()_8tap_h_tbl) - 160b + .hword L(\type\()_8tap_h_tbl) - 80b + .hword L(\type\()_8tap_h_tbl) - 40b + .hword L(\type\()_8tap_h_tbl) - 20b + .hword 0 + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx w9, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +4: + add \xmy, x10, \my, uxtw #3 + + adr x9, L(\type\()_8tap_v_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN v + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + b.gt 28f + + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + // 2x2 v + load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_h v1, v2, v3, v4, v5 + b.gt 24f + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .4h + sqrshrun_b 6, v6 + st_h \d_strd, v6, 2 + ret + +24: // 2x4 v + load_h \sr2, \src, \s_strd, v6, v7 + interleave_1_h v5, v6, v7 + interleave_2_s v1, v2, v3, v4, v5, v6 + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .8h + sqrshrun_b 6, v6 + st_h \d_strd, v6, 4 + ret + +28: // 2x6, 2x8, 2x12, 2x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 + interleave_1_h v1, v2, v3, v4, v5 + interleave_1_h v5, v6, v7 + interleave_2_s v1, v2, v3, v4, v5, v6 + uxtl_b v1, v2, v3, v4 +216: + subs \h, \h, #4 + load_h \sr2, \src, \s_strd, v16, v17, v18, v19 + interleave_1_h 
v7, v16, v17, v18, v19 + interleave_2_s v5, v6, v7, v16, v17, v18 + uxtl_b v5, v6, v7, v16 + mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 + sqrshrun_b 6, v30 + st_h \d_strd, v30, 4 + b.le 0f + cmp \h, #2 + mov v1.16b, v5.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + mov v4.16b, v16.16b + mov v5.16b, v17.16b + mov v6.16b, v18.16b + mov v7.16b, v19.16b + b.eq 26f + b 216b +26: + load_h \sr2, \src, \s_strd, v16, v17 + interleave_1_h v7, v16, v17 + uxtl_b v5, v6, v7, v16 + mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 + sqrshrun_b 6, v30 + st_h \d_strd, v30, 2 +0: + ret +.endif + +40: + AARCH64_VALID_JUMP_TARGET + b.gt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_s v1, v2, v3, v4, v5 + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .8h + shift_store_4 \type, \d_strd, v6 + b.le 0f + load_s \sr2, \src, \s_strd, v6, v7 + interleave_1_s v5, v6, v7 + uxtl_b v5, v6 + mul_mla_4 v7, v3, v4, v5, v6, .8h + shift_store_4 \type, \d_strd, v7 +0: + ret + +480: // 4x6, 4x8, 4x12, 4x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + interleave_1_s v16, v17, v18 + interleave_1_s v18, v19, v20, v21, v22 + uxtl_b v16, v17 + uxtl_b v18, v19, v20, v21 + +48: + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v23, v24, v25, v26 + interleave_1_s v22, v23, v24, v25, v26 + uxtl_b v22, v23, v24, v25 + mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + shift_store_4 \type, \d_strd, v1, v2 + b.le 0f + load_s \sr2, \src, \s_strd, v27, v16 + subs \h, \h, #2 + interleave_1_s v26, v27, v16 + uxtl_b v26, v27 + mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 + shift_store_4 \type, \d_strd, v1 + b.le 0f + load_s \sr2, \src, \s_strd, v17, v18 + subs \h, \h, #2 + interleave_1_s v16, v17, v18 + uxtl_b v16, v17 + mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 + shift_store_4 \type, \d_strd, v2 + b.le 0f + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v19, v20, v21, v22 + interleave_1_s v18, v19, v20, v21, v22 + uxtl_b v18, v19, v20, v21 + mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + shift_store_4 \type, \d_strd, v1, v2 + b.gt 48b +0: + ret + +80: + AARCH64_VALID_JUMP_TARGET + b.gt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + uxtl_b v1, v2, v3, v4, v5 + mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4 v7, v2, v3, v4, v5, .8h + shift_store_8 \type, \d_strd, v6, v7 + b.le 0f + load_8b \sr2, \src, \s_strd, v6, v7 + uxtl_b v6, v7 + mul_mla_4 v1, v3, v4, v5, v6, .8h + mul_mla_4 v2, v4, v5, v6, v7, .8h + shift_store_8 \type, \d_strd, v1, v2 +0: + ret + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... 
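+// (640- and 1280-wide take the same path; all of these sizes run the
+// shared vertical loop below, which filters an 8-column strip per pass
+// and advances to the next 8 columns in the tail at 9:.)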
+640: +1280: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmy] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + uxtl_b v16, v17, v18, v19, v20, v21, v22 + +88: + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v23, v24 + uxtl_b v23, v24 + mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_8 \type, \d_strd, v1, v2 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v25, v26 + uxtl_b v25, v26 + mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_8 \type, \d_strd, v3, v4 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v27, v16 + uxtl_b v27, v16 + mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + shift_store_8 \type, \d_strd, v1, v2 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v17, v18 + uxtl_b v17, v18 + mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + shift_store_8 \type, \d_strd, v3, v4 + b.le 9f + subs \h, \h, #4 + load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 + uxtl_b v19, v20, v21, v22 + mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.gt 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + ret + +160: + AARCH64_VALID_JUMP_TARGET + b.gt 1680b + + // 16x2, 16x4 v + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + cmp \h, #2 + load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + uxtl v16.8h, v1.8b + uxtl v17.8h, v2.8b + uxtl v18.8h, v3.8b + uxtl v19.8h, v4.8b + uxtl v20.8h, v5.8b + uxtl2 v23.8h, v1.16b + uxtl2 v24.8h, v2.16b + uxtl2 v25.8h, v3.16b + uxtl2 v26.8h, v4.16b + uxtl2 v27.8h, v5.16b + mul_mla_4 v1, v16, v17, v18, v19, .8h + mul_mla_4 v16, v17, v18, v19, v20, .8h + mul_mla_4 v2, v23, v24, v25, v26, .8h + mul_mla_4 v17, v24, v25, v26, v27, .8h + shift_store_16 \type, \d_strd, v1, v2, v16, v17 + b.le 0f + load_16b \sr2, \src, \s_strd, v6, v7 + uxtl v21.8h, v6.8b + uxtl v22.8h, v7.8b + uxtl2 v28.8h, v6.16b + uxtl2 v29.8h, v7.16b + mul_mla_4 v1, v18, v19, v20, v21, .8h + mul_mla_4 v3, v19, v20, v21, v22, .8h + mul_mla_4 v2, v25, v26, v27, v28, .8h + mul_mla_4 v4, v26, v27, v28, v29, .8h + shift_store_16 \type, \d_strd, v1, v2, v3, v4 +0: + ret + +L(\type\()_8tap_v_tbl): + .hword L(\type\()_8tap_v_tbl) - 1280b + .hword L(\type\()_8tap_v_tbl) - 640b + .hword L(\type\()_8tap_v_tbl) - 320b + .hword L(\type\()_8tap_v_tbl) - 160b + .hword L(\type\()_8tap_v_tbl) - 80b + .hword L(\type\()_8tap_v_tbl) - 40b + .hword L(\type\()_8tap_v_tbl) - 20b + .hword 0 + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx w9, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +4: + add \xmy, x10, \my, uxtw #3 + + adr x9, L(\type\()_8tap_hv_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 280f + add \xmy, \xmy, #2 + ld1 
{v1.s}[0], [\xmy] + + // 2x2, 2x4 hv + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v28.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h + addp v28.4h, v28.4h, v29.4h + addp v16.4h, v28.4h, v28.4h + srshr v16.4h, v16.4h, #2 + bl L(\type\()_8tap_filter_2) + + trn1 v16.2s, v16.2s, v28.2s + mov v17.8b, v28.8b + +2: + bl L(\type\()_8tap_filter_2) + + ext v18.8b, v17.8b, v28.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v28.4h, v1.h[3] + + sqrshrn v2.4h, v2.4s, #\shift_hv + sqxtun v2.8b, v2.8h + subs \h, \h, #2 + st1 {v2.h}[0], [\dst], \d_strd + st1 {v2.h}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v28.8b + b 2b + +280: // 2x8, 2x16, 2x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v28.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h + addp v28.4h, v28.4h, v29.4h + addp v16.4h, v28.4h, v28.4h + srshr v16.4h, v16.4h, #2 + + bl L(\type\()_8tap_filter_2) + trn1 v16.2s, v16.2s, v28.2s + mov v17.8b, v28.8b + bl L(\type\()_8tap_filter_2) + ext v18.8b, v17.8b, v28.8b, #4 + mov v19.8b, v28.8b + bl L(\type\()_8tap_filter_2) + ext v20.8b, v19.8b, v28.8b, #4 + mov v21.8b, v28.8b + +28: + bl L(\type\()_8tap_filter_2) + ext v22.8b, v21.8b, v28.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal v2.4s, v28.4h, v1.h[7] + + sqrshrn v2.4h, v2.4s, #\shift_hv + sqxtun v2.8b, v2.8h + subs \h, \h, #2 + st1 {v2.h}[0], [\dst], \d_strd + st1 {v2.h}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v28.8b + b 28b + +0: + ret x15 + +L(\type\()_8tap_filter_2): + ld1 {v28.8b}, [\sr2], \s_strd + ld1 {v30.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v30.8h, v30.8b + ext v29.16b, v28.16b, v28.16b, #2 + ext v31.16b, v30.16b, v30.16b, #2 + trn1 v27.2s, v28.2s, v30.2s + trn2 v30.2s, v28.2s, v30.2s + trn1 v28.2s, v29.2s, v31.2s + trn2 v31.2s, v29.2s, v31.2s + mul v27.4h, v27.4h, v0.h[0] + mla v27.4h, v28.4h, v0.h[1] + mla v27.4h, v30.4h, v0.h[2] + mla v27.4h, v31.4h, v0.h[3] + srshr v28.4h, v27.4h, #2 + ret +.endif + +40: + AARCH64_VALID_JUMP_TARGET + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 480f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + // 4x2, 4x4 hv + ld1 {v26.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + srshr v16.4h, v31.4h, #2 + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v28.8b + mov v18.8b, v29.8b + +4: + bl 
L(\type\()_8tap_filter_4) + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v28.4h, v1.h[3] + smull v3.4s, v17.4h, v1.h[0] + smlal v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v28.4h, v1.h[2] + smlal v3.4s, v29.4h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v3.s}[0], [\ds2], \d_strd +.else + st1 {v2.4h}, [\dst], \d_strd + st1 {v3.4h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v28.8b + mov v18.8b, v29.8b + b 4b + +480: // 4x8, 4x16, 4x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v26.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + srshr v16.4h, v31.4h, #2 + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v28.8b + mov v18.8b, v29.8b + bl L(\type\()_8tap_filter_4) + mov v19.8b, v28.8b + mov v20.8b, v29.8b + bl L(\type\()_8tap_filter_4) + mov v21.8b, v28.8b + mov v22.8b, v29.8b + +48: + bl L(\type\()_8tap_filter_4) + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal v2.4s, v28.4h, v1.h[7] + smull v3.4s, v17.4h, v1.h[0] + smlal v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v19.4h, v1.h[2] + smlal v3.4s, v20.4h, v1.h[3] + smlal v3.4s, v21.4h, v1.h[4] + smlal v3.4s, v22.4h, v1.h[5] + smlal v3.4s, v28.4h, v1.h[6] + smlal v3.4s, v29.4h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v3.s}[0], [\ds2], \d_strd +.else + st1 {v2.4h}, [\dst], \d_strd + st1 {v3.4h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v28.8b + mov v22.8b, v29.8b + b 48b +0: + ret x15 + +L(\type\()_8tap_filter_4): + ld1 {v26.8b}, [\sr2], \s_strd + ld1 {v27.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + uxtl v27.8h, v27.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + ext v28.16b, v27.16b, v27.16b, #2 + ext v29.16b, v27.16b, v27.16b, #4 + ext v30.16b, v27.16b, v27.16b, #6 + mul v27.4h, v27.4h, v0.h[0] + mla v27.4h, v28.4h, v0.h[1] + mla v27.4h, v29.4h, v0.h[2] + mla v27.4h, v30.4h, v0.h[3] + srshr v28.4h, v31.4h, #2 + srshr v29.4h, v27.4h, #2 + ret + +80: +160: +320: + AARCH64_VALID_JUMP_TARGET + b.gt 880f + add \xmy, \xmy, #2 + ld1 {v0.8b}, [\xmx] + ld1 {v1.s}[0], [\xmy] + sub \src, \src, #3 + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add 
\ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + bl L(\type\()_8tap_filter_8_first) + bl L(\type\()_8tap_filter_8) + mov v17.16b, v24.16b + mov v18.16b, v25.16b + +8: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v24.4h, v1.h[2] + smlal2 v5.4s, v24.8h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn2 v2.8h, v3.4s, #\shift_hv + sqrshrn v4.4h, v4.4s, #\shift_hv + sqrshrn2 v4.8h, v5.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v4.8b, v4.8h + st1 {v2.8b}, [\dst], \d_strd + st1 {v4.8b}, [\ds2], \d_strd +.else + st1 {v2.8h}, [\dst], \d_strd + st1 {v4.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v24.16b + mov v18.16b, v25.16b + b 8b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv +640: +1280: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmx] + ld1 {v1.8b}, [\xmy] + sub \src, \src, #3 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + bl L(\type\()_8tap_filter_8_first) + bl L(\type\()_8tap_filter_8) + mov v17.16b, v24.16b + mov v18.16b, v25.16b + bl L(\type\()_8tap_filter_8) + mov v19.16b, v24.16b + mov v20.16b, v25.16b + bl L(\type\()_8tap_filter_8) + mov v21.16b, v24.16b + mov v22.16b, v25.16b + +88: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v19.4h, v1.h[2] + smlal2 v5.4s, v19.8h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal v4.4s, v20.4h, v1.h[3] + smlal2 v5.4s, v20.8h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal2 v3.4s, v20.8h, v1.h[4] + smlal v4.4s, v21.4h, v1.h[4] + smlal2 v5.4s, v21.8h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal2 v3.4s, v21.8h, v1.h[5] + smlal v4.4s, v22.4h, v1.h[5] + smlal2 v5.4s, v22.8h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal2 v3.4s, v22.8h, v1.h[6] + smlal v4.4s, v24.4h, v1.h[6] + smlal2 v5.4s, v24.8h, v1.h[6] + smlal v2.4s, v24.4h, v1.h[7] + smlal2 v3.4s, v24.8h, v1.h[7] + smlal v4.4s, v25.4h, v1.h[7] + smlal2 v5.4s, v25.8h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn2 v2.8h, v3.4s, #\shift_hv + sqrshrn v4.4h, v4.4s, #\shift_hv + sqrshrn2 v4.8h, v5.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v4.8b, v4.8h + st1 {v2.8b}, [\dst], \d_strd + st1 {v4.8b}, [\ds2], \d_strd +.else + st1 {v2.8h}, [\dst], \d_strd + st1 
{v4.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v24.16b + mov v22.16b, v25.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + ret x15 + +L(\type\()_8tap_filter_8_first): + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v16.8h, v28.8h, v0.h[0] + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext v27.16b, v28.16b, v29.16b, #(2*4) + mla v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] + mla v16.8h, v26.8h, v0.h[7] + srshr v16.8h, v16.8h, #2 + ret + +L(\type\()_8tap_filter_8): + ld1 {v28.8b, v29.8b}, [\sr2], \s_strd + ld1 {v30.8b, v31.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + uxtl v30.8h, v30.8b + uxtl v31.8h, v31.8b + mul v24.8h, v28.8h, v0.h[0] + mul v25.8h, v30.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr + srshr v24.8h, v24.8h, #2 + srshr v25.8h, v25.8h, #2 + ret + +L(\type\()_8tap_hv_tbl): + .hword L(\type\()_8tap_hv_tbl) - 1280b + .hword L(\type\()_8tap_hv_tbl) - 640b + .hword L(\type\()_8tap_hv_tbl) - 320b + .hword L(\type\()_8tap_hv_tbl) - 160b + .hword L(\type\()_8tap_hv_tbl) - 80b + .hword L(\type\()_8tap_hv_tbl) - 40b + .hword L(\type\()_8tap_hv_tbl) - 20b + .hword 0 +endfunc + + +function \type\()_bilin_8bpc_neon, export=1 + dup v1.16b, \mx + dup v3.16b, \my + mov w9, #16 + sub w8, w9, \mx + sub w9, w9, \my + dup v0.16b, w8 + dup v2.16b, w9 +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz w8, \w + sub w8, w8, #24 + cbnz \mx, L(\type\()_bilin_h) + cbnz \my, L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cbnz \my, L(\type\()_bilin_hv) + + adr x9, L(\type\()_bilin_h_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN h + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + ld1 {v4.s}[0], [\src], \s_strd + ld1 {v6.s}[0], [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #1 + ext v7.8b, v6.8b, v6.8b, #1 + trn1 v4.4h, v4.4h, v6.4h + trn1 v5.4h, v5.4h, v7.4h + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + ld1 {v4.8b}, [\src], \s_strd + ld1 {v6.8b}, [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #1 + ext v7.8b, v6.8b, v6.8b, #1 + trn1 v4.2s, v4.2s, v6.2s + trn1 v5.2s, v5.2s, v7.2s + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 
{v4.s}[1], [\ds2], \d_strd +.else + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.gt 4b + ret + +80: // 8xN h + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + ld1 {v4.16b}, [\src], \s_strd + ld1 {v6.16b}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #1 + ext v7.16b, v6.16b, v6.16b, #1 + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umull v6.8h, v6.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b + umlal v6.8h, v7.8b, v1.8b +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn v6.8b, v6.8h, #4 + st1 {v4.8b}, [\dst], \d_strd + st1 {v6.8b}, [\ds2], \d_strd +.else + st1 {v4.8h}, [\dst], \d_strd + st1 {v6.8h}, [\ds2], \d_strd +.endif + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... h + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, uxtw + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw +.endif +161: + ld1 {v16.d}[1], [\src], #8 + ld1 {v20.d}[1], [\sr2], #8 + mov \mx, \w + +16: + ld1 {v18.16b}, [\src], #16 + ld1 {v22.16b}, [\sr2], #16 + ext v17.16b, v16.16b, v18.16b, #8 + ext v19.16b, v16.16b, v18.16b, #9 + ext v21.16b, v20.16b, v22.16b, #8 + ext v23.16b, v20.16b, v22.16b, #9 + umull v16.8h, v17.8b, v0.8b + umull2 v17.8h, v17.16b, v0.16b + umull v20.8h, v21.8b, v0.8b + umull2 v21.8h, v21.16b, v0.16b + umlal v16.8h, v19.8b, v1.8b + umlal2 v17.8h, v19.16b, v1.16b + umlal v20.8h, v23.8b, v1.8b + umlal2 v21.8h, v23.16b, v1.16b + subs \mx, \mx, #16 +.ifc \type, put + uqrshrn v16.8b, v16.8h, #4 + uqrshrn2 v16.16b, v17.8h, #4 + uqrshrn v20.8b, v20.8h, #4 + uqrshrn2 v20.16b, v21.8h, #4 + st1 {v16.16b}, [\dst], #16 + st1 {v20.16b}, [\ds2], #16 +.else + st1 {v16.8h, v17.8h}, [\dst], #32 + st1 {v20.8h, v21.8h}, [\ds2], #32 +.endif + b.le 9f + + mov v16.16b, v18.16b + mov v20.16b, v22.16b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_bilin_h_tbl): + .hword L(\type\()_bilin_h_tbl) - 1280b + .hword L(\type\()_bilin_h_tbl) - 640b + .hword L(\type\()_bilin_h_tbl) - 320b + .hword L(\type\()_bilin_h_tbl) - 160b + .hword L(\type\()_bilin_h_tbl) - 80b + .hword L(\type\()_bilin_h_tbl) - 40b + .hword L(\type\()_bilin_h_tbl) - 20b + .hword 0 + + +L(\type\()_bilin_v): + cmp \h, #4 + adr x9, L(\type\()_bilin_v_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN v + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + ld1 {v16.h}[0], [\src], \s_strd + b.gt 24f +22: + ld1 {v17.h}[0], [\sr2], \s_strd + ld1 {v18.h}[0], [\src], \s_strd + trn1 v16.4h, v16.4h, v17.4h + trn1 v17.4h, v17.4h, v18.4h + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst] + st1 {v4.h}[1], [\ds2] + ret +24: // 2x4, 2x6, 2x8, ... 
v + ld1 {v17.h}[0], [\sr2], \s_strd + ld1 {v18.h}[0], [\src], \s_strd + ld1 {v19.h}[0], [\sr2], \s_strd + ld1 {v20.h}[0], [\src], \s_strd + sub \h, \h, #4 + trn1 v16.4h, v16.4h, v17.4h + trn1 v17.4h, v17.4h, v18.4h + trn1 v18.4h, v18.4h, v19.4h + trn1 v19.4h, v19.4h, v20.4h + trn1 v16.2s, v16.2s, v18.2s + trn1 v17.2s, v17.2s, v19.2s + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + cmp \h, #2 + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + st1 {v4.h}[2], [\dst], \d_strd + st1 {v4.h}[3], [\ds2], \d_strd + b.lt 0f + mov v16.8b, v20.8b + b.eq 22b + b 24b +0: + ret +.endif + +40: // 4xN v + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.s}[0], [\src], \s_strd +4: + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + b 4b +0: + ret + +80: // 8xN v + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.8b}, [\src], \s_strd +8: + ld1 {v17.8b}, [\sr2], \s_strd + ld1 {v18.8b}, [\src], \s_strd + umull v4.8h, v16.8b, v2.8b + umull v5.8h, v17.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + umlal v5.8h, v18.8b, v3.8b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn v5.8b, v5.8h, #4 + st1 {v4.8b}, [\dst], \d_strd + st1 {v5.8b}, [\ds2], \d_strd +.else + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + b 8b +0: + ret + +160: // 16xN, 32xN, ... 
+320: +640: +1280: + AARCH64_VALID_JUMP_TARGET + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v16.16b}, [\src], \s_strd +2: + ld1 {v17.16b}, [\sr2], \s_strd + ld1 {v18.16b}, [\src], \s_strd + umull v4.8h, v16.8b, v2.8b + umull2 v5.8h, v16.16b, v2.16b + umull v6.8h, v17.8b, v2.8b + umull2 v7.8h, v17.16b, v2.16b + umlal v4.8h, v17.8b, v3.8b + umlal2 v5.8h, v17.16b, v3.16b + umlal v6.8h, v18.8b, v3.8b + umlal2 v7.8h, v18.16b, v3.16b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn2 v4.16b, v5.8h, #4 + uqrshrn v6.8b, v6.8h, #4 + uqrshrn2 v6.16b, v7.8h, #4 + st1 {v4.16b}, [\dst], \d_strd + st1 {v6.16b}, [\ds2], \d_strd +.else + st1 {v4.8h, v5.8h}, [\dst], \d_strd + st1 {v6.8h, v7.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #16 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 +.ifc \type, put + add \dst, \dst, #16 +.else + add \dst, \dst, #32 +.endif + b 1b +0: + ret + +L(\type\()_bilin_v_tbl): + .hword L(\type\()_bilin_v_tbl) - 1280b + .hword L(\type\()_bilin_v_tbl) - 640b + .hword L(\type\()_bilin_v_tbl) - 320b + .hword L(\type\()_bilin_v_tbl) - 160b + .hword L(\type\()_bilin_v_tbl) - 80b + .hword L(\type\()_bilin_v_tbl) - 40b + .hword L(\type\()_bilin_v_tbl) - 20b + .hword 0 + +L(\type\()_bilin_hv): + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + adr x9, L(\type\()_bilin_hv_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN hv + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.s}[0], [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +2: + ld1 {v28.s}[0], [\sr2], \s_strd + ld1 {v30.s}[0], [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + ext v31.8b, v30.8b, v30.8b, #1 + trn1 v28.4h, v28.4h, v30.4h + trn1 v29.4h, v29.4h, v31.4h + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + + trn1 v16.2s, v16.2s, v17.2s + + mul v4.4h, v16.4h, v2.4h + mla v4.4h, v17.4h, v3.4h + uqrshrn v4.8b, v4.8h, #8 + subs \h, \h, #2 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2s, v17.2s, v17.2s + b 2b +0: + ret +.endif + +40: // 4xN hv + AARCH64_VALID_JUMP_TARGET + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.8b}, [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +4: + ld1 {v28.8b}, [\sr2], \s_strd + ld1 {v30.8b}, [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + ext v31.8b, v30.8b, v30.8b, #1 + trn1 v28.2s, v28.2s, v30.2s + trn1 v29.2s, v29.2s, v31.2s + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + + trn1 v16.2d, v16.2d, v17.2d + + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #8 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + urshr v4.8h, v4.8h, #4 + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.le 0f + trn2 v16.2d, v17.2d, v17.2d + b 4b +0: + ret + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + AARCH64_VALID_JUMP_TARGET + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.16b}, [\src], \s_strd + ext v29.16b, v28.16b, v28.16b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +2: + ld1 {v28.16b}, [\sr2], \s_strd + ld1 {v30.16b}, [\src], \s_strd + ext v29.16b, v28.16b, v28.16b, #1 + ext v31.16b, v30.16b, v30.16b, #1 + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + umull v18.8h, v30.8b, v0.8b + umlal v18.8h, v31.8b, v1.8b + + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v18.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #8 + uqrshrn v5.8b, v5.8h, #8 + st1 {v4.8b}, [\dst], \d_strd + st1 {v5.8b}, [\ds2], \d_strd +.else + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 1b +0: + ret + +L(\type\()_bilin_hv_tbl): + .hword L(\type\()_bilin_hv_tbl) - 1280b + .hword L(\type\()_bilin_hv_tbl) - 640b + .hword L(\type\()_bilin_hv_tbl) - 320b + .hword L(\type\()_bilin_hv_tbl) - 160b + .hword L(\type\()_bilin_hv_tbl) - 80b + .hword L(\type\()_bilin_hv_tbl) - 40b + .hword L(\type\()_bilin_hv_tbl) - 20b + .hword 0 +endfunc +.endm + +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 + +.macro load_filter_row dst, src, inc + asr w13, \src, #10 + add \src, \src, \inc + ldr \dst, [x11, w13, sxtw #3] +.endm + +function warp_filter_horz_neon + add w12, w5, #512 + + ld1 {v16.8b, v17.8b}, [x2], x3 + + load_filter_row d0, w12, w7 + load_filter_row d1, w12, w7 + load_filter_row d2, w12, w7 + load_filter_row d3, w12, w7 + load_filter_row d4, w12, w7 + load_filter_row d5, w12, w7 + load_filter_row d6, w12, w7 + // subtract by 128 to allow using smull + eor v16.8b, v16.8b, v22.8b + eor v17.8b, v17.8b, v22.8b + load_filter_row d7, w12, w7 + + ext v18.8b, v16.8b, v17.8b, #1 + ext v19.8b, v16.8b, v17.8b, #2 + smull v0.8h, v0.8b, v16.8b + smull v1.8h, v1.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #3 + ext v20.8b, v16.8b, v17.8b, #4 + smull v2.8h, v2.8b, v19.8b + smull v3.8h, v3.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #5 + ext v19.8b, v16.8b, v17.8b, #6 + smull v4.8h, v4.8b, v20.8b + smull v5.8h, v5.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #7 + smull v6.8h, v6.8b, v19.8b + smull v7.8h, v7.8b, v18.8b + + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + + addp v0.8h, v0.8h, v4.8h + + add w5, w5, w8 + + ret +endfunc + +// void dav1d_warp_affine_8x8_8bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my) +.macro warp t, shift +function warp_affine_8x8\t\()_8bpc_neon, export=1 + ldr x4, [x4] + sbfx x7, x4, #0, #16 + sbfx x8, x4, #16, #16 + sbfx x9, x4, #32, #16 + sbfx x4, x4, #48, #16 + mov w10, #8 + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + sub x2, x2, #3 + movrel x11, X(mc_warp_filter), 64*8 + mov x15, x30 +.ifnb \t + lsl 
x1, x1, #1 +.endif + + movi v22.8b, #128 +.ifb \t + movi v23.8h, #128 +.else + movi v23.8h, #8, lsl #8 +.endif + + bl warp_filter_horz_neon + srshr v24.8h, v0.8h, #3 + bl warp_filter_horz_neon + srshr v25.8h, v0.8h, #3 + bl warp_filter_horz_neon + srshr v26.8h, v0.8h, #3 + bl warp_filter_horz_neon + srshr v27.8h, v0.8h, #3 + bl warp_filter_horz_neon + srshr v28.8h, v0.8h, #3 + bl warp_filter_horz_neon + srshr v29.8h, v0.8h, #3 + bl warp_filter_horz_neon + srshr v30.8h, v0.8h, #3 + +1: + add w14, w6, #512 + bl warp_filter_horz_neon + srshr v31.8h, v0.8h, #3 + + load_filter_row d0, w14, w9 + load_filter_row d1, w14, w9 + load_filter_row d2, w14, w9 + load_filter_row d3, w14, w9 + load_filter_row d4, w14, w9 + load_filter_row d5, w14, w9 + load_filter_row d6, w14, w9 + load_filter_row d7, w14, w9 + transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl + + // This ordering of smull/smlal/smull2/smlal2 is highly + // beneficial for Cortex A53 here. + smull v16.4s, v24.4h, v0.4h + smlal v16.4s, v25.4h, v1.4h + smlal v16.4s, v26.4h, v2.4h + smlal v16.4s, v27.4h, v3.4h + smlal v16.4s, v28.4h, v4.4h + smlal v16.4s, v29.4h, v5.4h + smlal v16.4s, v30.4h, v6.4h + smlal v16.4s, v31.4h, v7.4h + smull2 v17.4s, v24.8h, v0.8h + smlal2 v17.4s, v25.8h, v1.8h + smlal2 v17.4s, v26.8h, v2.8h + smlal2 v17.4s, v27.8h, v3.8h + smlal2 v17.4s, v28.8h, v4.8h + smlal2 v17.4s, v29.8h, v5.8h + smlal2 v17.4s, v30.8h, v6.8h + smlal2 v17.4s, v31.8h, v7.8h + + mov v24.16b, v25.16b + mov v25.16b, v26.16b + sqrshrn v16.4h, v16.4s, #\shift + mov v26.16b, v27.16b + sqrshrn2 v16.8h, v17.4s, #\shift + mov v27.16b, v28.16b + mov v28.16b, v29.16b + add v16.8h, v16.8h, v23.8h +.ifb \t + sqxtun v16.8b, v16.8h +.endif + mov v29.16b, v30.16b + mov v30.16b, v31.16b + subs w10, w10, #1 +.ifnb \t + st1 {v16.8h}, [x0], x1 +.else + st1 {v16.8b}, [x0], x1 +.endif + + add w6, w6, w4 + b.gt 1b + + ret x15 +endfunc +.endm + +warp , 11 +warp t, 7 + +// void dav1d_emu_edge_8bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_8bpc_neon, export=1 + ldp x8, x9, [sp] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub x12, x3, #1 // ih - 1 + cmp x5, x3 + sub x13, x2, #1 // iw - 1 + csel x12, x12, x5, ge // min(y, ih - 1) + cmp x4, x2 + bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) + csel x13, x13, x4, ge // min(x, iw - 1) + bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) + madd x8, x12, x9, x8 // ref += iclip() * stride + add x8, x8, x13 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add x10, x5, x1 // y + bh + neg x5, x5 // -y + sub x10, x10, x3 // y + bh - ih + sub x12, x1, #1 // bh - 1 + cmp x10, x1 + bic x5, x5, x5, asr #63 // max(-y, 0) + csel x10, x10, x12, lt // min(y + bh - ih, bh-1) + cmp x5, x1 + bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) + csel x5, x5, x12, lt // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add x11, x4, x0 // x + bw + neg x4, x4 // -x + sub x11, x11, x2 // x + bw - iw + sub x13, x0, #1 // bw - 1 + cmp x11, x0 + bic x4, x4, x4, asr #63 // max(-x, 0) + csel x11, x11, x13, lt // min(x + bw - iw, bw-1) + cmp x4, x0 + bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) + csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) + + // 
center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub x1, x1, x5 // bh - top_ext + madd x6, x5, x7, x6 + sub x2, x0, x4 // bw - left_ext + sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext + sub x2, x2, x11 // center_w = bw - left_ext - right_ext + + mov x14, x6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + ld1r {v0.16b}, [x8] + mov x12, x6 // out = dst + mov x3, x4 +1: + subs x3, x3, #16 + st1 {v0.16b}, [x12], #16 + b.gt 1b +.endif + mov x13, x8 + add x12, x6, x4 // out = dst + left_ext + mov x3, x2 +1: + ld1 {v0.16b, v1.16b}, [x13], #32 + subs x3, x3, #32 + st1 {v0.16b, v1.16b}, [x12], #32 + b.gt 1b +.if \need_right + add x3, x8, x2 // in + center_w + sub x3, x3, #1 // in + center_w - 1 + add x12, x6, x4 // dst + left_ext + ld1r {v0.16b}, [x3] + add x12, x12, x2 // out = dst + left_ext + center_w + mov x3, x11 +1: + subs x3, x3, #16 + st1 {v0.16b}, [x12], #16 + b.gt 1b +.endif + + subs x1, x1, #1 // center_h-- + add x6, x6, x7 + add x8, x8, x9 + b.gt 0b +.endm + + cbz x4, 2f + // need_left + cbz x11, 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cbz x11, 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + + cbz x10, 3f + // need_bottom + sub x8, x6, x7 // ref = dst - stride + mov x4, x0 +1: + ld1 {v0.16b, v1.16b}, [x8], #32 + mov x3, x10 +2: + subs x3, x3, #1 + st1 {v0.16b, v1.16b}, [x6], x7 + b.gt 2b + msub x6, x7, x10, x6 // dst -= bottom_ext * stride + subs x4, x4, #32 // bw -= 32 + add x6, x6, #32 // dst += 32 + b.gt 1b + +3: + cbz x5, 3f + // need_top + msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride +1: + ld1 {v0.16b, v1.16b}, [x14], #32 + mov x3, x5 +2: + subs x3, x3, #1 + st1 {v0.16b, v1.16b}, [x6], x7 + b.gt 2b + msub x6, x7, x5, x6 // dst -= top_ext * stride + subs x0, x0, #32 // bw -= 32 + add x6, x6, #32 // dst += 32 + b.gt 1b + +3: + ret +endfunc diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S new file mode 100644 index 0000000000..1bfb12ebb3 --- /dev/null +++ b/third_party/dav1d/src/arm/64/mc16.S @@ -0,0 +1,3611 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define PREP_BIAS 8192 + +.macro avg d0, d1, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + sqadd \t0\().8h, \t0\().8h, \t2\().8h + sqadd \t1\().8h, \t1\().8h, \t3\().8h + smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits + sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1) + sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1) +.endm + +.macro w_avg d0, d1, t0, t1, t2, t3 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + // This difference requires a 17 bit range, and all bits are + // significant for the following multiplication. + ssubl \d0\().4s, \t2\().4h, \t0\().4h + ssubl2 \t0\().4s, \t2\().8h, \t0\().8h + ssubl \d1\().4s, \t3\().4h, \t1\().4h + ssubl2 \t1\().4s, \t3\().8h, \t1\().8h + mul \d0\().4s, \d0\().4s, v27.4s + mul \t0\().4s, \t0\().4s, v27.4s + mul \d1\().4s, \d1\().4s, v27.4s + mul \t1\().4s, \t1\().4s, v27.4s + sshr \d0\().4s, \d0\().4s, #4 + sshr \t0\().4s, \t0\().4s, #4 + sshr \d1\().4s, \d1\().4s, #4 + sshr \t1\().4s, \t1\().4s, #4 + saddw \d0\().4s, \d0\().4s, \t2\().4h + saddw2 \t0\().4s, \t0\().4s, \t2\().8h + saddw \d1\().4s, \d1\().4s, \t3\().4h + saddw2 \t1\().4s, \t1\().4s, \t3\().8h + uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 + uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto + srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits + srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits + add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits + add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits + smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max + smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max + smax \d0\().8h, \d0\().8h, v30.8h // 0 + smax \d1\().8h, \d1\().8h, v30.8h // 0 +.endm + +.macro mask d0, d1, t0, t1, t2, t3 + ld1 {v27.16b}, [x6], 16 + ld1 {\t0\().8h,\t1\().8h}, [x2], 32 + neg v27.16b, v27.16b + ld1 {\t2\().8h,\t3\().8h}, [x3], 32 + sxtl v26.8h, v27.8b + sxtl2 v27.8h, v27.16b + sxtl v24.4s, v26.4h + sxtl2 v25.4s, v26.8h + sxtl v26.4s, v27.4h + sxtl2 v27.4s, v27.8h + ssubl \d0\().4s, \t2\().4h, \t0\().4h + ssubl2 \t0\().4s, \t2\().8h, \t0\().8h + ssubl \d1\().4s, \t3\().4h, \t1\().4h + ssubl2 \t1\().4s, \t3\().8h, \t1\().8h + mul \d0\().4s, \d0\().4s, v24.4s + mul \t0\().4s, \t0\().4s, v25.4s + mul \d1\().4s, \d1\().4s, v26.4s + mul \t1\().4s, \t1\().4s, v27.4s + sshr \d0\().4s, \d0\().4s, #6 + sshr \t0\().4s, \t0\().4s, #6 + sshr \d1\().4s, \d1\().4s, #6 + sshr \t1\().4s, \t1\().4s, #6 + saddw \d0\().4s, \d0\().4s, \t2\().4h + saddw2 \t0\().4s, \t0\().4s, \t2\().8h + saddw \d1\().4s, \d1\().4s, \t3\().4h + saddw2 \t1\().4s, \t1\().4s, \t3\().8h + uzp1 \d0\().8h, \d0\().8h, 
\t0\().8h // Same as xtn, xtn2 + uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto + srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits + srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits + add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits + add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits + smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max + smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max + smax \d0\().8h, \d0\().8h, v30.8h // 0 + smax \d1\().8h, \d1\().8h, v30.8h // 0 +.endm + +.macro bidir_fn type, bdmax +function \type\()_16bpc_neon, export=1 + clz w4, w4 +.ifnc \type, avg + dup v31.8h, \bdmax // bitdepth_max + movi v30.8h, #0 +.endif + clz w7, \bdmax + sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18 +.ifc \type, avg + mov w9, #1 + mov w8, #-2*PREP_BIAS + lsl w9, w9, w7 // 1 << intermediate_bits + add w7, w7, #1 + sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits + neg w7, w7 // -(intermediate_bits+1) + dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits + dup v29.8h, w7 // -(intermediate_bits+1) +.else + mov w8, #PREP_BIAS + lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits + neg w7, w7 // -intermediate_bits + dup v28.8h, w8 // PREP_BIAS >> intermediate_bits + dup v29.8h, w7 // -intermediate_bits +.endif +.ifc \type, w_avg + dup v27.4s, w6 + neg v27.4s, v27.4s +.endif + adr x7, L(\type\()_tbl) + sub w4, w4, #24 + \type v4, v5, v0, v1, v2, v3 + ldrh w4, [x7, x4, lsl #1] + sub x7, x7, w4, uxtw + br x7 +40: + AARCH64_VALID_JUMP_TARGET + add x7, x0, x1 + lsl x1, x1, #1 +4: + subs w5, w5, #4 + st1 {v4.d}[0], [x0], x1 + st1 {v4.d}[1], [x7], x1 + st1 {v5.d}[0], [x0], x1 + st1 {v5.d}[1], [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 4b +80: + AARCH64_VALID_JUMP_TARGET + add x7, x0, x1 + lsl x1, x1, #1 +8: + st1 {v4.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v5.8h}, [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 8b +16: + AARCH64_VALID_JUMP_TARGET + \type v6, v7, v0, v1, v2, v3 + st1 {v4.8h, v5.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v6.8h, v7.8h}, [x0], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 16b +32: + AARCH64_VALID_JUMP_TARGET + \type v6, v7, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 32b +640: + AARCH64_VALID_JUMP_TARGET + add x7, x0, #64 +64: + \type v6, v7, v0, v1, v2, v3 + \type v16, v17, v0, v1, v2, v3 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + \type v18, v19, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 64b +1280: + AARCH64_VALID_JUMP_TARGET + add x7, x0, #64 + mov x8, #128 + sub x1, x1, #128 +128: + \type v6, v7, v0, v1, v2, v3 + \type v16, v17, v0, v1, v2, v3 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8 + \type v18, v19, v0, v1, v2, v3 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8 + \type v4, v5, v0, v1, v2, v3 + \type v6, v7, v0, v1, v2, v3 + \type v16, v17, v0, v1, v2, v3 + subs w5, w5, #1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + \type v18, v19, v0, v1, v2, v3 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 + b.le 0f + \type v4, v5, v0, v1, v2, v3 + b 128b +0: + ret +L(\type\()_tbl): + .hword L(\type\()_tbl) - 1280b + .hword L(\type\()_tbl) - 640b + .hword L(\type\()_tbl) - 32b + .hword L(\type\()_tbl) - 16b + .hword L(\type\()_tbl) - 80b + .hword L(\type\()_tbl) - 40b +endfunc +.endm + +bidir_fn avg, w6 +bidir_fn w_avg, w7 +bidir_fn mask, w7 + + +.macro w_mask_fn type +function w_mask_\type\()_16bpc_neon, export=1 + ldr w8, [sp] + 
clz w9, w4 + adr x10, L(w_mask_\type\()_tbl) + dup v31.8h, w8 // bitdepth_max + sub w9, w9, #24 + clz w8, w8 // clz(bitdepth_max) + ldrh w9, [x10, x9, lsl #1] + sub x10, x10, w9, uxtw + sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 + mov w9, #PREP_BIAS*64 + neg w8, w8 // -sh + mov w11, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd + dup v30.4s, w9 // PREP_BIAS*64 + dup v29.4s, w8 // -sh + dup v0.8h, w11 +.if \type == 444 + movi v1.16b, #64 +.elseif \type == 422 + dup v2.8b, w7 + movi v3.8b, #129 + sub v3.8b, v3.8b, v2.8b +.elseif \type == 420 + dup v2.8h, w7 + movi v3.8h, #1, lsl #8 + sub v3.8h, v3.8h, v2.8h +.endif + add x12, x0, x1 + lsl x1, x1, #1 + br x10 +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) + ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) + subs w5, w5, #4 + sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) + sabd v21.8h, v5.8h, v7.8h + ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v17.4s, v6.8h, v4.8h + ssubl v18.4s, v7.4h, v5.4h + ssubl2 v19.4s, v7.8h, v5.8h + uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() + uqsub v21.8h, v0.8h, v21.8h + sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 + sshll v6.4s, v5.4h, #6 + sshll2 v5.4s, v4.8h, #6 + sshll v4.4s, v4.4h, #6 + ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh + ushr v21.8h, v21.8h, #10 + add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 + add v5.4s, v5.4s, v30.4s + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + uxtl v22.4s, v20.4h + uxtl2 v23.4s, v20.8h + uxtl v24.4s, v21.4h + uxtl2 v25.4s, v21.8h + mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) + mla v5.4s, v17.4s, v23.4s + mla v6.4s, v18.4s, v24.4s + mla v7.4s, v19.4s, v25.4s + srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v5.4s, v5.4s, v29.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + sqxtun v4.4h, v4.4s // iclip_pixel + sqxtun2 v4.8h, v5.4s + sqxtun v5.4h, v6.4s + sqxtun2 v5.8h, v7.4s + umin v4.8h, v4.8h, v31.8h // iclip_pixel + umin v5.8h, v5.8h, v31.8h +.if \type == 444 + uzp1 v20.16b, v20.16b, v21.16b // 64 - m + sub v20.16b, v1.16b, v20.16b // m + st1 {v20.16b}, [x6], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) + xtn v20.8b, v20.8h + uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + st1 {v20.8b}, [x6], #8 +.elseif \type == 420 + trn1 v24.2d, v20.2d, v21.2d + trn2 v25.2d, v20.2d, v21.2d + add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition) + addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) + sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) + rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + st1 {v20.s}[0], [x6], #4 +.endif + st1 {v4.d}[0], [x0], x1 + st1 {v4.d}[1], [x12], x1 + st1 {v5.d}[0], [x0], x1 + st1 {v5.d}[1], [x12], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 + ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 + subs w5, w5, #2 + sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) + sabd v21.8h, v5.8h, v7.8h + ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v17.4s, v6.8h, v4.8h + ssubl v18.4s, v7.4h, v5.4h + ssubl2 v19.4s, v7.8h, v5.8h + uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() + uqsub v21.8h, v0.8h, v21.8h + sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 + sshll v6.4s, v5.4h, #6 + sshll2 v5.4s, v4.8h, #6 + sshll v4.4s, v4.4h, #6 + ushr v20.8h, v20.8h, #10 // 
64-m = (27615 - abs()) >> mask_sh + ushr v21.8h, v21.8h, #10 + add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 + add v5.4s, v5.4s, v30.4s + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + uxtl v22.4s, v20.4h + uxtl2 v23.4s, v20.8h + uxtl v24.4s, v21.4h + uxtl2 v25.4s, v21.8h + mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) + mla v5.4s, v17.4s, v23.4s + mla v6.4s, v18.4s, v24.4s + mla v7.4s, v19.4s, v25.4s + srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v5.4s, v5.4s, v29.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + sqxtun v4.4h, v4.4s // iclip_pixel + sqxtun2 v4.8h, v5.4s + sqxtun v5.4h, v6.4s + sqxtun2 v5.8h, v7.4s + umin v4.8h, v4.8h, v31.8h // iclip_pixel + umin v5.8h, v5.8h, v31.8h +.if \type == 444 + uzp1 v20.16b, v20.16b, v21.16b // 64 - m + sub v20.16b, v1.16b, v20.16b // m + st1 {v20.16b}, [x6], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) + xtn v20.8b, v20.8h + uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + st1 {v20.8b}, [x6], #8 +.elseif \type == 420 + add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition) + addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) + sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) + rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + st1 {v20.s}[0], [x6], #4 +.endif + st1 {v4.8h}, [x0], x1 + st1 {v5.8h}, [x12], x1 + b.gt 8b + ret +1280: +640: +320: +160: + AARCH64_VALID_JUMP_TARGET + mov w11, w4 + sub x1, x1, w4, uxtw #1 +.if \type == 444 + add x10, x6, w4, uxtw +.elseif \type == 422 + add x10, x6, x11, lsr #1 +.endif + add x9, x3, w4, uxtw #1 + add x7, x2, w4, uxtw #1 +161: + mov w8, w4 +16: + ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 + ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2 + ld1 {v6.8h, v7.8h}, [x7], #32 + ld1 {v18.8h, v19.8h}, [x9], #32 + subs w8, w8, #16 + sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2) + sabd v21.8h, v5.8h, v17.8h + ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v23.4s, v16.8h, v4.8h + ssubl v24.4s, v17.4h, v5.4h + ssubl2 v25.4s, v17.8h, v5.8h + uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() + uqsub v21.8h, v0.8h, v21.8h + sshll2 v27.4s, v5.8h, #6 // tmp1 << 6 + sshll v26.4s, v5.4h, #6 + sshll2 v5.4s, v4.8h, #6 + sshll v4.4s, v4.4h, #6 + ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh + ushr v21.8h, v21.8h, #10 + add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 + add v5.4s, v5.4s, v30.4s + add v26.4s, v26.4s, v30.4s + add v27.4s, v27.4s, v30.4s + uxtl v16.4s, v20.4h + uxtl2 v17.4s, v20.8h + uxtl v28.4s, v21.4h + mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m) + uxtl2 v16.4s, v21.8h + mla v5.4s, v23.4s, v17.4s + mla v26.4s, v24.4s, v28.4s + mla v27.4s, v25.4s, v16.4s + srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v5.4s, v5.4s, v29.4s + srshl v26.4s, v26.4s, v29.4s + srshl v27.4s, v27.4s, v29.4s + sqxtun v4.4h, v4.4s // iclip_pixel + sqxtun2 v4.8h, v5.4s + sqxtun v5.4h, v26.4s + sqxtun2 v5.8h, v27.4s + + // Start of other half + sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2) + sabd v23.8h, v7.8h, v19.8h + + umin v4.8h, v4.8h, v31.8h // iclip_pixel + umin v5.8h, v5.8h, v31.8h + + ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit) + ssubl2 v17.4s, v18.8h, v6.8h + ssubl v18.4s, v19.4h, v7.4h + ssubl2 v19.4s, v19.8h, v7.8h + uqsub v22.8h, v0.8h, v22.8h // 27615 - abs() + uqsub v23.8h, 
v0.8h, v23.8h + sshll v24.4s, v6.4h, #6 // tmp1 << 6 + sshll2 v25.4s, v6.8h, #6 + sshll v26.4s, v7.4h, #6 + sshll2 v27.4s, v7.8h, #6 + ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh + ushr v23.8h, v23.8h, #10 + add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64 + add v25.4s, v25.4s, v30.4s + add v26.4s, v26.4s, v30.4s + add v27.4s, v27.4s, v30.4s + uxtl v6.4s, v22.4h + uxtl2 v7.4s, v22.8h + uxtl v28.4s, v23.4h + mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m) + uxtl2 v6.4s, v23.8h + mla v25.4s, v17.4s, v7.4s + mla v26.4s, v18.4s, v28.4s + mla v27.4s, v19.4s, v6.4s + srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + srshl v25.4s, v25.4s, v29.4s + srshl v26.4s, v26.4s, v29.4s + srshl v27.4s, v27.4s, v29.4s + sqxtun v6.4h, v24.4s // iclip_pixel + sqxtun2 v6.8h, v25.4s + sqxtun v7.4h, v26.4s + sqxtun2 v7.8h, v27.4s + umin v6.8h, v6.8h, v31.8h // iclip_pixel + umin v7.8h, v7.8h, v31.8h +.if \type == 444 + uzp1 v20.16b, v20.16b, v21.16b // 64 - m + uzp1 v21.16b, v22.16b, v23.16b + sub v20.16b, v1.16b, v20.16b // m + sub v21.16b, v1.16b, v21.16b + st1 {v20.16b}, [x6], #16 + st1 {v21.16b}, [x10], #16 +.elseif \type == 422 + addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) + addp v21.8h, v22.8h, v23.8h + xtn v20.8b, v20.8h + xtn v21.8b, v21.8h + uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + uhsub v21.8b, v3.8b, v21.8b + st1 {v20.8b}, [x6], #8 + st1 {v21.8b}, [x10], #8 +.elseif \type == 420 + add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition) + add v21.8h, v21.8h, v23.8h + addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition) + sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n)) + rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + st1 {v20.8b}, [x6], #8 +.endif + st1 {v4.8h, v5.8h}, [x0], #32 + st1 {v6.8h, v7.8h}, [x12], #32 + b.gt 16b + subs w5, w5, #2 + add x2, x2, w4, uxtw #1 + add x3, x3, w4, uxtw #1 + add x7, x7, w4, uxtw #1 + add x9, x9, w4, uxtw #1 +.if \type == 444 + add x6, x6, w4, uxtw + add x10, x10, w4, uxtw +.elseif \type == 422 + add x6, x6, x11, lsr #1 + add x10, x10, x11, lsr #1 +.endif + add x0, x0, x1 + add x12, x12, x1 + b.gt 161b + ret +L(w_mask_\type\()_tbl): + .hword L(w_mask_\type\()_tbl) - 1280b + .hword L(w_mask_\type\()_tbl) - 640b + .hword L(w_mask_\type\()_tbl) - 320b + .hword L(w_mask_\type\()_tbl) - 160b + .hword L(w_mask_\type\()_tbl) - 8b + .hword L(w_mask_\type\()_tbl) - 4b +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + + +function blend_16bpc_neon, export=1 + adr x6, L(blend_tbl) + clz w3, w3 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + add x8, x0, x1 + br x6 +40: + AARCH64_VALID_JUMP_TARGET + lsl x1, x1, #1 +4: + ld1 {v2.8b}, [x5], #8 + ld1 {v1.8h}, [x2], #16 + ld1 {v0.d}[0], [x0] + neg v2.8b, v2.8b // -m + subs w4, w4, #2 + ld1 {v0.d}[1], [x8] + sxtl v2.8h, v2.8b + shl v2.8h, v2.8h, #9 // -m << 9 + sub v1.8h, v0.8h, v1.8h // a - b + sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 + add v0.8h, v0.8h, v1.8h + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x8], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + lsl x1, x1, #1 +8: + ld1 {v4.16b}, [x5], #16 + ld1 {v2.8h, v3.8h}, [x2], #32 + neg v5.16b, v4.16b // -m + ld1 {v0.8h}, [x0] + ld1 {v1.8h}, [x8] + sxtl v4.8h, v5.8b + sxtl2 v5.8h, v5.16b + shl v4.8h, v4.8h, #9 // -m << 9 + shl v5.8h, v5.8h, #9 + sub v2.8h, v0.8h, v2.8h // a - b + sub v3.8h, v1.8h, v3.8h + subs 
w4, w4, #2 + sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v3.8h, v3.8h, v5.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + lsl x1, x1, #1 +16: + ld1 {v16.16b, v17.16b}, [x5], #32 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + subs w4, w4, #2 + neg v18.16b, v16.16b // -m + neg v19.16b, v17.16b + ld1 {v0.8h, v1.8h}, [x0] + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + ld1 {v2.8h, v3.8h}, [x8] + shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.8h, v17.8h, #9 + shl v18.8h, v18.8h, #9 + shl v19.8h, v19.8h, #9 + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.8h, v1.8h, v5.8h + sub v6.8h, v2.8h, v6.8h + sub v7.8h, v3.8h, v7.8h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.8h, v5.8h, v17.8h + sqrdmulh v6.8h, v6.8h, v18.8h + sqrdmulh v7.8h, v7.8h, v19.8h + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v2.8h, v3.8h}, [x8], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ld1 {v16.16b, v17.16b}, [x5], #32 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + subs w4, w4, #1 + neg v18.16b, v16.16b // -m + neg v19.16b, v17.16b + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.8h, v17.8h, #9 + shl v18.8h, v18.8h, #9 + shl v19.8h, v19.8h, #9 + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.8h, v1.8h, v5.8h + sub v6.8h, v2.8h, v6.8h + sub v7.8h, v3.8h, v7.8h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.8h, v5.8h, v17.8h + sqrdmulh v6.8h, v6.8h, v18.8h + sqrdmulh v7.8h, v7.8h, v19.8h + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 32b + ret +L(blend_tbl): + .hword L(blend_tbl) - 32b + .hword L(blend_tbl) - 160b + .hword L(blend_tbl) - 80b + .hword L(blend_tbl) - 40b +endfunc + +function blend_h_16bpc_neon, export=1 + adr x6, L(blend_h_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w4, uxtw + sub w4, w4, w4, lsr #2 + clz w7, w3 + add x8, x0, x1 + lsl x1, x1, #1 + sub w7, w7, #24 + ldrh w7, [x6, x7, lsl #1] + sub x6, x6, w7, uxtw + br x6 +2: + AARCH64_VALID_JUMP_TARGET + ld2r {v2.8b, v3.8b}, [x5], #2 + ld1 {v1.4h}, [x2], #8 + ext v2.8b, v2.8b, v3.8b, #6 + subs w4, w4, #2 + neg v2.8b, v2.8b // -m + ld1 {v0.s}[0], [x0] + ld1 {v0.s}[1], [x8] + sxtl v2.8h, v2.8b + shl v2.4h, v2.4h, #9 // -m << 9 + sub v1.4h, v0.4h, v1.4h // a - b + sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 + add v0.4h, v0.4h, v1.4h + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[1], [x8], x1 + b.gt 2b + ret +4: + AARCH64_VALID_JUMP_TARGET + ld2r {v2.8b, v3.8b}, [x5], #2 + ld1 {v1.8h}, [x2], #16 + ext v2.8b, v2.8b, v3.8b, #4 + subs w4, w4, #2 + neg v2.8b, v2.8b // -m + ld1 {v0.d}[0], [x0] + ld1 {v0.d}[1], [x8] + sxtl v2.8h, v2.8b + shl v2.8h, v2.8h, #9 // -m << 9 + sub v1.8h, v0.8h, v1.8h // a - b + sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 + add v0.8h, v0.8h, v1.8h + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x8], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld2r {v4.8b, v5.8b}, [x5], #2 + ld1 {v2.8h, v3.8h}, [x2], #32 + neg v4.8b, v4.8b // -m + neg v5.8b, v5.8b + ld1 {v0.8h}, [x0] + subs w4, w4, #2 + sxtl v4.8h, v4.8b + sxtl v5.8h, v5.8b + ld1 {v1.8h}, [x8] + shl v4.8h, v4.8h, #9 // -m << 9 + shl 
v5.8h, v5.8h, #9 + sub v2.8h, v0.8h, v2.8h // a - b + sub v3.8h, v1.8h, v3.8h + sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v3.8h, v3.8h, v5.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld2r {v16.8b, v17.8b}, [x5], #2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + neg v16.8b, v16.8b // -m + neg v17.8b, v17.8b + ld1 {v0.8h, v1.8h}, [x0] + ld1 {v2.8h, v3.8h}, [x8] + subs w4, w4, #2 + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b + shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.8h, v17.8h, #9 + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.8h, v1.8h, v5.8h + sub v6.8h, v2.8h, v6.8h + sub v7.8h, v3.8h, v7.8h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.8h, v5.8h, v16.8h + sqrdmulh v6.8h, v6.8h, v17.8h + sqrdmulh v7.8h, v7.8h, v17.8h + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v2.8h, v3.8h}, [x8], x1 + b.gt 16b + ret +1280: +640: +320: + AARCH64_VALID_JUMP_TARGET + sub x1, x1, w3, uxtw #1 + add x7, x2, w3, uxtw #1 +321: + ld2r {v24.8b, v25.8b}, [x5], #2 + mov w6, w3 + neg v24.8b, v24.8b // -m + neg v25.8b, v25.8b + sxtl v24.8h, v24.8b + sxtl v25.8h, v25.8b + shl v24.8h, v24.8h, #9 // -m << 9 + shl v25.8h, v25.8h, #9 +32: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w6, w6, #32 + sub v16.8h, v0.8h, v16.8h // a - b + sub v17.8h, v1.8h, v17.8h + sub v18.8h, v2.8h, v18.8h + sub v19.8h, v3.8h, v19.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8] + sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v17.8h, v17.8h, v24.8h + sqrdmulh v18.8h, v18.8h, v24.8h + sqrdmulh v19.8h, v19.8h, v24.8h + sub v20.8h, v4.8h, v20.8h // a - b + sub v21.8h, v5.8h, v21.8h + sub v22.8h, v6.8h, v22.8h + sub v23.8h, v7.8h, v23.8h + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v3.8h, v3.8h, v19.8h + sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v21.8h, v21.8h, v25.8h + sqrdmulh v22.8h, v22.8h, v25.8h + sqrdmulh v23.8h, v23.8h, v25.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + add v7.8h, v7.8h, v23.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64 + b.gt 32b + subs w4, w4, #2 + add x0, x0, x1 + add x8, x8, x1 + add x2, x2, w3, uxtw #1 + add x7, x7, w3, uxtw #1 + b.gt 321b + ret +L(blend_h_tbl): + .hword L(blend_h_tbl) - 1280b + .hword L(blend_h_tbl) - 640b + .hword L(blend_h_tbl) - 320b + .hword L(blend_h_tbl) - 16b + .hword L(blend_h_tbl) - 8b + .hword L(blend_h_tbl) - 4b + .hword L(blend_h_tbl) - 2b +endfunc + +function blend_v_16bpc_neon, export=1 + adr x6, L(blend_v_tbl) + movrel x5, X(obmc_masks) + add x5, x5, w3, uxtw + clz w3, w3 + add x8, x0, x1 + lsl x1, x1, #1 + sub w3, w3, #26 + ldrh w3, [x6, x3, lsl #1] + sub x6, x6, w3, uxtw + br x6 +20: + AARCH64_VALID_JUMP_TARGET + ld1r {v2.8b}, [x5] + neg v2.8b, v2.8b // -m + sxtl v2.8h, v2.8b + shl v2.4h, v2.4h, #9 // -m << 9 +2: + ld1 {v1.s}[0], [x2], #4 + ld1 {v0.h}[0], [x0] + subs w4, w4, #2 + ld1 {v1.h}[1], [x2] + ld1 {v0.h}[1], [x8] + add x2, x2, #4 + sub v1.4h, v0.4h, v1.4h // a - b + sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 + add v0.4h, v0.4h, v1.4h + st1 {v0.h}[0], [x0], x1 + st1 {v0.h}[1], [x8], x1 + b.gt 2b + ret +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v2.2s}, 
[x5] + sub x1, x1, #4 + neg v2.8b, v2.8b // -m + sxtl v2.8h, v2.8b + shl v2.8h, v2.8h, #9 // -m << 9 +4: + ld1 {v1.8h}, [x2], #16 + ld1 {v0.d}[0], [x0] + ld1 {v0.d}[1], [x8] + subs w4, w4, #2 + sub v1.8h, v0.8h, v1.8h // a - b + sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 + add v0.8h, v0.8h, v1.8h + st1 {v0.s}[0], [x0], #4 + st1 {v0.s}[2], [x8], #4 + st1 {v0.h}[2], [x0], x1 + st1 {v0.h}[6], [x8], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8b}, [x5] + sub x1, x1, #8 + neg v4.8b, v4.8b // -m + sxtl v4.8h, v4.8b + shl v4.8h, v4.8h, #9 // -m << 9 +8: + ld1 {v2.8h, v3.8h}, [x2], #32 + ld1 {v0.8h}, [x0] + ld1 {v1.8h}, [x8] + subs w4, w4, #2 + sub v2.8h, v0.8h, v2.8h // a - b + sub v3.8h, v1.8h, v3.8h + sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v3.8h, v3.8h, v4.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + st1 {v0.d}[0], [x0], #8 + st1 {v1.d}[0], [x8], #8 + st1 {v0.s}[2], [x0], x1 + st1 {v1.s}[2], [x8], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v16.16b}, [x5] + sub x1, x1, #16 + neg v17.16b, v16.16b // -m + sxtl v16.8h, v17.8b + sxtl2 v17.8h, v17.16b + shl v16.8h, v16.8h, #9 // -m << 9 + shl v17.4h, v17.4h, #9 +16: + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + ld1 {v0.8h, v1.8h}, [x0] + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x8] + sub v4.8h, v0.8h, v4.8h // a - b + sub v5.4h, v1.4h, v5.4h + sub v6.8h, v2.8h, v6.8h + sub v7.4h, v3.4h, v7.4h + sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v5.4h, v5.4h, v17.4h + sqrdmulh v6.8h, v6.8h, v16.8h + sqrdmulh v7.4h, v7.4h, v17.4h + add v0.8h, v0.8h, v4.8h + add v1.4h, v1.4h, v5.4h + add v2.8h, v2.8h, v6.8h + add v3.4h, v3.4h, v7.4h + st1 {v0.8h}, [x0], #16 + st1 {v2.8h}, [x8], #16 + st1 {v1.4h}, [x0], x1 + st1 {v3.4h}, [x8], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v24.16b, v25.16b}, [x5] + neg v26.16b, v24.16b // -m + neg v27.8b, v25.8b + sxtl v24.8h, v26.8b + sxtl2 v25.8h, v26.16b + sxtl v26.8h, v27.8b + shl v24.8h, v24.8h, #9 // -m << 9 + shl v25.8h, v25.8h, #9 + shl v26.8h, v26.8h, #9 +32: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 + ld1 {v0.8h, v1.8h, v2.8h}, [x0] + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 + ld1 {v4.8h, v5.8h, v6.8h}, [x8] + subs w4, w4, #2 + sub v16.8h, v0.8h, v16.8h // a - b + sub v17.8h, v1.8h, v17.8h + sub v18.8h, v2.8h, v18.8h + sub v20.8h, v4.8h, v20.8h + sub v21.8h, v5.8h, v21.8h + sub v22.8h, v6.8h, v22.8h + sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 + sqrdmulh v17.8h, v17.8h, v25.8h + sqrdmulh v18.8h, v18.8h, v26.8h + sqrdmulh v20.8h, v20.8h, v24.8h + sqrdmulh v21.8h, v21.8h, v25.8h + sqrdmulh v22.8h, v22.8h, v26.8h + add v0.8h, v0.8h, v16.8h + add v1.8h, v1.8h, v17.8h + add v2.8h, v2.8h, v18.8h + add v4.8h, v4.8h, v20.8h + add v5.8h, v5.8h, v21.8h + add v6.8h, v6.8h, v22.8h + st1 {v0.8h, v1.8h, v2.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 + b.gt 32b + ret +L(blend_v_tbl): + .hword L(blend_v_tbl) - 320b + .hword L(blend_v_tbl) - 160b + .hword L(blend_v_tbl) - 80b + .hword L(blend_v_tbl) - 40b + .hword L(blend_v_tbl) - 20b +endfunc + + +// This has got the same signature as the put_8tap functions, +// and assumes that x9 is set to (clz(w)-24). 
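
The put_neon path introduced below dispatches on log2(width) and is a straight 16 bpc row copy. As a reference point, here is a minimal scalar C sketch of the same operation; it is not part of the patch, and the name put_c and the pixel-unit strides are illustrative assumptions (the assembly works with byte strides).

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void put_c(uint16_t *dst, ptrdiff_t dst_stride,
                      const uint16_t *src, ptrdiff_t src_stride,
                      int w, int h)
    {
        do {
            memcpy(dst, src, w * sizeof(*dst)); // copy one row of w 16-bit pixels
            dst += dst_stride;                  // strides in pixels here;
            src += src_stride;                  // the assembly advances by bytes
        } while (--h);
    }
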
+function put_neon + adr x10, L(put_tbl) + ldrh w9, [x10, x9, lsl #1] + sub x10, x10, w9, uxtw + br x10 + +2: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + subs w5, w5, #2 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 2b + ret +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2], x3 + ld1 {v1.4h}, [x2], x3 + subs w5, w5, #2 + st1 {v0.4h}, [x0], x1 + st1 {v1.4h}, [x0], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + add x8, x0, x1 + lsl x1, x1, #1 + add x9, x2, x3 + lsl x3, x3, #1 +8: + ld1 {v0.8h}, [x2], x3 + ld1 {v1.8h}, [x9], x3 + subs w5, w5, #2 + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x0] + subs w5, w5, #1 + stp x8, x9, [x0, #16] + add x2, x2, x3 + add x0, x0, x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x0] + ldp x10, x11, [x2, #32] + stp x8, x9, [x0, #16] + subs w5, w5, #1 + ldp x12, x13, [x2, #48] + stp x10, x11, [x0, #32] + stp x12, x13, [x0, #48] + add x2, x2, x3 + add x0, x0, x1 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x2] + ldp q2, q3, [x2, #32] + stp q0, q1, [x0] + ldp q4, q5, [x2, #64] + stp q2, q3, [x0, #32] + ldp q6, q7, [x2, #96] + subs w5, w5, #1 + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x2, x2, x3 + add x0, x0, x1 + b.gt 64b + ret +128: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x2] + ldp q2, q3, [x2, #32] + stp q0, q1, [x0] + ldp q4, q5, [x2, #64] + stp q2, q3, [x0, #32] + ldp q6, q7, [x2, #96] + subs w5, w5, #1 + stp q4, q5, [x0, #64] + ldp q16, q17, [x2, #128] + stp q6, q7, [x0, #96] + ldp q18, q19, [x2, #160] + stp q16, q17, [x0, #128] + ldp q20, q21, [x2, #192] + stp q18, q19, [x0, #160] + ldp q22, q23, [x2, #224] + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + add x2, x2, x3 + add x0, x0, x1 + b.gt 128b + ret + +L(put_tbl): + .hword L(put_tbl) - 128b + .hword L(put_tbl) - 64b + .hword L(put_tbl) - 32b + .hword L(put_tbl) - 16b + .hword L(put_tbl) - 80b + .hword L(put_tbl) - 4b + .hword L(put_tbl) - 2b +endfunc + + +// This has got the same signature as the prep_8tap functions, +// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and +// x8 to w*2. 
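
The prep_neon path below scales each input pixel up by intermediate_bits (clz(bitdepth_max) - 18, i.e. 4 for 10 bpc and 2 for 12 bpc) and subtracts PREP_BIAS so the intermediate value fits in a signed 16-bit lane, matching the sshl-by-v31 / sub-of-v30 pair in the code. A scalar sketch of that per-pixel math follows; prep_c and the pixel-unit source stride are illustrative, not dav1d's reference implementation.

    #include <stddef.h>
    #include <stdint.h>

    #define PREP_BIAS 8192

    static void prep_c(int16_t *tmp, const uint16_t *src, ptrdiff_t src_stride,
                       int w, int h, int intermediate_bits)
    {
        do {
            for (int x = 0; x < w; x++)   // scale to the intermediate range,
                tmp[x] = (int16_t)((src[x] << intermediate_bits) - PREP_BIAS);
            tmp += w;                     // tmp rows are packed (stride = w)
            src += src_stride;            // pixel stride assumed
        } while (--h);
    }
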
+function prep_neon + adr x10, L(prep_tbl) + ldrh w9, [x10, x9, lsl #1] + dup v31.8h, w7 // intermediate_bits + movi v30.8h, #(PREP_BIAS >> 8), lsl #8 + sub x10, x10, w9, uxtw + br x10 + +40: + AARCH64_VALID_JUMP_TARGET + add x9, x1, x2 + lsl x2, x2, #1 +4: + ld1 {v0.d}[0], [x1], x2 + ld1 {v0.d}[1], [x9], x2 + subs w4, w4, #2 + sshl v0.8h, v0.8h, v31.8h + sub v0.8h, v0.8h, v30.8h + st1 {v0.8h}, [x0], #16 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + add x9, x1, x2 + lsl x2, x2, #1 +8: + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x9], x2 + subs w4, w4, #2 + sshl v0.8h, v0.8h, v31.8h + sshl v1.8h, v1.8h, v31.8h + sub v0.8h, v0.8h, v30.8h + sub v1.8h, v1.8h, v30.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x1] + add x1, x1, x2 + sshl v0.8h, v0.8h, v31.8h + ldp q2, q3, [x1] + add x1, x1, x2 + subs w4, w4, #2 + sshl v1.8h, v1.8h, v31.8h + sshl v2.8h, v2.8h, v31.8h + sshl v3.8h, v3.8h, v31.8h + sub v0.8h, v0.8h, v30.8h + sub v1.8h, v1.8h, v30.8h + sub v2.8h, v2.8h, v30.8h + sub v3.8h, v3.8h, v30.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x1] + sshl v0.8h, v0.8h, v31.8h + ldp q2, q3, [x1, #32] + add x1, x1, x2 + sshl v1.8h, v1.8h, v31.8h + sshl v2.8h, v2.8h, v31.8h + sshl v3.8h, v3.8h, v31.8h + subs w4, w4, #1 + sub v0.8h, v0.8h, v30.8h + sub v1.8h, v1.8h, v30.8h + sub v2.8h, v2.8h, v30.8h + sub v3.8h, v3.8h, v30.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x1] + subs w4, w4, #1 + sshl v0.8h, v0.8h, v31.8h + ldp q2, q3, [x1, #32] + sshl v1.8h, v1.8h, v31.8h + ldp q4, q5, [x1, #64] + sshl v2.8h, v2.8h, v31.8h + sshl v3.8h, v3.8h, v31.8h + ldp q6, q7, [x1, #96] + add x1, x1, x2 + sshl v4.8h, v4.8h, v31.8h + sshl v5.8h, v5.8h, v31.8h + sshl v6.8h, v6.8h, v31.8h + sshl v7.8h, v7.8h, v31.8h + sub v0.8h, v0.8h, v30.8h + sub v1.8h, v1.8h, v30.8h + sub v2.8h, v2.8h, v30.8h + sub v3.8h, v3.8h, v30.8h + stp q0, q1, [x0] + sub v4.8h, v4.8h, v30.8h + sub v5.8h, v5.8h, v30.8h + stp q2, q3, [x0, #32] + sub v6.8h, v6.8h, v30.8h + sub v7.8h, v7.8h, v30.8h + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, x8 + b.gt 64b + ret +128: + AARCH64_VALID_JUMP_TARGET + ldp q0, q1, [x1] + subs w4, w4, #1 + sshl v0.8h, v0.8h, v31.8h + ldp q2, q3, [x1, #32] + sshl v1.8h, v1.8h, v31.8h + ldp q4, q5, [x1, #64] + sshl v2.8h, v2.8h, v31.8h + sshl v3.8h, v3.8h, v31.8h + ldp q6, q7, [x1, #96] + sshl v4.8h, v4.8h, v31.8h + sshl v5.8h, v5.8h, v31.8h + ldp q16, q17, [x1, #128] + sshl v6.8h, v6.8h, v31.8h + sshl v7.8h, v7.8h, v31.8h + ldp q18, q19, [x1, #160] + sshl v16.8h, v16.8h, v31.8h + sshl v17.8h, v17.8h, v31.8h + ldp q20, q21, [x1, #192] + sshl v18.8h, v18.8h, v31.8h + sshl v19.8h, v19.8h, v31.8h + ldp q22, q23, [x1, #224] + add x1, x1, x2 + sshl v20.8h, v20.8h, v31.8h + sshl v21.8h, v21.8h, v31.8h + sshl v22.8h, v22.8h, v31.8h + sshl v23.8h, v23.8h, v31.8h + sub v0.8h, v0.8h, v30.8h + sub v1.8h, v1.8h, v30.8h + sub v2.8h, v2.8h, v30.8h + sub v3.8h, v3.8h, v30.8h + stp q0, q1, [x0] + sub v4.8h, v4.8h, v30.8h + sub v5.8h, v5.8h, v30.8h + stp q2, q3, [x0, #32] + sub v6.8h, v6.8h, v30.8h + sub v7.8h, v7.8h, v30.8h + stp q4, q5, [x0, #64] + sub v16.8h, v16.8h, v30.8h + sub v17.8h, v17.8h, v30.8h + stp q6, q7, [x0, #96] + sub v18.8h, v18.8h, v30.8h + sub v19.8h, v19.8h, v30.8h + stp q16, q17, [x0, #128] + sub v20.8h, v20.8h, v30.8h + sub v21.8h, v21.8h, v30.8h + stp q18, q19, [x0, #160] + sub v22.8h, v22.8h, v30.8h + sub v23.8h, 
v23.8h, v30.8h + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + add x0, x0, x8 + b.gt 128b + ret + +L(prep_tbl): + .hword L(prep_tbl) - 128b + .hword L(prep_tbl) - 64b + .hword L(prep_tbl) - 32b + .hword L(prep_tbl) - 16b + .hword L(prep_tbl) - 80b + .hword L(prep_tbl) - 40b +endfunc + + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + ld1 {\d0\wd}[0], [\s0], \strd + ld1 {\d1\wd}[0], [\s1], \strd +.ifnb \d2 + ld1 {\d2\wd}[0], [\s0], \strd + ld1 {\d3\wd}[0], [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd}[0], [\s0], \strd +.endif +.ifnb \d5 + ld1 {\d5\wd}[0], [\s1], \strd +.endif +.ifnb \d6 + ld1 {\d6\wd}[0], [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + ld1 {\d0\wd}, [\s0], \strd + ld1 {\d1\wd}, [\s1], \strd +.ifnb \d2 + ld1 {\d2\wd}, [\s0], \strd + ld1 {\d3\wd}, [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd}, [\s0], \strd +.endif +.ifnb \d5 + ld1 {\d5\wd}, [\s1], \strd +.endif +.ifnb \d6 + ld1 {\d6\wd}, [\s0], \strd +.endif +.endm +.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5 + ld1 {\d0\wd, \d1\wd}, [\s0], \strd +.ifnb \d2 + ld1 {\d2\wd, \d3\wd}, [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd, \d5\wd}, [\s0], \strd +.endif +.endm +.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5 + load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5 +.endm +.macro interleave_1 wd, r0, r1, r2, r3, r4 + trn1 \r0\wd, \r0\wd, \r1\wd + trn1 \r1\wd, \r1\wd, \r2\wd +.ifnb \r3 + trn1 \r2\wd, \r2\wd, \r3\wd + trn1 \r3\wd, \r3\wd, \r4\wd +.endif +.endm +.macro interleave_1_s r0, r1, r2, r3, r4 + interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 +.endm +.macro umin_h c, wd, r0, r1, r2, r3 + umin \r0\wd, \r0\wd, \c\wd +.ifnb \r1 + umin \r1\wd, \r1\wd, \c\wd +.endif +.ifnb \r2 + umin \r2\wd, \r2\wd, \c\wd + umin \r3\wd, \r3\wd, \c\wd +.endif +.endm +.macro sub_h c, wd, r0, r1, r2, r3 + sub \r0\wd, \r0\wd, \c\wd +.ifnb \r1 + sub \r1\wd, \r1\wd, \c\wd +.endif +.ifnb \r2 + sub \r2\wd, \r2\wd, \c\wd + sub \r3\wd, \r3\wd, \c\wd +.endif +.endm +.macro smull_smlal_4 d, s0, s1, s2, s3 + smull \d\().4s, \s0\().4h, v0.h[0] + smlal \d\().4s, \s1\().4h, v0.h[1] + smlal \d\().4s, \s2\().4h, v0.h[2] + smlal \d\().4s, \s3\().4h, v0.h[3] +.endm +.macro smull2_smlal2_4 d, s0, s1, s2, s3 + smull2 \d\().4s, \s0\().8h, v0.h[0] + smlal2 \d\().4s, \s1\().8h, v0.h[1] + smlal2 \d\().4s, \s2\().8h, v0.h[2] + smlal2 \d\().4s, \s3\().8h, v0.h[3] +.endm +.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 + smull \d\().4s, \s0\().4h, v0.h[0] + smlal \d\().4s, \s1\().4h, v0.h[1] + smlal \d\().4s, \s2\().4h, v0.h[2] + smlal \d\().4s, \s3\().4h, v0.h[3] + smlal \d\().4s, \s4\().4h, v0.h[4] + smlal \d\().4s, \s5\().4h, v0.h[5] + smlal \d\().4s, \s6\().4h, v0.h[6] + smlal \d\().4s, \s7\().4h, v0.h[7] +.endm +.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 + smull2 \d\().4s, \s0\().8h, v0.h[0] + smlal2 \d\().4s, \s1\().8h, v0.h[1] + smlal2 \d\().4s, \s2\().8h, v0.h[2] + smlal2 \d\().4s, \s3\().8h, v0.h[3] + smlal2 \d\().4s, \s4\().8h, v0.h[4] + smlal2 \d\().4s, \s5\().8h, v0.h[5] + smlal2 \d\().4s, \s6\().8h, v0.h[6] + smlal2 \d\().4s, \s7\().8h, v0.h[7] +.endm 
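
The smull_smlal_4/8 and smull2_smlal2_4/8 macros just above accumulate a 4- or 8-tap FIR at 32-bit precision, with the coefficients held in v0.h[0..7]. For one output sample the scalar equivalent is the loop below; filter8 is an illustrative name, not a dav1d symbol, and the final rounding/narrowing (sqrshrun_h or srshl_s in the following macros) is left out.

    #include <stdint.h>

    static int32_t filter8(const int16_t *src, const int16_t f[8])
    {
        int32_t sum = 0;
        for (int k = 0; k < 8; k++)
            sum += (int32_t)src[k] * f[k]; // widening multiply-accumulate, as smlal does
        return sum;                        // caller rounds and narrows afterwards
    }
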
+.macro sqrshrun_h shift, r0, r1, r2, r3 + sqrshrun \r0\().4h, \r0\().4s, #\shift +.ifnb \r1 + sqrshrun2 \r0\().8h, \r1\().4s, #\shift +.endif +.ifnb \r2 + sqrshrun \r2\().4h, \r2\().4s, #\shift + sqrshrun2 \r2\().8h, \r3\().4s, #\shift +.endif +.endm +.macro xtn_h r0, r1, r2, r3 + uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2 +.ifnb \r2 + uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto +.endif +.endm +.macro srshl_s shift, r0, r1, r2, r3 + srshl \r0\().4s, \r0\().4s, \shift\().4s + srshl \r1\().4s, \r1\().4s, \shift\().4s +.ifnb \r2 + srshl \r2\().4s, \r2\().4s, \shift\().4s + srshl \r3\().4s, \r3\().4s, \shift\().4s +.endif +.endm +.macro st_s strd, reg, lanes + st1 {\reg\().s}[0], [x0], \strd + st1 {\reg\().s}[1], [x9], \strd +.if \lanes > 2 + st1 {\reg\().s}[2], [x0], \strd + st1 {\reg\().s}[3], [x9], \strd +.endif +.endm +.macro st_d strd, r0, r1 + st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().d}[1], [x9], \strd +.ifnb \r1 + st1 {\r1\().d}[0], [x0], \strd + st1 {\r1\().d}[1], [x9], \strd +.endif +.endm +.macro shift_store_4 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_h 6, \r0, \r1, \r2, \r3 + umin_h v31, .8h, \r0, \r2 +.else + srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) + xtn_h \r0, \r1, \r2, \r3 + sub_h v29, .8h, \r0, \r2 // PREP_BIAS +.endif + st_d \strd, \r0, \r2 +.endm +.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 + st1 {\r0\wd}, [x0], \strd + st1 {\r1\wd}, [x9], \strd +.ifnb \r2 + st1 {\r2\wd}, [x0], \strd + st1 {\r3\wd}, [x9], \strd +.endif +.ifnb \r4 + st1 {\r4\wd}, [x0], \strd + st1 {\r5\wd}, [x9], \strd + st1 {\r6\wd}, [x0], \strd + st1 {\r7\wd}, [x9], \strd +.endif +.endm +.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro shift_store_8 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_h 6, \r0, \r1, \r2, \r3 + umin_h v31, .8h, \r0, \r2 +.else + srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) + xtn_h \r0, \r1, \r2, \r3 + sub_h v29, .8h, \r0, \r2 // PREP_BIAS +.endif + st_8h \strd, \r0, \r2 +.endm +.macro shift_store_16 type, strd, dst, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_h 6, \r0, \r1, \r2, \r3 + umin \r0\().8h, \r0\().8h, v31.8h + umin \r1\().8h, \r2\().8h, v31.8h +.else + srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) + xtn_h \r0, \r1, \r2, \r3 + sub \r0\().8h, \r0\().8h, v29.8h + sub \r1\().8h, \r2\().8h, v29.8h +.endif + st1 {\r0\().8h, \r1\().8h}, [\dst], \strd +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_16bpc_neon, export=1 + mov w9, \type_h + mov w10, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. 
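
For orientation: each of the REGULAR/SMOOTH/SHARP constants that follow packs two 7-bit base offsets into X(mc_subpel_filters). Judging from the cmp/ubfx/and sequence further down in the 8tap entry point, the low field is selected when w <= 4 and the high field otherwise (the fractional position is folded in on top via the 0x4081 multiply). A hedged C sketch of just that field selection; filter_base is an illustrative name.

    static int filter_base(int enc, int w)
    {
        const int base_small = enc & 0x7f;        // field used when w <= 4
        const int base_large = (enc >> 7) & 0x7f; // field used when w > 4
        return w <= 4 ? base_small : base_large;
    }
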
+#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon +.ifc \bdmax, w8 + ldr w8, [sp] +.endif + mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, w11 + mul \my, \my, w11 + add \mx, \mx, w9 // mx, 8tap_h, 4tap_h + add \my, \my, w10 // my, 8tap_v, 4tap_v +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + dup v31.8h, \bdmax // bitdepth_max + clz \bdmax, \bdmax + clz w9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + mov w12, #6 + tst \mx, #(0x7f << 14) + sub w9, w9, #24 + add w13, w12, \bdmax // 6 + intermediate_bits + sub w12, w12, \bdmax // 6 - intermediate_bits + movrel x11, X(mc_subpel_filters), -8 + b.ne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + b.ne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx w10, \mx, #7, #7 + and \mx, \mx, #0x7f + b.le 4f + mov \mx, w10 +4: + tst \my, #(0x7f << 14) + add \xmx, x11, \mx, uxtw #3 + b.ne L(\type\()_8tap_hv) + + adr x10, L(\type\()_8tap_h_tbl) + dup v30.4s, w12 // 6 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v30.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + dup v29.8h, \bdmax // intermediate_bits +.else + movi v28.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v29.8h, v29.8h // -intermediate_bits +.endif + br x10 + +20: // 2xN h + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +2: + ld1 {v4.8h}, [\src], \s_strd + ld1 {v6.8h}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + subs \h, \h, #2 + trn1 v3.2s, v4.2s, v6.2s + trn2 v6.2s, v4.2s, v6.2s + trn1 v4.2s, v5.2s, v7.2s + trn2 v7.2s, v5.2s, v7.2s + smull v3.4s, v3.4h, v0.h[0] + smlal v3.4s, v4.4h, v0.h[1] + smlal v3.4s, v6.4h, v0.h[2] + smlal v3.4s, v7.4h, v0.h[3] + srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits) + sqxtun v3.4h, v3.4s + srshl v3.4h, v3.4h, v29.4h // -intermediate_bits + umin v3.4h, v3.4h, v31.4h + st1 {v3.s}[0], [\dst], \d_strd + st1 {v3.s}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + AARCH64_VALID_JUMP_TARGET + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +4: + ld1 {v16.8h}, [\src], \s_strd + ld1 {v20.8h}, [\sr2], \s_strd + ext v17.16b, v16.16b, v16.16b, #2 + ext v18.16b, v16.16b, v16.16b, #4 + ext v19.16b, v16.16b, v16.16b, #6 + ext v21.16b, v20.16b, v20.16b, #2 + ext v22.16b, v20.16b, v20.16b, #4 + ext v23.16b, v20.16b, v20.16b, #6 + subs \h, \h, #2 + smull v16.4s, v16.4h, v0.h[0] + smlal v16.4s, v17.4h, v0.h[1] + smlal v16.4s, v18.4h, v0.h[2] + smlal v16.4s, v19.4h, v0.h[3] + smull v20.4s, v20.4h, v0.h[0] + smlal v20.4s, v21.4h, v0.h[1] + 
smlal v20.4s, v22.4h, v0.h[2] + smlal v20.4s, v23.4h, v0.h[3] + srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits) + srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + sqxtun v16.4h, v16.4s + sqxtun2 v16.8h, v20.4s + srshl v16.8h, v16.8h, v29.8h // -intermediate_bits + umin v16.8h, v16.8h, v31.8h +.else + uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 + sub v16.8h, v16.8h, v28.8h // PREP_BIAS +.endif + st1 {v16.d}[0], [\dst], \d_strd + st1 {v16.d}[1], [\ds2], \d_strd + b.gt 4b + ret + +80: +160: +320: +640: +1280: // 8xN, 16xN, 32xN, ... h + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmx] + sub \src, \src, #6 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + sub \s_strd, \s_strd, \w, uxtw #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw #1 +.endif +81: + ld1 {v16.8h, v17.8h}, [\src], #32 + ld1 {v20.8h, v21.8h}, [\sr2], #32 + mov \mx, \w + +8: + smull v18.4s, v16.4h, v0.h[0] + smull2 v19.4s, v16.8h, v0.h[0] + smull v22.4s, v20.4h, v0.h[0] + smull2 v23.4s, v20.8h, v0.h[0] +.irpc i, 1234567 + ext v24.16b, v16.16b, v17.16b, #(2*\i) + ext v25.16b, v20.16b, v21.16b, #(2*\i) + smlal v18.4s, v24.4h, v0.h[\i] + smlal2 v19.4s, v24.8h, v0.h[\i] + smlal v22.4s, v25.4h, v0.h[\i] + smlal2 v23.4s, v25.8h, v0.h[\i] +.endr + subs \mx, \mx, #8 + srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) + srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) + srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) + srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + sqxtun v18.4h, v18.4s + sqxtun2 v18.8h, v19.4s + sqxtun v22.4h, v22.4s + sqxtun2 v22.8h, v23.4s + srshl v18.8h, v18.8h, v29.8h // -intermediate_bits + srshl v22.8h, v22.8h, v29.8h // -intermediate_bits + umin v18.8h, v18.8h, v31.8h + umin v22.8h, v22.8h, v31.8h +.else + uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 + uzp1 v22.8h, v22.8h, v23.8h // Ditto + sub v18.8h, v18.8h, v28.8h // PREP_BIAS + sub v22.8h, v22.8h, v28.8h // PREP_BIAS +.endif + st1 {v18.8h}, [\dst], #16 + st1 {v22.8h}, [\ds2], #16 + b.le 9f + + mov v16.16b, v17.16b + mov v20.16b, v21.16b + ld1 {v17.8h}, [\src], #16 + ld1 {v21.8h}, [\sr2], #16 + b 8b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 81b + ret + +L(\type\()_8tap_h_tbl): + .hword L(\type\()_8tap_h_tbl) - 1280b + .hword L(\type\()_8tap_h_tbl) - 640b + .hword L(\type\()_8tap_h_tbl) - 320b + .hword L(\type\()_8tap_h_tbl) - 160b + .hword L(\type\()_8tap_h_tbl) - 80b + .hword L(\type\()_8tap_h_tbl) - 40b + .hword L(\type\()_8tap_h_tbl) - 20b + .hword 0 + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx w10, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w10 +4: + add \xmy, x11, \my, uxtw #3 + +.ifc \type, prep + dup v30.4s, w12 // 6 - intermediate_bits + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + adr x10, L(\type\()_8tap_v_tbl) + ldrh w9, [x10, x9, lsl #1] +.ifc \type, prep + neg v30.4s, v30.4s // -(6-intermediate_bits) +.endif + sub x10, x10, w9, uxtw + br x10 + +20: // 2xN v + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + b.gt 28f + + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + // 2x2 v + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_s v1, v2, v3, v4, v5 + b.gt 24f + smull_smlal_4 
v6, v1, v2, v3, v4 + sqrshrun_h 6, v6 + umin_h v31, .8h, v6 + st_s \d_strd, v6, 2 + ret + +24: // 2x4 v + load_s \sr2, \src, \s_strd, v6, v7 + interleave_1_s v5, v6, v7 + smull_smlal_4 v16, v1, v2, v3, v4 + smull_smlal_4 v17, v3, v4, v5, v6 + sqrshrun_h 6, v16, v17 + umin_h v31, .8h, v16 + st_s \d_strd, v16, 4 + ret + +28: // 2x6, 2x8, 2x12, 2x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 + interleave_1_s v1, v2, v3, v4, v5 + interleave_1_s v5, v6, v7 +216: + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v16, v17, v18, v19 + interleave_1_s v7, v16, v17, v18, v19 + smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 + sqrshrun_h 6, v24, v25 + umin_h v31, .8h, v24 + st_s \d_strd, v24, 4 + b.le 0f + cmp \h, #2 + mov v1.16b, v5.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + mov v4.16b, v16.16b + mov v5.16b, v17.16b + mov v6.16b, v18.16b + mov v7.16b, v19.16b + b.eq 26f + b 216b +26: + load_s \sr2, \src, \s_strd, v16, v17 + interleave_1_s v7, v16, v17 + smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + sqrshrun_h 6, v24 + umin_h v31, .4h, v24 + st_s \d_strd, v24, 2 +0: + ret +.endif + +40: + AARCH64_VALID_JUMP_TARGET + b.gt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + smull_smlal_4 v6, v1, v2, v3, v4 + smull_smlal_4 v7, v2, v3, v4, v5 + shift_store_4 \type, \d_strd, v6, v7 + b.le 0f + load_4h \sr2, \src, \s_strd, v6, v7 + smull_smlal_4 v1, v3, v4, v5, v6 + smull_smlal_4 v2, v4, v5, v6, v7 + shift_store_4 \type, \d_strd, v1, v2 +0: + ret + +480: // 4x6, 4x8, 4x12, 4x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + +48: + subs \h, \h, #4 + load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 + smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_4 \type, \d_strd, v1, v2, v3, v4 + b.le 0f + cmp \h, #2 + mov v16.8b, v20.8b + mov v17.8b, v21.8b + mov v18.8b, v22.8b + mov v19.8b, v23.8b + mov v20.8b, v24.8b + mov v21.8b, v25.8b + mov v22.8b, v26.8b + b.eq 46f + b 48b +46: + load_4h \sr2, \src, \s_strd, v23, v24 + smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_4 \type, \d_strd, v1, v2 +0: + ret + +80: + AARCH64_VALID_JUMP_TARGET + b.gt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + smull_smlal_4 v16, v1, v2, v3, v4 + smull2_smlal2_4 v17, v1, v2, v3, v4 + smull_smlal_4 v18, v2, v3, v4, v5 + smull2_smlal2_4 v19, v2, v3, v4, v5 + shift_store_8 \type, \d_strd, v16, v17, v18, v19 + b.le 0f + load_8h \sr2, 
\src, \s_strd, v6, v7 + smull_smlal_4 v16, v3, v4, v5, v6 + smull2_smlal2_4 v17, v3, v4, v5, v6 + smull_smlal_4 v18, v4, v5, v6, v7 + smull2_smlal2_4 v19, v4, v5, v6, v7 + shift_store_8 \type, \d_strd, v16, v17, v18, v19 +0: + ret + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmy] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + +88: + subs \h, \h, #2 + load_8h \sr2, \src, \s_strd, v23, v24 + smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 + smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.le 9f + subs \h, \h, #2 + load_8h \sr2, \src, \s_strd, v25, v26 + smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 + smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25 + smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 + smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.le 9f + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + ret + +160: + AARCH64_VALID_JUMP_TARGET + b.gt 1680b + + // 16x2, 16x4 v + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + + load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 +16: + load_16h \src, \src, \s_strd, v22, v23 + subs \h, \h, #1 + smull_smlal_4 v1, v16, v18, v20, v22 + smull2_smlal2_4 v2, v16, v18, v20, v22 + smull_smlal_4 v3, v17, v19, v21, v23 + smull2_smlal2_4 v4, v17, v19, v21, v23 + shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 + b.le 0f + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v23.16b + b 16b +0: + ret + +L(\type\()_8tap_v_tbl): + .hword L(\type\()_8tap_v_tbl) - 1280b + .hword L(\type\()_8tap_v_tbl) - 640b + .hword L(\type\()_8tap_v_tbl) - 320b + .hword L(\type\()_8tap_v_tbl) - 160b + .hword L(\type\()_8tap_v_tbl) - 80b + .hword L(\type\()_8tap_v_tbl) - 40b + .hword L(\type\()_8tap_v_tbl) - 20b + .hword 0 + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx w10, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w10 +4: + add \xmy, x11, \my, uxtw #3 + + adr x10, L(\type\()_8tap_hv_tbl) + dup v30.4s, w12 // 6 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v30.4s, v30.4s // -(6-intermediate_bits) +.ifc \type, put + dup v29.4s, w13 // 6 + intermediate_bits +.else + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v29.4s, v29.4s // -(6+intermediate_bits) +.endif + br x10 + +20: + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 280f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + + // 2x2, 2x4 hv + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl 
\s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v27.8h}, [\src], \s_strd + ext v28.16b, v27.16b, v27.16b, #2 + smull v27.4s, v27.4h, v0.4h + smull v28.4s, v28.4h, v0.4h + addp v27.4s, v27.4s, v28.4s + addp v16.4s, v27.4s, v27.4s + srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) + bl L(\type\()_8tap_filter_2) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s + + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b + +2: + bl L(\type\()_8tap_filter_2) + + ext v18.8b, v17.8b, v24.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + umin v2.4h, v2.4h, v31.4h + subs \h, \h, #2 + st1 {v2.s}[0], [\dst], \d_strd + st1 {v2.s}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v24.8b + b 2b + +280: // 2x8, 2x16, 2x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v27.8h}, [\src], \s_strd + ext v28.16b, v27.16b, v27.16b, #2 + smull v27.4s, v27.4h, v0.4h + smull v28.4s, v28.4h, v0.4h + addp v27.4s, v27.4s, v28.4s + addp v16.4s, v27.4s, v27.4s + srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). 
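The comment above is the central trick of the hv paths: the horizontal result is rounded down by (6 - intermediate_bits), which by design keeps it within 16 bits, so it can be narrowed before the vertical multiply. A rough per-sample sketch of the put 2xN/4xN hv flow under that assumption (only the four middle taps, with subpel offsets and edge handling omitted; the helper names are made up for illustration):

    #include <stdint.h>
    #include <stddef.h>

    static inline int32_t rnd(const int32_t v, const int sh) {
        return (v + (1 << (sh - 1))) >> sh;        /* srshl by a negative shift amount */
    }

    /* intermediate_bits = clz(bitdepth_max) - 18: 4 for 10 bpc, 2 for 12 bpc. */
    static uint16_t put_4tap_hv(const uint16_t *src, const ptrdiff_t stride, /* in pixels */
                                const int16_t fh[4], const int16_t fv[4],
                                const int intermediate_bits, const int bitdepth_max)
    {
        int16_t mid[4];                            /* horizontal pass, one value per row */
        for (int y = 0; y < 4; y++) {
            int32_t h = 0;
            for (int x = 0; x < 4; x++)
                h += (int32_t)src[y * stride + x] * fh[x];
            mid[y] = (int16_t)rnd(h, 6 - intermediate_bits);   /* fits in 16 bits, no bias */
        }
        int32_t v = 0;                             /* vertical pass on the narrowed values */
        for (int y = 0; y < 4; y++)
            v += (int32_t)mid[y] * fv[y];
        v = rnd(v, 6 + intermediate_bits);         /* srshl by -(6+intermediate_bits) */
        if (v < 0) v = 0;                          /* sqxtun */
        if (v > bitdepth_max) v = bitdepth_max;    /* umin with v31 */
        return (uint16_t)v;
    }

The prep variants in the surrounding code instead apply a rounding shift by 6 to the vertical sum (rshrn #6) and subtract PREP_BIAS rather than clipping.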
+ + bl L(\type\()_8tap_filter_2) + xtn v16.4h, v16.4s + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b + bl L(\type\()_8tap_filter_2) + ext v18.8b, v17.8b, v24.8b, #4 + mov v19.8b, v24.8b + bl L(\type\()_8tap_filter_2) + ext v20.8b, v19.8b, v24.8b, #4 + mov v21.8b, v24.8b + +28: + bl L(\type\()_8tap_filter_2) + ext v22.8b, v21.8b, v24.8b, #4 + smull v3.4s, v16.4h, v1.h[0] + smlal v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] + smlal v3.4s, v24.4h, v1.h[7] + + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + sqxtun v3.4h, v3.4s + umin v3.4h, v3.4h, v31.4h + subs \h, \h, #2 + st1 {v3.s}[0], [\dst], \d_strd + st1 {v3.s}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v24.8b + b 28b + +0: + ret x15 + +L(\type\()_8tap_filter_2): + ld1 {v25.8h}, [\sr2], \s_strd + ld1 {v27.8h}, [\src], \s_strd + ext v26.16b, v25.16b, v25.16b, #2 + ext v28.16b, v27.16b, v27.16b, #2 + trn1 v24.2s, v25.2s, v27.2s + trn2 v27.2s, v25.2s, v27.2s + trn1 v25.2s, v26.2s, v28.2s + trn2 v28.2s, v26.2s, v28.2s + smull v24.4s, v24.4h, v0.h[0] + smlal v24.4s, v25.4h, v0.h[1] + smlal v24.4s, v27.4h, v0.h[2] + smlal v24.4s, v28.4h, v0.h[3] + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + xtn v24.4h, v24.4s + ret +.endif + +40: + AARCH64_VALID_JUMP_TARGET + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 480f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + // 4x2, 4x4 hv + ld1 {v25.8h}, [\src], \s_strd + ext v26.16b, v25.16b, v25.16b, #2 + ext v27.16b, v25.16b, v25.16b, #4 + ext v28.16b, v25.16b, v25.16b, #6 + smull v25.4s, v25.4h, v0.h[0] + smlal v25.4s, v26.4h, v0.h[1] + smlal v25.4s, v27.4h, v0.h[2] + smlal v25.4s, v28.4h, v0.h[3] + srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). 
+ xtn v16.4h, v16.4s + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v24.8b + mov v18.8b, v25.8b + +4: + bl L(\type\()_8tap_filter_4) + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + smull v3.4s, v17.4h, v1.h[0] + smlal v3.4s, v18.4h, v1.h[1] + smlal v3.4s, v24.4h, v1.h[2] + smlal v3.4s, v25.4h, v1.h[3] +.ifc \type, put + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + umin v2.8h, v2.8h, v31.8h +.else + rshrn v2.4h, v2.4s, #6 + rshrn2 v2.8h, v3.4s, #6 + sub v2.8h, v2.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + + st1 {v2.d}[0], [\dst], \d_strd + st1 {v2.d}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v24.8b + mov v18.8b, v25.8b + b 4b + +480: // 4x8, 4x16, 4x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v25.8h}, [\src], \s_strd + ext v26.16b, v25.16b, v25.16b, #2 + ext v27.16b, v25.16b, v25.16b, #4 + ext v28.16b, v25.16b, v25.16b, #6 + smull v25.4s, v25.4h, v0.h[0] + smlal v25.4s, v26.4h, v0.h[1] + smlal v25.4s, v27.4h, v0.h[2] + smlal v25.4s, v28.4h, v0.h[3] + srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v24.8b + mov v18.8b, v25.8b + bl L(\type\()_8tap_filter_4) + mov v19.8b, v24.8b + mov v20.8b, v25.8b + bl L(\type\()_8tap_filter_4) + mov v21.8b, v24.8b + mov v22.8b, v25.8b + +48: + bl L(\type\()_8tap_filter_4) + smull v3.4s, v16.4h, v1.h[0] + smlal v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] + smlal v3.4s, v24.4h, v1.h[7] + smull v4.4s, v17.4h, v1.h[0] + smlal v4.4s, v18.4h, v1.h[1] + smlal v4.4s, v19.4h, v1.h[2] + smlal v4.4s, v20.4h, v1.h[3] + smlal v4.4s, v21.4h, v1.h[4] + smlal v4.4s, v22.4h, v1.h[5] + smlal v4.4s, v24.4h, v1.h[6] + smlal v4.4s, v25.4h, v1.h[7] +.ifc \type, put + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) + sqxtun v3.4h, v3.4s + sqxtun2 v3.8h, v4.4s + umin v3.8h, v3.8h, v31.8h +.else + rshrn v3.4h, v3.4s, #6 + rshrn2 v3.8h, v4.4s, #6 + sub v3.8h, v3.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + st1 {v3.d}[0], [\dst], \d_strd + st1 {v3.d}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v24.8b + mov v22.8b, v25.8b + b 48b +0: + ret x15 + +L(\type\()_8tap_filter_4): + ld1 {v24.8h}, [\sr2], \s_strd + ld1 {v25.8h}, [\src], \s_strd + ext v26.16b, v24.16b, v24.16b, #2 + ext v27.16b, v24.16b, v24.16b, #4 + ext v28.16b, v24.16b, v24.16b, #6 + smull v24.4s, v24.4h, v0.h[0] + smlal v24.4s, v26.4h, v0.h[1] + smlal v24.4s, v27.4h, v0.h[2] + smlal v24.4s, v28.4h, v0.h[3] + ext v26.16b, v25.16b, v25.16b, #2 + ext v27.16b, v25.16b, v25.16b, #4 + ext v28.16b, v25.16b, v25.16b, #6 + smull v25.4s, v25.4h, v0.h[0] + smlal 
v25.4s, v26.4h, v0.h[1] + smlal v25.4s, v27.4h, v0.h[2] + smlal v25.4s, v28.4h, v0.h[3] + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + xtn v24.4h, v24.4s + xtn v25.4h, v25.4s + ret + +80: +160: +320: + AARCH64_VALID_JUMP_TARGET + b.gt 880f + add \xmy, \xmy, #2 + ld1 {v0.8b}, [\xmx] + ld1 {v1.s}[0], [\xmy] + sub \src, \src, #6 + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v27.8h, v28.8h}, [\src], \s_strd + smull v24.4s, v27.4h, v0.h[0] + smull2 v25.4s, v27.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53), + // and conserves register space (no need to clobber v8-v15). + uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v23.16b + mov v18.16b, v24.16b + +8: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v23.4h, v1.h[2] + smlal2 v5.4s, v23.8h, v1.h[2] + smlal v2.4s, v23.4h, v1.h[3] + smlal2 v3.4s, v23.8h, v1.h[3] + smlal v4.4s, v24.4h, v1.h[3] + smlal2 v5.4s, v24.8h, v1.h[3] +.ifc \type, put + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) + srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h +.else + rshrn v2.4h, v2.4s, #6 + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + sub v2.8h, v2.8h, v29.8h // PREP_BIAS + sub v3.8h, v3.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + st1 {v2.8h}, [\dst], \d_strd + st1 {v3.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b + b 8b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8b}, [\xmx] + ld1 {v1.8b}, [\xmy] + sub \src, \src, #6 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v27.8h, v28.8h}, [\src], \s_strd + smull v24.4s, v27.4h, v0.h[0] + smull2 v25.4s, v27.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v27.16b, v28.16b, #(2*\i) + smlal v24.4s, v26.4h, v0.h[\i] + smlal2 v25.4s, v26.8h, v0.h[\i] +.endr + srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53), + // and conserves register space (no need to clobber v8-v15). + uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v23.16b + mov v18.16b, v24.16b + bl L(\type\()_8tap_filter_8) + mov v19.16b, v23.16b + mov v20.16b, v24.16b + bl L(\type\()_8tap_filter_8) + mov v21.16b, v23.16b + mov v22.16b, v24.16b + +88: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v19.4h, v1.h[2] + smlal2 v5.4s, v19.8h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal v4.4s, v20.4h, v1.h[3] + smlal2 v5.4s, v20.8h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal2 v3.4s, v20.8h, v1.h[4] + smlal v4.4s, v21.4h, v1.h[4] + smlal2 v5.4s, v21.8h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal2 v3.4s, v21.8h, v1.h[5] + smlal v4.4s, v22.4h, v1.h[5] + smlal2 v5.4s, v22.8h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal2 v3.4s, v22.8h, v1.h[6] + smlal v4.4s, v23.4h, v1.h[6] + smlal2 v5.4s, v23.8h, v1.h[6] + smlal v2.4s, v23.4h, v1.h[7] + smlal2 v3.4s, v23.8h, v1.h[7] + smlal v4.4s, v24.4h, v1.h[7] + smlal2 v5.4s, v24.8h, v1.h[7] +.ifc \type, put + srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) + srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) + srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) + srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h +.else + rshrn v2.4h, v2.4s, #6 + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + sub v2.8h, v2.8h, v29.8h // PREP_BIAS + sub v3.8h, v3.8h, v29.8h // PREP_BIAS +.endif + subs \h, \h, #2 + st1 {v2.8h}, [\dst], \d_strd + st1 {v3.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v23.16b + mov v22.16b, v24.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + ret x15 + +L(\type\()_8tap_filter_8): + ld1 {v4.8h, v5.8h}, [\sr2], \s_strd + 
ld1 {v6.8h, v7.8h}, [\src], \s_strd + smull v25.4s, v4.4h, v0.h[0] + smull2 v26.4s, v4.8h, v0.h[0] + smull v27.4s, v6.4h, v0.h[0] + smull2 v28.4s, v6.8h, v0.h[0] +.irpc i, 1234567 + ext v23.16b, v4.16b, v5.16b, #(2*\i) + ext v24.16b, v6.16b, v7.16b, #(2*\i) + smlal v25.4s, v23.4h, v0.h[\i] + smlal2 v26.4s, v23.8h, v0.h[\i] + smlal v27.4s, v24.4h, v0.h[\i] + smlal2 v28.4s, v24.8h, v0.h[\i] +.endr + srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) + srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) + srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) + srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) + uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 + uzp1 v24.8h, v27.8h, v28.8h // Ditto + ret + +L(\type\()_8tap_hv_tbl): + .hword L(\type\()_8tap_hv_tbl) - 1280b + .hword L(\type\()_8tap_hv_tbl) - 640b + .hword L(\type\()_8tap_hv_tbl) - 320b + .hword L(\type\()_8tap_hv_tbl) - 160b + .hword L(\type\()_8tap_hv_tbl) - 80b + .hword L(\type\()_8tap_hv_tbl) - 40b + .hword L(\type\()_8tap_hv_tbl) - 20b + .hword 0 +endfunc + + +function \type\()_bilin_16bpc_neon, export=1 +.ifc \bdmax, w8 + ldr w8, [sp] +.endif + dup v1.8h, \mx + dup v3.8h, \my + mov w10, #16 + sub w9, w10, \mx + sub w10, w10, \my + dup v0.8h, w9 + dup v2.8h, w10 +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz \bdmax, \bdmax // bitdepth_max + clz w9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + mov w11, #4 + sub w9, w9, #24 + sub w11, w11, \bdmax // 4 - intermediate_bits + add w12, \bdmax, #4 // 4 + intermediate_bits + cbnz \mx, L(\type\()_bilin_h) + cbnz \my, L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cbnz \my, L(\type\()_bilin_hv) + + adr x10, L(\type\()_bilin_h_tbl) + dup v31.8h, w11 // 4 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v31.8h, v31.8h // -(4-intermediate_bits) +.ifc \type, put + dup v30.8h, \bdmax // intermediate_bits +.else + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v30.8h, v30.8h // -intermediate_bits +.endif + br x10 + +20: // 2xN h + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + ld1 {v4.4h}, [\src], \s_strd + ld1 {v6.4h}, [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #2 + ext v7.8b, v6.8b, v6.8b, #2 + trn1 v4.2s, v4.2s, v6.2s + trn1 v5.2s, v5.2s, v7.2s + subs \h, \h, #2 + mul v4.4h, v4.4h, v0.4h + mla v4.4h, v5.4h, v1.4h + urshl v4.4h, v4.4h, v31.4h + urshl v4.4h, v4.4h, v30.4h + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + ld1 {v4.8h}, [\src], \s_strd + ld1 {v6.8h}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + trn1 v4.2d, v4.2d, v6.2d + trn1 v5.2d, v5.2d, v7.2d + subs \h, \h, #2 + mul v4.8h, v4.8h, v0.8h + mla v4.8h, v5.8h, v1.8h + urshl v4.8h, v4.8h, v31.8h +.ifc \type, put + urshl v4.8h, v4.8h, v30.8h +.else + sub v4.8h, v4.8h, v29.8h +.endif + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd + b.gt 4b + ret + +80: // 8xN h + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + ldr h5, [\src, #16] + ldr h7, [\sr2, #16] + ld1 {v4.8h}, [\src], \s_strd + ld1 {v6.8h}, [\sr2], \s_strd + ext v5.16b, v4.16b, v5.16b, 
#2 + ext v7.16b, v6.16b, v7.16b, #2 + subs \h, \h, #2 + mul v4.8h, v4.8h, v0.8h + mla v4.8h, v5.8h, v1.8h + mul v6.8h, v6.8h, v0.8h + mla v6.8h, v7.8h, v1.8h + urshl v4.8h, v4.8h, v31.8h + urshl v6.8h, v6.8h, v31.8h +.ifc \type, put + urshl v4.8h, v4.8h, v30.8h + urshl v6.8h, v6.8h, v30.8h +.else + sub v4.8h, v4.8h, v29.8h + sub v6.8h, v6.8h, v29.8h +.endif + st1 {v4.8h}, [\dst], \d_strd + st1 {v6.8h}, [\ds2], \d_strd + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... h + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, uxtw #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw #1 +.endif +161: + ld1 {v16.8h}, [\src], #16 + ld1 {v21.8h}, [\sr2], #16 + mov \mx, \w + +16: + ld1 {v17.8h, v18.8h}, [\src], #32 + ld1 {v22.8h, v23.8h}, [\sr2], #32 + ext v19.16b, v16.16b, v17.16b, #2 + ext v20.16b, v17.16b, v18.16b, #2 + ext v24.16b, v21.16b, v22.16b, #2 + ext v25.16b, v22.16b, v23.16b, #2 + mul v16.8h, v16.8h, v0.8h + mla v16.8h, v19.8h, v1.8h + mul v17.8h, v17.8h, v0.8h + mla v17.8h, v20.8h, v1.8h + mul v21.8h, v21.8h, v0.8h + mla v21.8h, v24.8h, v1.8h + mul v22.8h, v22.8h, v0.8h + mla v22.8h, v25.8h, v1.8h + urshl v16.8h, v16.8h, v31.8h + urshl v17.8h, v17.8h, v31.8h + urshl v21.8h, v21.8h, v31.8h + urshl v22.8h, v22.8h, v31.8h + subs \mx, \mx, #16 +.ifc \type, put + urshl v16.8h, v16.8h, v30.8h + urshl v17.8h, v17.8h, v30.8h + urshl v21.8h, v21.8h, v30.8h + urshl v22.8h, v22.8h, v30.8h +.else + sub v16.8h, v16.8h, v29.8h + sub v17.8h, v17.8h, v29.8h + sub v21.8h, v21.8h, v29.8h + sub v22.8h, v22.8h, v29.8h +.endif + st1 {v16.8h, v17.8h}, [\dst], #32 + st1 {v21.8h, v22.8h}, [\ds2], #32 + b.le 9f + + mov v16.16b, v18.16b + mov v21.16b, v23.16b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_bilin_h_tbl): + .hword L(\type\()_bilin_h_tbl) - 1280b + .hword L(\type\()_bilin_h_tbl) - 640b + .hword L(\type\()_bilin_h_tbl) - 320b + .hword L(\type\()_bilin_h_tbl) - 160b + .hword L(\type\()_bilin_h_tbl) - 80b + .hword L(\type\()_bilin_h_tbl) - 40b + .hword L(\type\()_bilin_h_tbl) - 20b + .hword 0 + + +L(\type\()_bilin_v): + cmp \h, #4 + adr x10, L(\type\()_bilin_v_tbl) +.ifc \type, prep + dup v31.8h, w11 // 4 - intermediate_bits +.endif + ldrh w9, [x10, x9, lsl #1] +.ifc \type, prep + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 + neg v31.8h, v31.8h // -(4-intermediate_bits) +.endif + sub x10, x10, w9, uxtw + br x10 + +20: // 2xN v + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + ld1 {v16.s}[0], [\src], \s_strd + b.gt 24f +22: + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + mul v4.4h, v16.4h, v2.4h + mla v4.4h, v17.4h, v3.4h + urshr v4.8h, v4.8h, #4 + st1 {v4.s}[0], [\dst] + st1 {v4.s}[1], [\ds2] + ret +24: // 2x4, 2x6, 2x8, ... 
v + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + ld1 {v19.s}[0], [\sr2], \s_strd + ld1 {v20.s}[0], [\src], \s_strd + sub \h, \h, #4 + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + trn1 v18.2s, v18.2s, v19.2s + trn1 v19.2s, v19.2s, v20.2s + trn1 v16.2d, v16.2d, v18.2d + trn1 v17.2d, v17.2d, v19.2d + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + cmp \h, #2 + urshr v4.8h, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd + st1 {v4.s}[2], [\dst], \d_strd + st1 {v4.s}[3], [\ds2], \d_strd + b.lt 0f + mov v16.8b, v20.8b + b.eq 22b + b 24b +0: + ret +.endif + +40: // 4xN v + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.4h}, [\src], \s_strd +4: + ld1 {v17.4h}, [\sr2], \s_strd + ld1 {v18.4h}, [\src], \s_strd + trn1 v16.2d, v16.2d, v17.2d + trn1 v17.2d, v17.2d, v18.2d + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + urshr v4.8h, v4.8h, #4 +.else + urshl v4.8h, v4.8h, v31.8h + sub v4.8h, v4.8h, v29.8h +.endif + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + b 4b +0: + ret + +80: // 8xN v + AARCH64_VALID_JUMP_TARGET + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.8h}, [\src], \s_strd +8: + ld1 {v17.8h}, [\sr2], \s_strd + ld1 {v18.8h}, [\src], \s_strd + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v18.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 +.else + urshl v4.8h, v4.8h, v31.8h + urshl v5.8h, v5.8h, v31.8h + sub v4.8h, v4.8h, v29.8h + sub v5.8h, v5.8h, v29.8h +.endif + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd + b.le 0f + mov v16.16b, v18.16b + b 8b +0: + ret + +160: // 16xN, 32xN, ... 
+320: +640: +1280: + AARCH64_VALID_JUMP_TARGET + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v16.8h, v17.8h}, [\src], \s_strd +2: + ld1 {v18.8h, v19.8h}, [\sr2], \s_strd + ld1 {v20.8h, v21.8h}, [\src], \s_strd + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v18.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v19.8h, v3.8h + mul v6.8h, v18.8h, v2.8h + mla v6.8h, v20.8h, v3.8h + mul v7.8h, v19.8h, v2.8h + mla v7.8h, v21.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 + urshr v6.8h, v6.8h, #4 + urshr v7.8h, v7.8h, #4 +.else + urshl v4.8h, v4.8h, v31.8h + urshl v5.8h, v5.8h, v31.8h + urshl v6.8h, v6.8h, v31.8h + urshl v7.8h, v7.8h, v31.8h + sub v4.8h, v4.8h, v29.8h + sub v5.8h, v5.8h, v29.8h + sub v6.8h, v6.8h, v29.8h + sub v7.8h, v7.8h, v29.8h +.endif + st1 {v4.8h, v5.8h}, [\dst], \d_strd + st1 {v6.8h, v7.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v20.16b + mov v17.16b, v21.16b + b 2b +9: + subs \w, \w, #16 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #32 + add \dst, \dst, #32 + b 1b +0: + ret + +L(\type\()_bilin_v_tbl): + .hword L(\type\()_bilin_v_tbl) - 1280b + .hword L(\type\()_bilin_v_tbl) - 640b + .hword L(\type\()_bilin_v_tbl) - 320b + .hword L(\type\()_bilin_v_tbl) - 160b + .hword L(\type\()_bilin_v_tbl) - 80b + .hword L(\type\()_bilin_v_tbl) - 40b + .hword L(\type\()_bilin_v_tbl) - 20b + .hword 0 + +L(\type\()_bilin_hv): + adr x10, L(\type\()_bilin_hv_tbl) + dup v31.8h, w11 // 4 - intermediate_bits + ldrh w9, [x10, x9, lsl #1] + neg v31.8h, v31.8h // -(4-intermediate_bits) +.ifc \type, put + dup v30.4s, w12 // 4 + intermediate_bits +.else + movi v29.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + sub x10, x10, w9, uxtw +.ifc \type, put + neg v30.4s, v30.4s // -(4+intermediate_bits) +.endif + br x10 + +20: // 2xN hv + AARCH64_VALID_JUMP_TARGET +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v20.4h}, [\src], \s_strd + ext v21.8b, v20.8b, v20.8b, #2 + mul v16.4h, v20.4h, v0.4h + mla v16.4h, v21.4h, v1.4h + urshl v16.4h, v16.4h, v31.4h + +2: + ld1 {v22.4h}, [\sr2], \s_strd + ld1 {v24.4h}, [\src], \s_strd + ext v23.8b, v22.8b, v22.8b, #2 + ext v25.8b, v24.8b, v24.8b, #2 + trn1 v22.2s, v22.2s, v24.2s + trn1 v23.2s, v23.2s, v25.2s + mul v17.4h, v22.4h, v0.4h + mla v17.4h, v23.4h, v1.4h + urshl v17.4h, v17.4h, v31.4h + + trn1 v16.2s, v16.2s, v17.2s + + umull v4.4s, v16.4h, v2.4h + umlal v4.4s, v17.4h, v3.4h + urshl v4.4s, v4.4s, v30.4s + xtn v4.4h, v4.4s + subs \h, \h, #2 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2s, v17.2s, v17.2s + b 2b +0: + ret +.endif + +40: // 4xN hv + AARCH64_VALID_JUMP_TARGET + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v20.8h}, [\src], \s_strd + ext v21.16b, v20.16b, v20.16b, #2 + mul v16.4h, v20.4h, v0.4h + mla v16.4h, v21.4h, v1.4h + urshl v16.4h, v16.4h, v31.4h + +4: + ld1 {v22.8h}, [\sr2], \s_strd + ld1 {v24.8h}, [\src], \s_strd + ext v23.16b, v22.16b, v22.16b, #2 + ext v25.16b, v24.16b, v24.16b, #2 + trn1 v22.2d, v22.2d, v24.2d + trn1 v23.2d, v23.2d, v25.2d + mul v17.8h, v22.8h, v0.8h + mla v17.8h, v23.8h, v1.8h + urshl v17.8h, v17.8h, v31.8h + + trn1 v16.2d, v16.2d, v17.2d + + umull v4.4s, v16.4h, 
v2.4h + umlal v4.4s, v17.4h, v3.4h + umull2 v5.4s, v16.8h, v2.8h + umlal2 v5.4s, v17.8h, v3.8h +.ifc \type, put + urshl v4.4s, v4.4s, v30.4s + urshl v5.4s, v5.4s, v30.4s + uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 +.else + rshrn v4.4h, v4.4s, #4 + rshrn2 v4.8h, v5.4s, #4 + sub v4.8h, v4.8h, v29.8h +.endif + subs \h, \h, #2 + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2d, v17.2d, v17.2d + b 4b +0: + ret + +80: // 8xN, 16xN, ... hv +160: +320: +640: +1280: + AARCH64_VALID_JUMP_TARGET + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ldr h21, [\src, #16] + ld1 {v20.8h}, [\src], \s_strd + ext v21.16b, v20.16b, v21.16b, #2 + mul v16.8h, v20.8h, v0.8h + mla v16.8h, v21.8h, v1.8h + urshl v16.8h, v16.8h, v31.8h + +2: + ldr h23, [\sr2, #16] + ld1 {v22.8h}, [\sr2], \s_strd + ldr h25, [\src, #16] + ld1 {v24.8h}, [\src], \s_strd + ext v23.16b, v22.16b, v23.16b, #2 + ext v25.16b, v24.16b, v25.16b, #2 + mul v17.8h, v22.8h, v0.8h + mla v17.8h, v23.8h, v1.8h + mul v18.8h, v24.8h, v0.8h + mla v18.8h, v25.8h, v1.8h + urshl v17.8h, v17.8h, v31.8h + urshl v18.8h, v18.8h, v31.8h + + umull v4.4s, v16.4h, v2.4h + umlal v4.4s, v17.4h, v3.4h + umull2 v5.4s, v16.8h, v2.8h + umlal2 v5.4s, v17.8h, v3.8h + umull v6.4s, v17.4h, v2.4h + umlal v6.4s, v18.4h, v3.4h + umull2 v7.4s, v17.8h, v2.8h + umlal2 v7.4s, v18.8h, v3.8h +.ifc \type, put + urshl v4.4s, v4.4s, v30.4s + urshl v5.4s, v5.4s, v30.4s + urshl v6.4s, v6.4s, v30.4s + urshl v7.4s, v7.4s, v30.4s + uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 + uzp1 v5.8h, v6.8h, v7.8h // Ditto +.else + rshrn v4.4h, v4.4s, #4 + rshrn2 v4.8h, v5.4s, #4 + rshrn v5.4h, v6.4s, #4 + rshrn2 v5.8h, v7.4s, #4 + sub v4.8h, v4.8h, v29.8h + sub v5.8h, v5.8h, v29.8h +.endif + subs \h, \h, #2 + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 1b +0: + ret + +L(\type\()_bilin_hv_tbl): + .hword L(\type\()_bilin_hv_tbl) - 1280b + .hword L(\type\()_bilin_hv_tbl) - 640b + .hword L(\type\()_bilin_hv_tbl) - 320b + .hword L(\type\()_bilin_hv_tbl) - 160b + .hword L(\type\()_bilin_hv_tbl) - 80b + .hword L(\type\()_bilin_hv_tbl) - 40b + .hword L(\type\()_bilin_hv_tbl) - 20b + .hword 0 +endfunc +.endm + +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 +filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 + +.macro load_filter_row dst, src, inc + asr w13, \src, #10 + add \src, \src, \inc + ldr \dst, [x11, w13, sxtw #3] +.endm + +function warp_filter_horz_neon + add w12, w5, #512 + + ld1 {v16.8h, v17.8h}, [x2], x3 + + load_filter_row d0, w12, w7 + load_filter_row d1, w12, w7 + load_filter_row d2, w12, w7 + sxtl v0.8h, v0.8b + load_filter_row d3, w12, w7 + sxtl v1.8h, v1.8b + load_filter_row d4, w12, w7 + sxtl v2.8h, v2.8b + load_filter_row d5, w12, w7 + sxtl v3.8h, v3.8b + load_filter_row d6, w12, w7 + sxtl v4.8h, v4.8b + load_filter_row d7, w12, w7 + sxtl v5.8h, v5.8b + ext v18.16b, v16.16b, v17.16b, #2*1 + smull v8.4s, v16.4h, v0.4h + smull2 v9.4s, v16.8h, v0.8h + sxtl v6.8h, v6.8b + ext v19.16b, v16.16b, v17.16b, #2*2 + smull v10.4s, v18.4h, v1.4h + smull2 v11.4s, v18.8h, v1.8h + sxtl v7.8h, v7.8b + ext v20.16b, v16.16b, v17.16b, #2*3 + smull v0.4s, 
v19.4h, v2.4h + smull2 v1.4s, v19.8h, v2.8h + ext v21.16b, v16.16b, v17.16b, #2*4 + addp v8.4s, v8.4s, v9.4s + smull v2.4s, v20.4h, v3.4h + smull2 v3.4s, v20.8h, v3.8h + ext v22.16b, v16.16b, v17.16b, #2*5 + addp v9.4s, v10.4s, v11.4s + smull v10.4s, v21.4h, v4.4h + smull2 v11.4s, v21.8h, v4.8h + ext v23.16b, v16.16b, v17.16b, #2*6 + addp v0.4s, v0.4s, v1.4s + smull v18.4s, v22.4h, v5.4h + smull2 v19.4s, v22.8h, v5.8h + ext v16.16b, v16.16b, v17.16b, #2*7 + addp v1.4s, v2.4s, v3.4s + addp v2.4s, v10.4s, v11.4s + smull v20.4s, v23.4h, v6.4h + smull2 v21.4s, v23.8h, v6.8h + addp v3.4s, v18.4s, v19.4s + smull v22.4s, v16.4h, v7.4h + smull2 v23.4s, v16.8h, v7.8h + addp v4.4s, v20.4s, v21.4s + addp v5.4s, v22.4s, v23.4s + + addp v8.4s, v8.4s, v9.4s + addp v0.4s, v0.4s, v1.4s + addp v2.4s, v2.4s, v3.4s + addp v4.4s, v4.4s, v5.4s + + addp v16.4s, v8.4s, v0.4s + addp v17.4s, v2.4s, v4.4s + + add w5, w5, w8 + + srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits) + srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits) + + ret +endfunc + +// void dav1d_warp_affine_8x8_16bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my, +// const int bitdepth_max) +.macro warp t +function warp_affine_8x8\t\()_16bpc_neon, export=1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + +.ifb \t + dup v15.8h, w7 // bitdepth_max +.else + movi v15.8h, #(PREP_BIAS >> 8), lsl #8 +.endif + clz w7, w7 + // intermediate_bits = clz(bitdepth_max) - 18 +.ifb \t + sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 +.endif + sub w7, w7, #25 // -(7 - intermediate_bits) +.ifb \t + neg w8, w8 // -(7 + intermediate_bits) +.endif + dup v14.4s, w7 // -(7 - intermediate_bits) +.ifb \t + dup v13.4s, w8 // -(7 + intermediate_bits) +.endif + + ldr x4, [x4] + sbfx x7, x4, #0, #16 + sbfx x8, x4, #16, #16 + sbfx x9, x4, #32, #16 + sbfx x4, x4, #48, #16 + mov w10, #8 + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + sub x2, x2, #6 + movrel x11, X(mc_warp_filter), 64*8 + mov x15, x30 +.ifnb \t + lsl x1, x1, #1 +.endif + + bl warp_filter_horz_neon + uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2 + bl warp_filter_horz_neon + uzp1 v25.8h, v16.8h, v17.8h // Ditto + bl warp_filter_horz_neon + uzp1 v26.8h, v16.8h, v17.8h // Ditto + bl warp_filter_horz_neon + uzp1 v27.8h, v16.8h, v17.8h // Ditto + bl warp_filter_horz_neon + uzp1 v28.8h, v16.8h, v17.8h // Ditto + bl warp_filter_horz_neon + uzp1 v29.8h, v16.8h, v17.8h // Ditto + bl warp_filter_horz_neon + uzp1 v30.8h, v16.8h, v17.8h // Ditto + +1: + add w14, w6, #512 + bl warp_filter_horz_neon + uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2 + + load_filter_row d0, w14, w9 + load_filter_row d1, w14, w9 + load_filter_row d2, w14, w9 + load_filter_row d3, w14, w9 + load_filter_row d4, w14, w9 + load_filter_row d5, w14, w9 + load_filter_row d6, w14, w9 + load_filter_row d7, w14, w9 + transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl + + // This ordering of smull/smlal/smull2/smlal2 is highly + // beneficial for Cortex A53 here. 
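In the block below, v24-v31 hold the eight horizontally filtered rows and, after the transpose, v0-v7 hold one vertical tap each across all eight columns (every output column gets its own filter, selected from the warp filter table by a position that advances by one of the abcd deltas). A scalar sketch of the accumulation that the interleaved smull/smlal(2) chain performs, with illustrative names only:

    #include <stdint.h>

    /* mid[t][x]: horizontally filtered row t, column x (both 0..7);
     * tap[t][x]: vertical coefficient t for output column x.
     * Only the accumulation is shown; rounding and clipping follow as in the code. */
    static void warp_v_accum(const int16_t mid[8][8], const int16_t tap[8][8],
                             int32_t sum[8])
    {
        for (int x = 0; x < 8; x++) {
            sum[x] = 0;
            for (int t = 0; t < 8; t++)
                sum[x] += (int32_t)mid[t][x] * tap[t][x];
        }
    }

The put variant then applies a rounding shift by 7 + intermediate_bits, saturates with sqxtun and clamps to bitdepth_max; the prep variant shifts by 7 and subtracts PREP_BIAS.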
+ smull v16.4s, v24.4h, v0.4h + smlal v16.4s, v25.4h, v1.4h + smlal v16.4s, v26.4h, v2.4h + smlal v16.4s, v27.4h, v3.4h + smlal v16.4s, v28.4h, v4.4h + smlal v16.4s, v29.4h, v5.4h + smlal v16.4s, v30.4h, v6.4h + smlal v16.4s, v31.4h, v7.4h + smull2 v17.4s, v24.8h, v0.8h + smlal2 v17.4s, v25.8h, v1.8h + smlal2 v17.4s, v26.8h, v2.8h + smlal2 v17.4s, v27.8h, v3.8h + smlal2 v17.4s, v28.8h, v4.8h + smlal2 v17.4s, v29.8h, v5.8h + smlal2 v17.4s, v30.8h, v6.8h + smlal2 v17.4s, v31.8h, v7.8h + + mov v24.16b, v25.16b + mov v25.16b, v26.16b +.ifb \t + srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits) + srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits) +.else + rshrn v16.4h, v16.4s, #7 + rshrn2 v16.8h, v17.4s, #7 +.endif + mov v26.16b, v27.16b +.ifb \t + sqxtun v16.4h, v16.4s + sqxtun2 v16.8h, v17.4s +.else + sub v16.8h, v16.8h, v15.8h // PREP_BIAS +.endif + mov v27.16b, v28.16b + mov v28.16b, v29.16b +.ifb \t + umin v16.8h, v16.8h, v15.8h // bitdepth_max +.endif + mov v29.16b, v30.16b + mov v30.16b, v31.16b + subs w10, w10, #1 + st1 {v16.8h}, [x0], x1 + + add w6, w6, w4 + b.gt 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + + ret x15 +endfunc +.endm + +warp +warp t + +// void dav1d_emu_edge_16bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_16bpc_neon, export=1 + ldp x8, x9, [sp] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub x12, x3, #1 // ih - 1 + cmp x5, x3 + sub x13, x2, #1 // iw - 1 + csel x12, x12, x5, ge // min(y, ih - 1) + cmp x4, x2 + bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) + csel x13, x13, x4, ge // min(x, iw - 1) + bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) + madd x8, x12, x9, x8 // ref += iclip() * stride + add x8, x8, x13, lsl #1 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add x10, x5, x1 // y + bh + neg x5, x5 // -y + sub x10, x10, x3 // y + bh - ih + sub x12, x1, #1 // bh - 1 + cmp x10, x1 + bic x5, x5, x5, asr #63 // max(-y, 0) + csel x10, x10, x12, lt // min(y + bh - ih, bh-1) + cmp x5, x1 + bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) + csel x5, x5, x12, lt // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add x11, x4, x0 // x + bw + neg x4, x4 // -x + sub x11, x11, x2 // x + bw - iw + sub x13, x0, #1 // bw - 1 + cmp x11, x0 + bic x4, x4, x4, asr #63 // max(-x, 0) + csel x11, x11, x13, lt // min(x + bw - iw, bw-1) + cmp x4, x0 + bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) + csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub x1, x1, x5 // bh - top_ext + madd x6, x5, x7, x6 + sub x2, x0, x4 // bw - left_ext + sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext + sub x2, x2, x11 // center_w = bw - left_ext - right_ext + + mov x14, x6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + ld1r {v0.8h}, [x8] + mov x12, x6 // out = dst + mov x3, x4 + mov v1.16b, v0.16b +1: + subs x3, x3, #16 + st1 {v0.8h, v1.8h}, [x12], #32 + b.gt 1b +.endif + mov x13, x8 + add x12, x6, x4, lsl #1 // out = dst + left_ext + mov x3, x2 +1: + ld1 
{v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 + subs x3, x3, #32 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64 + b.gt 1b +.if \need_right + add x3, x8, x2, lsl #1 // in + center_w + sub x3, x3, #2 // in + center_w - 1 + add x12, x6, x4, lsl #1 // dst + left_ext + ld1r {v0.8h}, [x3] + add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w + mov x3, x11 + mov v1.16b, v0.16b +1: + subs x3, x3, #16 + st1 {v0.8h, v1.8h}, [x12], #32 + b.gt 1b +.endif + + subs x1, x1, #1 // center_h-- + add x6, x6, x7 + add x8, x8, x9 + b.gt 0b +.endm + + cbz x4, 2f + // need_left + cbz x11, 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cbz x11, 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + + cbz x10, 3f + // need_bottom + sub x8, x6, x7 // ref = dst - stride + mov x4, x0 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 + mov x3, x10 +2: + subs x3, x3, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 + b.gt 2b + msub x6, x7, x10, x6 // dst -= bottom_ext * stride + subs x4, x4, #32 // bw -= 32 + add x6, x6, #64 // dst += 32 + b.gt 1b + +3: + cbz x5, 3f + // need_top + msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 + mov x3, x5 +2: + subs x3, x3, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 + b.gt 2b + msub x6, x7, x5, x6 // dst -= top_ext * stride + subs x0, x0, #32 // bw -= 32 + add x6, x6, #64 // dst += 32 + b.gt 1b + +3: + ret +endfunc diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S new file mode 100644 index 0000000000..3a6cf900a9 --- /dev/null +++ b/third_party/dav1d/src/arm/64/msac.S @@ -0,0 +1,480 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +#define BUF_POS 0 +#define BUF_END 8 +#define DIF 16 +#define RNG 24 +#define CNT 28 +#define ALLOW_UPDATE_CDF 32 + +const coeffs + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 + .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +endconst + +const bits + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 +endconst + +.macro ld1_n d0, d1, src, sz, n +.if \n <= 8 + ld1 {\d0\sz}, [\src] +.else + ld1 {\d0\sz, \d1\sz}, [\src] +.endif +.endm + +.macro st1_n s0, s1, dst, sz, n +.if \n <= 8 + st1 {\s0\sz}, [\dst] +.else + st1 {\s0\sz, \s1\sz}, [\dst] +.endif +.endm + +.macro ushr_n d0, d1, s0, s1, shift, sz, n + ushr \d0\sz, \s0\sz, \shift +.if \n == 16 + ushr \d1\sz, \s1\sz, \shift +.endif +.endm + +.macro add_n d0, d1, s0, s1, s2, s3, sz, n + add \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + add \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro sub_n d0, d1, s0, s1, s2, s3, sz, n + sub \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + sub \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro and_n d0, d1, s0, s1, s2, s3, sz, n + and \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + and \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n + cmhs \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + cmhs \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n + urhadd \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + urhadd \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n + sshl \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + sshl \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n + sqdmulh \d0\sz, \s0\sz, \s2\sz +.if \n == 16 + sqdmulh \d1\sz, \s1\sz, \s3\sz +.endif +.endm + +.macro str_n idx0, idx1, dstreg, dstoff, n + str \idx0, [\dstreg, \dstoff] +.if \n == 16 + str \idx1, [\dstreg, \dstoff + 16] +.endif +.endm + +// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, +// size_t n_symbols); + +function msac_decode_symbol_adapt4_neon, export=1 +.macro decode_update sz, szb, n + sub sp, sp, #48 + add x8, x0, #RNG + ld1_n v0, v1, x1, \sz, \n // cdf + ld1r {v4\sz}, [x8] // rng + movrel x9, coeffs, 30 + movi v31\sz, #0x7f, lsl #8 // 0x7f00 + sub x9, x9, x2, lsl #1 + mvni v30\sz, #0x3f // 0xffc0 + and v7\szb, v4\szb, v31\szb // rng & 0x7f00 + str h4, [sp, #14] // store original u = s->rng + and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0 + + ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret) + sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add x8, x0, #DIF + 6 + + add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) + add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + + ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16) + movrel x8, bits + str_n q4, q5, sp, #16, \n // store v values to allow indexed access + + ld1_n v16, v17, x8, .8h, \n + + cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v + + and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask +.if \n == 16 + add v6.8h, v6.8h, v7.8h +.endif + addv h6, v6.8h // Aggregate mask bits + ldr w4, [x0, #ALLOW_UPDATE_CDF] + umov w3, v6.h[0] + rbit w3, w3 + clz w15, w3 // ret + + cbz w4, L(renorm) + // update_cdf + ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols] + movi v5\szb, #0xff +.if \n == 16 + mov w4, #-5 +.else + mvn w14, w2 + mov w4, #-4 
+ cmn w14, #3 // set C if n_symbols <= 2 +.endif + urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768 +.if \n == 16 + sub w4, w4, w3, lsr #4 // -((count >> 4) + 5) +.else + lsr w14, w3, #4 // count >> 4 + sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) +.endif + sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) + dup v6\sz, w4 // -rate + + sub w3, w3, w3, lsr #5 // count - (count == 32) + sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0) + sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate + add w3, w3, #1 // count + (count < 32) + add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate + st1_n v0, v1, x1, \sz, \n + strh w3, [x1, x2, lsl #1] +.endm + + decode_update .4h, .8b, 4 + +L(renorm): + add x8, sp, #16 + add x8, x8, w15, uxtw #1 + ldrh w3, [x8] // v + ldurh w4, [x8, #-2] // u + ldr w6, [x0, #CNT] + ldr x7, [x0, #DIF] + sub w4, w4, w3 // rng = u - v + clz w5, w4 // clz(rng) + eor w5, w5, #16 // d = clz(rng) ^ 16 + mvn x7, x7 // ~dif + add x7, x7, x3, lsl #48 // ~dif + (v << 48) +L(renorm2): + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (~dif + (v << 48)) << d + str w4, [x0, #RNG] + mvn x7, x7 // ~dif + b.hs 9f + + // refill + ldp x3, x4, [x0] // BUF_POS, BUF_END + add x5, x3, #8 + cmp x5, x4 + b.gt 2f + + ldr x3, [x3] // next_bits + add w8, w6, #23 // shift_bits = cnt + 23 + add w6, w6, #16 // cnt += 16 + rev x3, x3 // next_bits = bswap(next_bits) + sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 + and w8, w8, #24 // shift_bits &= 24 + lsr x3, x3, x8 // next_bits >>= shift_bits + sub w8, w8, w6 // shift_bits -= 16 + cnt + str x5, [x0, #BUF_POS] + lsl x3, x3, x8 // next_bits <<= shift_bits + mov w4, #48 + sub w6, w4, w8 // cnt = cnt + 64 - shift_bits + eor x7, x7, x3 // dif ^= next_bits + b 9f + +2: // refill_eob + mov w14, #40 + sub w5, w14, w6 // c = 40 - cnt +3: + cmp x3, x4 + b.ge 4f + ldrb w8, [x3], #1 + lsl x8, x8, x5 + eor x7, x7, x8 + subs w5, w5, #8 + b.ge 3b + +4: // refill_eob_end + str x3, [x0, #BUF_POS] + sub w6, w14, w5 // cnt = 40 - c + +9: + str w6, [x0, #CNT] + str x7, [x0, #DIF] + + mov w0, w15 + add sp, sp, #48 + ret +endfunc + +function msac_decode_symbol_adapt8_neon, export=1 + decode_update .8h, .16b, 8 + b L(renorm) +endfunc + +function msac_decode_symbol_adapt16_neon, export=1 + decode_update .8h, .16b, 16 + b L(renorm) +endfunc + +function msac_decode_hi_tok_neon, export=1 + ld1 {v0.4h}, [x1] // cdf + add x16, x0, #RNG + movi v31.4h, #0x7f, lsl #8 // 0x7f00 + movrel x17, coeffs, 30-2*3 + mvni v30.4h, #0x3f // 0xffc0 + ldrh w9, [x1, #6] // count = cdf[n_symbols] + ld1r {v3.4h}, [x16] // rng + movrel x16, bits + ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) + add x17, x0, #DIF + 6 + ld1 {v16.8h}, [x16] + mov w13, #-24 + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + ldr w10, [x0, #ALLOW_UPDATE_CDF] + ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr w6, [x0, #CNT] + ldr x7, [x0, #DIF] +1: + and v7.8b, v3.8b, v31.8b // rng & 0x7f00 + sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) + add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + str h3, [sp, #14] // store original u = s->rng + cmhs v2.8h, v1.8h, v4.8h // c >= v + str q4, [sp, #16] // store v values to allow indexed access + and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask + addv 
h6, v6.8h // Aggregate mask bits + umov w3, v6.h[0] + add w13, w13, #5 + rbit w3, w3 + add x8, sp, #16 + clz w15, w3 // ret + + cbz w10, 2f + // update_cdf + movi v5.8b, #0xff + mov w4, #-5 + urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768 + sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) + sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) + dup v6.4h, w4 // -rate + + sub w9, w9, w9, lsr #5 // count - (count == 32) + sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0) + sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate + add w9, w9, #1 // count + (count < 32) + add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate + st1 {v0.4h}, [x1] + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + strh w9, [x1, #6] + +2: + add x8, x8, w15, uxtw #1 + ldrh w3, [x8] // v + ldurh w4, [x8, #-2] // u + sub w4, w4, w3 // rng = u - v + clz w5, w4 // clz(rng) + eor w5, w5, #16 // d = clz(rng) ^ 16 + mvn x7, x7 // ~dif + add x7, x7, x3, lsl #48 // ~dif + (v << 48) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (~dif + (v << 48)) << d + str w4, [x0, #RNG] + dup v3.4h, w4 + mvn x7, x7 // ~dif + b.hs 9f + + // refill + ldp x3, x4, [x0] // BUF_POS, BUF_END + add x5, x3, #8 + cmp x5, x4 + b.gt 2f + + ldr x3, [x3] // next_bits + add w8, w6, #23 // shift_bits = cnt + 23 + add w6, w6, #16 // cnt += 16 + rev x3, x3 // next_bits = bswap(next_bits) + sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 + and w8, w8, #24 // shift_bits &= 24 + lsr x3, x3, x8 // next_bits >>= shift_bits + sub w8, w8, w6 // shift_bits -= 16 + cnt + str x5, [x0, #BUF_POS] + lsl x3, x3, x8 // next_bits <<= shift_bits + mov w4, #48 + sub w6, w4, w8 // cnt = cnt + 64 - shift_bits + eor x7, x7, x3 // dif ^= next_bits + b 9f + +2: // refill_eob + mov w14, #40 + sub w5, w14, w6 // c = 40 - cnt +3: + cmp x3, x4 + b.ge 4f + ldrb w8, [x3], #1 + lsl x8, x8, x5 + eor x7, x7, x8 + subs w5, w5, #8 + b.ge 3b + +4: // refill_eob_end + str x3, [x0, #BUF_POS] + sub w6, w14, w5 // cnt = 40 - c + +9: + lsl w15, w15, #1 + sub w15, w15, #5 + lsr x12, x7, #48 + adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 + dup v1.8h, w12 + b.cc 1b // loop if !carry + add w13, w13, #30 + str w6, [x0, #CNT] + add sp, sp, #48 + str x7, [x0, #DIF] + lsr w0, w13, #1 + ret +endfunc + +function msac_decode_bool_equi_neon, export=1 + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + bic w4, w5, #0xff // r &= 0xff00 + add w4, w4, #8 + subs x8, x7, x4, lsl #47 // dif - vw + lsr w4, w4, #1 // v + sub w5, w5, w4 // r - v + cset w15, lo + csel w4, w5, w4, hs // if (ret) v = r - v; + csel x7, x8, x7, hs // if (ret) dif = dif - vw; + + clz w5, w4 // clz(rng) + mvn x7, x7 // ~dif + eor w5, w5, #16 // d = clz(rng) ^ 16 + b L(renorm2) +endfunc + +function msac_decode_bool_neon, export=1 + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + lsr w4, w5, #8 // r >> 8 + bic w1, w1, #0x3f // f &= ~63 + mul w4, w4, w1 + lsr w4, w4, #7 + add w4, w4, #4 // v + subs x8, x7, x4, lsl #48 // dif - vw + sub w5, w5, w4 // r - v + cset w15, lo + csel w4, w5, w4, hs // if (ret) v = r - v; + csel x7, x8, x7, hs // if (ret) dif = dif - vw; + + clz w5, w4 // clz(rng) + mvn x7, x7 // ~dif + eor w5, w5, #16 // d = clz(rng) ^ 16 + b L(renorm2) +endfunc + +function msac_decode_bool_adapt_neon, export=1 + ldr w9, [x1] // cdf[0-1] + ldp w5, w6, [x0, #RNG] // + CNT + sub sp, sp, #48 + ldr x7, [x0, #DIF] + lsr w4, w5, #8 // r >> 8 + and w2, w9, #0xffc0 // f &= ~63 + mul w4, w4, w2 + lsr w4, w4, #7 + add w4, w4, #4 // v + 
subs x8, x7, x4, lsl #48 // dif - vw + sub w5, w5, w4 // r - v + cset w15, lo + csel w4, w5, w4, hs // if (ret) v = r - v; + csel x7, x8, x7, hs // if (ret) dif = dif - vw; + + ldr w10, [x0, #ALLOW_UPDATE_CDF] + + clz w5, w4 // clz(rng) + mvn x7, x7 // ~dif + eor w5, w5, #16 // d = clz(rng) ^ 16 + + cbz w10, L(renorm2) + + lsr w2, w9, #16 // count = cdf[1] + and w9, w9, #0xffff // cdf[0] + + sub w3, w2, w2, lsr #5 // count - (count >= 32) + lsr w2, w2, #4 // count >> 4 + add w10, w3, #1 // count + (count < 32) + add w2, w2, #4 // rate = (count >> 4) | 4 + + sub w9, w9, w15 // cdf[0] -= bit + sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769} + asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate + sub w9, w9, w11 // cdf[0] + + strh w9, [x1] + strh w10, [x1, #2] + + b L(renorm2) +endfunc diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S new file mode 100644 index 0000000000..becd4c08f6 --- /dev/null +++ b/third_party/dav1d/src/arm/64/refmvs.S @@ -0,0 +1,91 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv, +// int bx4, int bw4, int bh4) + +function splat_mv_neon, export=1 + ld1 {v3.16b}, [x1] + clz w3, w3 + adr x5, L(splat_tbl) + sub w3, w3, #26 + ext v2.16b, v3.16b, v3.16b, #12 + ldrh w3, [x5, w3, uxtw #1] + add w2, w2, w2, lsl #1 + ext v0.16b, v2.16b, v3.16b, #4 + sub x3, x5, w3, uxtw + ext v1.16b, v2.16b, v3.16b, #8 + lsl w2, w2, #2 + ext v2.16b, v2.16b, v3.16b, #12 +1: + ldr x1, [x0], #8 + subs w4, w4, #1 + add x1, x1, x2 + br x3 + +10: + AARCH64_VALID_JUMP_TARGET + st1 {v0.8b}, [x1] + str s2, [x1, #8] + b.gt 1b + ret +20: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b}, [x1] + str d1, [x1, #16] + b.gt 1b + ret +320: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 +160: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 +80: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 +40: + AARCH64_VALID_JUMP_TARGET + st1 {v0.16b, v1.16b, v2.16b}, [x1] + b.gt 1b + ret + +L(splat_tbl): + .hword L(splat_tbl) - 320b + .hword L(splat_tbl) - 160b + .hword L(splat_tbl) - 80b + .hword L(splat_tbl) - 40b + .hword L(splat_tbl) - 20b + .hword L(splat_tbl) - 10b +endfunc diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S new file mode 100644 index 0000000000..9013fd4b1e --- /dev/null +++ b/third_party/dav1d/src/arm/64/util.S @@ -0,0 +1,229 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2015 Martin Storsjo + * Copyright © 2015 Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#ifndef DAV1D_SRC_ARM_64_UTIL_S +#define DAV1D_SRC_ARM_64_UTIL_S + +#include "config.h" +#include "src/arm/asm.S" + +.macro movrel rd, val, offset=0 +#if defined(__APPLE__) + .if \offset < 0 + adrp \rd, \val@PAGE + add \rd, \rd, \val@PAGEOFF + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset)@PAGE + add \rd, \rd, \val+(\offset)@PAGEOFF + .endif +#elif defined(PIC) && defined(_WIN32) + .if \offset < 0 + adrp \rd, \val + add \rd, \rd, :lo12:\val + sub \rd, \rd, -(\offset) + .else + adrp \rd, \val+(\offset) + add \rd, \rd, :lo12:\val+(\offset) + .endif +#elif defined(PIC) + adrp \rd, \val+(\offset) + add \rd, \rd, :lo12:\val+(\offset) +#else + ldr \rd, =\val+\offset +#endif +.endm + +.macro sub_sp space +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl + // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 + zip1 \r0\().16b, \r0\().16b, \r1\().16b + // c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7 + zip1 \r2\().16b, \r2\().16b, \r3\().16b + // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7 + zip1 \r4\().16b, \r4\().16b, \r5\().16b + // g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7 + zip1 \r6\().16b, \r6\().16b, \r7\().16b + + // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6 + trn1 \r1\().8h, \r0\().8h, \r2\().8h + // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7 + trn2 \r3\().8h, \r0\().8h, \r2\().8h + // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6 + trn1 \r5\().8h, \r4\().8h, \r6\().8h + // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7 + trn2 \r7\().8h, \r4\().8h, \r6\().8h + + // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4 + trn1 \r0\().4s, \r1\().4s, \r5\().4s + // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6 + trn2 \r2\().4s, \r1\().4s, \r5\().4s + // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5 + trn1 \r1\().4s, \r3\().4s, \r7\().4s + // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7 + trn2 \r3\().4s, \r3\().4s, \r7\().4s + + \xtl\()2 \r4\().8h, \r0\().16b + \xtl \r0\().8h, \r0\().8b + \xtl\()2 \r6\().8h, \r2\().16b + \xtl \r2\().8h, \r2\().8b + \xtl\()2 \r5\().8h, \r1\().16b + \xtl \r1\().8h, \r1\().8b + \xtl\()2 \r7\().8h, \r3\().16b + \xtl \r3\().8h, \r3\().8b +.endm + +.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 + trn1 \t8\().8h, \r0\().8h, \r1\().8h + trn2 \t9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \t9\().4s, \r3\().4s + trn2 \t9\().4s, \t9\().4s, \r3\().4s + trn1 \r3\().4s, \t8\().4s, \r1\().4s + trn2 \t8\().4s, \t8\().4s, \r1\().4s + + trn1 \r0\().2d, \r3\().2d, \r4\().2d + trn2 \r4\().2d, \r3\().2d, \r4\().2d + trn1 \r1\().2d, \r5\().2d, \r6\().2d + trn2 \r5\().2d, \r5\().2d, 
\r6\().2d + trn2 \r6\().2d, \t8\().2d, \r2\().2d + trn1 \r2\().2d, \t8\().2d, \r2\().2d + trn1 \r3\().2d, \t9\().2d, \r7\().2d + trn2 \r7\().2d, \t9\().2d, \r7\().2d +.endm + +.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 + trn1 \t8\().16b, \r0\().16b, \r1\().16b + trn2 \t9\().16b, \r0\().16b, \r1\().16b + trn1 \r1\().16b, \r2\().16b, \r3\().16b + trn2 \r3\().16b, \r2\().16b, \r3\().16b + trn1 \r0\().16b, \r4\().16b, \r5\().16b + trn2 \r5\().16b, \r4\().16b, \r5\().16b + trn1 \r2\().16b, \r6\().16b, \r7\().16b + trn2 \r7\().16b, \r6\().16b, \r7\().16b + + trn1 \r4\().8h, \r0\().8h, \r2\().8h + trn2 \r2\().8h, \r0\().8h, \r2\().8h + trn1 \r6\().8h, \r5\().8h, \r7\().8h + trn2 \r7\().8h, \r5\().8h, \r7\().8h + trn1 \r5\().8h, \t9\().8h, \r3\().8h + trn2 \t9\().8h, \t9\().8h, \r3\().8h + trn1 \r3\().8h, \t8\().8h, \r1\().8h + trn2 \t8\().8h, \t8\().8h, \r1\().8h + + trn1 \r0\().4s, \r3\().4s, \r4\().4s + trn2 \r4\().4s, \r3\().4s, \r4\().4s + trn1 \r1\().4s, \r5\().4s, \r6\().4s + trn2 \r5\().4s, \r5\().4s, \r6\().4s + trn2 \r6\().4s, \t8\().4s, \r2\().4s + trn1 \r2\().4s, \t8\().4s, \r2\().4s + trn1 \r3\().4s, \t9\().4s, \r7\().4s + trn2 \r7\().4s, \t9\().4s, \r7\().4s +.endm + +.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().16b, \r0\().16b, \r1\().16b + trn2 \t5\().16b, \r0\().16b, \r1\().16b + trn1 \t6\().16b, \r2\().16b, \r3\().16b + trn2 \t7\().16b, \r2\().16b, \r3\().16b + + trn1 \r0\().8h, \t4\().8h, \t6\().8h + trn2 \r2\().8h, \t4\().8h, \t6\().8h + trn1 \r1\().8h, \t5\().8h, \t7\().8h + trn2 \r3\().8h, \t5\().8h, \t7\().8h +.endm + +.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4h, \r0\().4h, \r1\().4h + trn2 \t5\().4h, \r0\().4h, \r1\().4h + trn1 \t6\().4h, \r2\().4h, \r3\().4h + trn2 \t7\().4h, \r2\().4h, \r3\().4h + + trn1 \r0\().2s, \t4\().2s, \t6\().2s + trn2 \r2\().2s, \t4\().2s, \t6\().2s + trn1 \r1\().2s, \t5\().2s, \t7\().2s + trn2 \r3\().2s, \t5\().2s, \t7\().2s +.endm + +.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4s, \r0\().4s, \r1\().4s + trn2 \t5\().4s, \r0\().4s, \r1\().4s + trn1 \t6\().4s, \r2\().4s, \r3\().4s + trn2 \t7\().4s, \r2\().4s, \r3\().4s + + trn1 \r0\().2d, \t4\().2d, \t6\().2d + trn2 \r2\().2d, \t4\().2d, \t6\().2d + trn1 \r1\().2d, \t5\().2d, \t7\().2d + trn2 \r3\().2d, \t5\().2d, \t7\().2d +.endm + +.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \r0\().4s, \t4\().4s, \t6\().4s + trn2 \r2\().4s, \t4\().4s, \t6\().4s + trn1 \r1\().4s, \t5\().4s, \t7\().4s + trn2 \r3\().4s, \t5\().4s, \t7\().4s +.endm + +#endif /* DAV1D_SRC_ARM_64_UTIL_S */ diff --git a/third_party/dav1d/src/arm/asm-offsets.h b/third_party/dav1d/src/arm/asm-offsets.h new file mode 100644 index 0000000000..2f3c3caa1f --- /dev/null +++ b/third_party/dav1d/src/arm/asm-offsets.h @@ -0,0 +1,43 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ARM_ASM_OFFSETS_H +#define ARM_ASM_OFFSETS_H + +#define FGD_SEED 0 +#define FGD_AR_COEFF_LAG 92 +#define FGD_AR_COEFFS_Y 96 +#define FGD_AR_COEFFS_UV 120 +#define FGD_AR_COEFF_SHIFT 176 +#define FGD_GRAIN_SCALE_SHIFT 184 + +#define FGD_SCALING_SHIFT 88 +#define FGD_UV_MULT 188 +#define FGD_UV_LUMA_MULT 196 +#define FGD_UV_OFFSET 204 +#define FGD_CLIP_TO_RESTRICTED_RANGE 216 + +#endif /* ARM_ASM_OFFSETS_H */ diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S new file mode 100644 index 0000000000..dc50415f1f --- /dev/null +++ b/third_party/dav1d/src/arm/asm.S @@ -0,0 +1,291 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_ARM_ASM_S +#define DAV1D_SRC_ARM_ASM_S + +#include "config.h" + +#if ARCH_AARCH64 +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 + +/* Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. 
+ * + * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to + * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be + * used immediately before saving the LR register (x30) to the stack. + * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring + * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone + * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also + * have the same value at the two points. For example: + * + * .global f + * f: + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or + * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an + * indirect call target. In particular, all symbols exported from a file must + * begin with one of these macros. For example, a leaf function that does not + * save LR can instead use |AARCH64_VALID_CALL_TARGET|: + * + * .globl return_zero + * return_zero: + * AARCH64_VALID_CALL_TARGET + * mov x0, #0 + * ret + * + * A non-leaf function which does not immediately save LR may need both macros + * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function + * may jump to an alternate implementation before setting up the stack: + * + * .globl with_early_jump + * with_early_jump: + * AARCH64_VALID_CALL_TARGET + * cmp x0, #128 + * b.lt .Lwith_early_jump_128 + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * .Lwith_early_jump_128: + * ... + * ret + * + * These annotations are only required with indirect calls. Private symbols that + * are only the target of direct calls do not require annotations. Also note + * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not + * indirect jumps (BR). Indirect jumps in assembly are supported through + * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and + * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. + * + * Although not necessary, it is safe to use these macros in 32-bit ARM + * assembly. This may be used to simplify dual 32-bit and 64-bit files. 
+ * + * References: + * - "ELF for the Arm® 64-bit Architecture" + * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst + * - "Providing protection for complex software" + * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + */ +#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) +#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification +#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc' +#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j' +#else +#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification +#define AARCH64_VALID_JUMP_CALL_TARGET +#define AARCH64_VALID_CALL_TARGET +#define AARCH64_VALID_JUMP_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) + +#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A +#define AARCH64_SIGN_LINK_REGISTER paciasp +#define AARCH64_VALIDATE_LINK_REGISTER autiasp +#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B +#define AARCH64_SIGN_LINK_REGISTER pacibsp +#define AARCH64_VALIDATE_LINK_REGISTER autibsp +#else +#error Pointer authentication defines no valid key! +#endif +#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions +#error Authentication of leaf functions is enabled but not supported in dav1d! +#endif +#define GNU_PROPERTY_AARCH64_PAC (1 << 1) + +#elif defined(__APPLE__) && defined(__arm64e__) + +#define GNU_PROPERTY_AARCH64_PAC 0 +#define AARCH64_SIGN_LINK_REGISTER pacibsp +#define AARCH64_VALIDATE_LINK_REGISTER autibsp + +#else /* __ARM_FEATURE_PAC_DEFAULT */ + +#define GNU_PROPERTY_AARCH64_PAC 0 +#define AARCH64_SIGN_LINK_REGISTER +#define AARCH64_VALIDATE_LINK_REGISTER + +#endif /* !__ARM_FEATURE_PAC_DEFAULT */ + + +#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) + .pushsection .note.gnu.property, "a" + .balign 8 + .long 4 + .long 0x10 + .long 0x5 + .asciz "GNU" + .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4 + .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) + .long 0 + .popsection +#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */ +#endif /* ARCH_AARCH64 */ + +#if ARCH_ARM + .syntax unified +#ifdef __ELF__ + .arch armv7-a + .fpu neon + .eabi_attribute 10, 0 // suppress Tag_FP_arch + .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch + .section .note.GNU-stack,"",%progbits // Mark stack as non-executable +#endif /* __ELF__ */ + +#ifdef _WIN32 +#define CONFIG_THUMB 1 +#else +#define CONFIG_THUMB 0 +#endif + +#if CONFIG_THUMB + .thumb +#define A @ +#define T +#else +#define A +#define T @ +#endif /* CONFIG_THUMB */ +#endif /* ARCH_ARM */ + +#if !defined(PIC) +#if defined(__PIC__) +#define PIC __PIC__ +#elif defined(__pic__) +#define PIC __pic__ +#endif +#endif + +#ifndef PRIVATE_PREFIX +#define PRIVATE_PREFIX dav1d_ +#endif + +#define PASTE(a,b) a ## b +#define CONCAT(a,b) PASTE(a,b) + +#ifdef PREFIX +#define EXTERN CONCAT(_,PRIVATE_PREFIX) +#else +#define EXTERN PRIVATE_PREFIX +#endif + +.macro function name, export=0, align=2 + .macro endfunc +#ifdef __ELF__ + .size \name, . 
- \name +#endif +#if HAVE_AS_FUNC + .endfunc +#endif + .purgem endfunc + .endm + .text + .align \align + .if \export + .global EXTERN\name +#ifdef __ELF__ + .type EXTERN\name, %function + .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name +#endif +#if HAVE_AS_FUNC + .func EXTERN\name +#endif +EXTERN\name: + .else +#ifdef __ELF__ + .type \name, %function +#endif +#if HAVE_AS_FUNC + .func \name +#endif + .endif +\name: +#if ARCH_AARCH64 + .if \export + AARCH64_VALID_CALL_TARGET + .endif +#endif +.endm + +.macro const name, export=0, align=2 + .macro endconst +#ifdef __ELF__ + .size \name, . - \name +#endif + .purgem endconst + .endm +#if defined(_WIN32) + .section .rdata +#elif !defined(__MACH__) + .section .rodata +#else + .const_data +#endif + .align \align + .if \export + .global EXTERN\name +#ifdef __ELF__ + .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name +#endif +EXTERN\name: + .endif +\name: +.endm + +#ifdef __APPLE__ +#define L(x) L ## x +#else +#define L(x) .L ## x +#endif + +#define X(x) CONCAT(EXTERN, x) + + +#endif /* DAV1D_SRC_ARM_ASM_S */ diff --git a/third_party/dav1d/src/arm/cdef.h b/third_party/dav1d/src/arm/cdef.h new file mode 100644 index 0000000000..2e8c8ab6fb --- /dev/null +++ b/third_party/dav1d/src/arm/cdef.h @@ -0,0 +1,88 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/cdef.h" + +decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon)); + +void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, + const pixel *const bottom, int h, + enum CdefEdgeFlags edges); +void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, + const pixel *const bottom, int h, + enum CdefEdgeFlags edges); + +// Passing edges to this function, to allow it to switch to a more +// optimized version for fully edged cases. Using size_t for edges, +// to avoid ABI differences for passing more than one argument on the stack. 
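For readability, here is roughly what the DEFINE_FILTER macro declared just below expands to for the 4x4 case (a hand-expanded sketch; it relies on the same dav1d-internal helpers — ALIGN_STK_16, BF() and the HIGHBD_* suffixes — as the macro itself):

    static void
    cdef_filter_4x4_neon(pixel *dst, const ptrdiff_t stride,
                         const pixel (*left)[2],
                         const pixel *const top,
                         const pixel *const bottom,
                         const int pri_strength, const int sec_strength,
                         const int dir, const int damping,
                         const enum CdefEdgeFlags edges
                         HIGHBD_DECL_SUFFIX)
    {
        // 12 * tmp_stride + 8 uint16_t entries of padded scratch (tmp_stride == 8 here)
        ALIGN_STK_16(uint16_t, tmp_buf, 12 * 8 + 8,);
        uint16_t *tmp = tmp_buf + 2 * 8 + 8;   // skip the top/left padding area
        BF(dav1d_cdef_padding4, neon)(tmp, dst, stride, left, top, bottom,
                                      4 /* h */, edges);
        BF(dav1d_cdef_filter4, neon)(dst, stride, tmp, pri_strength,
                                     sec_strength, dir, damping, 4 /* h */,
                                     edges HIGHBD_TAIL_SUFFIX);
    }

The padding step copies the block plus its apron into the uint16_t scratch buffer, and the filter kernel reads only from that buffer; passing edges through to the kernel is what lets it take the faster path when all borders are present.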
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); +void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); + +#define DEFINE_FILTER(w, h, tmp_stride) \ +static void \ +cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, const int sec_strength, \ + const int dir, const int damping, \ + const enum CdefEdgeFlags edges \ + HIGHBD_DECL_SUFFIX) \ +{ \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \ + BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \ + left, top, bottom, h, edges); \ + BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \ + sec_strength, dir, damping, h, edges \ + HIGHBD_TAIL_SUFFIX); \ +} + +DEFINE_FILTER(8, 8, 16) +DEFINE_FILTER(4, 8, 8) +DEFINE_FILTER(4, 4, 8) + +static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->dir = BF(dav1d_cdef_find_dir, neon); + c->fb[0] = cdef_filter_8x8_neon; + c->fb[1] = cdef_filter_4x8_neon; + c->fb[2] = cdef_filter_4x4_neon; +} diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c new file mode 100644 index 0000000000..b7a0d3adbc --- /dev/null +++ b/third_party/dav1d/src/arm/cpu.c @@ -0,0 +1,99 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "common/attributes.h" + +#include "src/arm/cpu.h" + +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 +// NEON is always available; runtime tests are not needed. 
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM +#include <sys/auxv.h> + +#ifndef HWCAP_ARM_NEON +#define HWCAP_ARM_NEON (1 << 12) +#endif +#define NEON_HWCAP HWCAP_ARM_NEON + +#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM +#include <sys/auxv.h> + +#define NEON_HWCAP HWCAP_NEON + +#elif defined(__ANDROID__) +#include <stdio.h> +#include <string.h> + +static unsigned parse_proc_cpuinfo(const char *flag) { + FILE *file = fopen("/proc/cpuinfo", "r"); + if (!file) + return 0; + + char line_buffer[120]; + const char *line; + + while ((line = fgets(line_buffer, sizeof(line_buffer), file))) { + if (strstr(line, flag)) { + fclose(file); + return 1; + } + // if line is incomplete seek back to avoid splitting the search + // string into two buffers + if (!strchr(line, '\n') && strlen(line) > strlen(flag)) { + // use fseek since the 64 bit fseeko is only available since + // Android API level 24 and meson defines _FILE_OFFSET_BITS + // by default 64 + if (fseek(file, -strlen(flag), SEEK_CUR)) + break; + } + } + + fclose(file); + + return 0; +} +#endif + +COLD unsigned dav1d_get_cpu_flags_arm(void) { + unsigned flags = 0; +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 + flags |= DAV1D_ARM_CPU_FLAG_NEON; +#elif defined(HAVE_GETAUXVAL) && ARCH_ARM + unsigned long hw_cap = getauxval(AT_HWCAP); + flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; +#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM + unsigned long hw_cap = 0; + elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)); + flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0; +#elif defined(__ANDROID__) + flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0; +#endif + + return flags; +} diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h new file mode 100644 index 0000000000..8c10a1b6b0 --- /dev/null +++ b/third_party/dav1d/src/arm/cpu.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_ARM_CPU_H +#define DAV1D_SRC_ARM_CPU_H + +enum CpuFlags { + DAV1D_ARM_CPU_FLAG_NEON = 1 << 0, +}; + +unsigned dav1d_get_cpu_flags_arm(void); + +#endif /* DAV1D_SRC_ARM_CPU_H */ diff --git a/third_party/dav1d/src/arm/filmgrain.h b/third_party/dav1d/src/arm/filmgrain.h new file mode 100644 index 0000000000..48776ac852 --- /dev/null +++ b/third_party/dav1d/src/arm/filmgrain.h @@ -0,0 +1,204 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/filmgrain.h" +#include "asm-offsets.h" + +CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); + +CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET); +CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); + +void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX); + +#define GEN_GRAIN_UV(suff) \ +void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, \ + const intptr_t uv \ + HIGHBD_DECL_SUFFIX) + +GEN_GRAIN_UV(420); +GEN_GRAIN_UV(422); +GEN_GRAIN_UV(444); + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// same layout of parameters on the stack across platforms. 
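The CHECK_OFFSET lines above tie the hard-coded FGD_* byte offsets from asm-offsets.h to the actual Dav1dFilmGrainData layout at compile time, so a struct change that would silently break the assembly fails the build instead. Conceptually that boils down to a C11 static assertion along these lines (illustrative macro name only, not dav1d's actual definition):

    #include <stddef.h>

    /* Fail the build if a hand-maintained assembly offset drifts away from
     * the real struct layout. */
    #define ASSERT_ASM_OFFSET(type, member, expected) \
        _Static_assert(offsetof(type, member) == (expected), \
                       #type "." #member " does not match asm-offsets.h")

so that, for example, ASSERT_ASM_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED) would be the moral equivalent of the first check above.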
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst, + const pixel *const src, + const ptrdiff_t stride, + const uint8_t scaling[SCALING_SIZE], + const int scaling_shift, + const entry grain_lut[][GRAIN_WIDTH], + const int offsets[][2], + const int h, const ptrdiff_t clip, + const ptrdiff_t type + HIGHBD_DECL_SUFFIX); + +static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + int type = 0; + if (data->overlap_flag && row_num) + type |= 1; /* overlap y */ + if (data->overlap_flag && bx) + type |= 2; /* overlap x */ + + BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, + scaling, data->scaling_shift, + grain_lut, offsets, bh, + data->clip_to_restricted_range, type + HIGHBD_TAIL_SUFFIX); + } +} + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// parameters on the stack with the same layout across platforms. 
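Both the fgy wrapper above and the fguv wrappers generated below draw their per-block grain offsets from get_random_number(8, &seed[i]). For reference, the 16-bit LFSR behind that call, as defined by the AV1 film grain synthesis process, looks roughly like this (a scalar sketch with an illustrative name, not the code compiled here):

    /* Advance the 16-bit grain LFSR by one step and return the top `bits`
     * bits of the updated register. */
    static inline unsigned lfsr_random_number(const int bits, unsigned *const state)
    {
        const unsigned r  = *state;
        const unsigned fb = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (fb << 15);
        return (*state >> (16 - bits)) & ((1u << bits) - 1);
    }

The two seeds set up in the wrapper let a block row regenerate the offsets of the row above it, which is what the row-overlap path needs.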
+#define FGUV(nm, sx, sy) \ +void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \ + const pixel *const src, \ + const ptrdiff_t stride, \ + const uint8_t scaling[SCALING_SIZE], \ + const Dav1dFilmGrainData *const data, \ + const entry grain_lut[][GRAIN_WIDTH], \ + const pixel *const luma_row, \ + const ptrdiff_t luma_stride, \ + const int offsets[][2], \ + const ptrdiff_t h, const ptrdiff_t uv, \ + const ptrdiff_t is_id, \ + const ptrdiff_t type \ + HIGHBD_DECL_SUFFIX); \ +static void \ +fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ + const size_t pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], const int bh, \ + const int row_num, const pixel *const luma_row, \ + const ptrdiff_t luma_stride, const int uv, const int is_id \ + HIGHBD_DECL_SUFFIX) \ +{ \ + const int rows = 1 + (data->overlap_flag && row_num > 0); \ + \ + /* seed[0] contains the current row, seed[1] contains the previous */ \ + unsigned seed[2]; \ + for (int i = 0; i < rows; i++) { \ + seed[i] = data->seed; \ + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ + } \ + \ + int offsets[2 /* col offset */][2 /* row offset */]; \ + \ + /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ + if (data->overlap_flag && bx) { \ + /* shift previous offsets left */ \ + for (int i = 0; i < rows; i++) \ + offsets[1][i] = offsets[0][i]; \ + } \ + \ + /* update current offsets */ \ + for (int i = 0; i < rows; i++) \ + offsets[0][i] = get_random_number(8, &seed[i]); \ + \ + int type = 0; \ + if (data->overlap_flag && row_num) \ + type |= 1; /* overlap y */ \ + if (data->overlap_flag && bx) \ + type |= 2; /* overlap x */ \ + if (data->chroma_scaling_from_luma) \ + type |= 4; \ + \ + BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ + scaling, data, grain_lut, \ + luma_row + (bx << sx), luma_stride, \ + offsets, bh, uv, is_id, type \ + HIGHBD_TAIL_SUFFIX); \ + } \ +} + +FGUV(420, 1, 1); +FGUV(422, 1, 0); +FGUV(444, 0, 0); + +static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); + + c->fgy_32x32xn = fgy_32x32xn_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; +} diff --git a/third_party/dav1d/src/arm/ipred.h b/third_party/dav1d/src/arm/ipred.h new file mode 100644 index 0000000000..e849d4998b --- /dev/null +++ b/third_party/dav1d/src/arm/ipred.h @@ -0,0 +1,222 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon)); + +decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon)); + +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); + +decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); + +#if ARCH_AARCH64 +void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz, + const pixel *const in, + const int end HIGHBD_DECL_SUFFIX); +void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz, + const pixel *const in, + const int end, const int strength); +void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px, + const int n); +void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, const int width, + const int height, const int dx, + const int max_base_x); +void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, const int width, + const int height, const int dx, + const int max_base_x); + +static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + int dx = dav1d_dr_intra_derivative[angle >> 1]; + pixel top_out[64 + 64 + (64+15)*2 + 16]; + int max_base_x; + const int upsample_above = enable_intra_edge_filter ? 
+ get_upsample(width + height, 90 - angle, is_sm) : 0; + if (upsample_above) { + BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height, + topleft_in, + width + imin(width, height) + HIGHBD_TAIL_SUFFIX); + max_base_x = 2 * (width + height) - 2; + dx <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, 90 - angle, is_sm) : 0; + if (filter_strength) { + BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height, + topleft_in, + width + imin(width, height), + filter_strength); + max_base_x = width + height - 1; + } else { + max_base_x = width + imin(width, height) - 1; + memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel)); + } + } + const int base_inc = 1 + upsample_above; + int pad_pixels = width + 15; // max(dx >> 6) == 15 + BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1], + top_out[max_base_x], pad_pixels * base_inc); + if (upsample_above) + BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height, + dx, max_base_x); + else + BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height, + dx, max_base_x); +} + +void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const left, const int width, + const int height, const int dy, + const int max_base_y); +void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const left, const int width, + const int height, const int dy, + const int max_base_y); + +void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src, + const int n); + +static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + assert(angle > 180); + int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1]; + pixel flipped[64 + 64 + 16]; + pixel left_out[64 + 64 + (64+15)*2]; + int max_base_y; + const int upsample_left = enable_intra_edge_filter ? + get_upsample(width + height, angle - 180, is_sm) : 0; + if (upsample_left) { + flipped[0] = topleft_in[0]; + BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], + height + imax(width, height)); + BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height, + flipped, + height + imin(width, height) + HIGHBD_TAIL_SUFFIX); + max_base_y = 2 * (width + height) - 2; + dy <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, angle - 180, is_sm) : 0; + + if (filter_strength) { + flipped[0] = topleft_in[0]; + BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], + height + imax(width, height)); + BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height, + flipped, + height + imin(width, height), + filter_strength); + max_base_y = width + height - 1; + } else { + BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0], + height + imin(width, height)); + max_base_y = height + imin(width, height) - 1; + } + } + const int base_inc = 1 + upsample_left; + // The tbx based implementation needs left[] to have 64 bytes intitialized, + // the other implementation can read height + max(dy >> 6) past the end. 
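To see why the bound computed on the next line is enough, take a 4x4 block with neither upsampling nor edge filtering: max_base_y = height + imin(width, height) - 1 = 7, so pad_pixels = imax(64 - 7 - 1, 4 + 15) = 56, and those 56 padded entries plus the 8 pixels written by the ipred_reverse call above leave left_out[0..63] fully initialized for the tbx-based code. For larger blocks the height + 15 term dominates instead, covering the height + max(dy >> 6) overread mentioned in the comment.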
+ int pad_pixels = imax(64 - max_base_y - 1, height + 15); + + BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1], + left_out[max_base_y], pad_pixels * base_inc); + if (upsample_left) + BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height, + dy, max_base_y); + else + BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height, + dy, max_base_y); +} +#endif + +static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon); + c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon); + c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon); + c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon); + c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon); + c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon); + c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon); + c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); + c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); + c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); +#if ARCH_AARCH64 + c->intra_pred[Z1_PRED] = ipred_z1_neon; + c->intra_pred[Z3_PRED] = ipred_z3_neon; +#endif + c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon); + + c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon); + c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon); + c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon); + c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon); + + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon); + + c->pal_pred = BF(dav1d_pal_pred, neon); +} diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h new file mode 100644 index 0000000000..2ecd086b3b --- /dev/null +++ b/third_party/dav1d/src/arm/itx.h @@ -0,0 +1,141 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/itx.h" + +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + +decl_itx17_fns( 4, 4, neon); +decl_itx16_fns( 4, 8, neon); +decl_itx16_fns( 4, 16, neon); +decl_itx16_fns( 8, 4, neon); +decl_itx16_fns( 8, 8, neon); +decl_itx16_fns( 8, 16, neon); +decl_itx2_fns ( 8, 32, neon); +decl_itx16_fns(16, 4, neon); +decl_itx16_fns(16, 8, neon); +decl_itx12_fns(16, 16, neon); +decl_itx2_fns (16, 32, neon); +decl_itx2_fns (32, 8, neon); +decl_itx2_fns (32, 16, neon); +decl_itx2_fns (32, 32, neon); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); + +static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx1_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, 
w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + if (BITDEPTH == 16 && bpc != 10) return; + + assign_itx17_fn( , 4, 4, neon); + assign_itx16_fn(R, 4, 8, neon); + assign_itx16_fn(R, 4, 16, neon); + assign_itx16_fn(R, 8, 4, neon); + assign_itx16_fn( , 8, 8, neon); + assign_itx16_fn(R, 8, 16, neon); + assign_itx2_fn (R, 8, 32, neon); + assign_itx16_fn(R, 16, 4, neon); + assign_itx16_fn(R, 16, 8, neon); + assign_itx12_fn( , 16, 16, neon); + assign_itx2_fn (R, 16, 32, neon); + assign_itx1_fn (R, 16, 64, neon); + assign_itx2_fn (R, 32, 8, neon); + assign_itx2_fn (R, 32, 16, neon); + assign_itx2_fn ( , 32, 32, neon); + assign_itx1_fn (R, 32, 64, neon); + assign_itx1_fn (R, 64, 16, neon); + assign_itx1_fn (R, 64, 32, neon); + assign_itx1_fn ( , 64, 64, neon); +} diff --git a/third_party/dav1d/src/arm/loopfilter.h b/third_party/dav1d/src/arm/loopfilter.h new file mode 100644 index 0000000000..9ac08d94d2 --- /dev/null +++ b/third_party/dav1d/src/arm/loopfilter.h @@ -0,0 +1,45 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/loopfilter.h" + +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon)); + +static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon); +} diff --git a/third_party/dav1d/src/arm/looprestoration.h b/third_party/dav1d/src/arm/looprestoration.h new file mode 100644 index 0000000000..7993dbff68 --- /dev/null +++ b/third_party/dav1d/src/arm/looprestoration.h @@ -0,0 +1,265 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#if ARCH_AARCH64 +void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +#else + +// The 8bpc version calculates things slightly differently than the reference +// C version. That version calculates roughly this: +// int16_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += src[idx] * fh[i]; +// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h; +// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h; +// sum += 1 << (bitdepth + 6 - round_bits_h); +// Compared to the reference C version, this is the output of the first pass +// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. +// with round_offset precompensated. 
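Taking the 8bpc description above literally (bitdepth = 8 and, from the stated 2048 = 1 << 11 offset, round_bits_h = 3 with rounding_off_h = 1 << 2), the per-pixel intermediate can be transcribed into the scalar sketch below. It only restates the pseudo-code from the comment, with the tap placement (x - 3 .. x + 3) assumed; it is not the NEON routine itself.

#include <stdint.h>

static int clip_int(const int v, const int lo, const int hi) {
    return v < lo ? lo : v > hi ? hi : v;
}

/* Scalar transcription of the 8bpc first-pass intermediate described above;
 * as the comment notes, the result equals the reference first pass minus
 * 1 << (bitdepth + 6 - round_bits_h) = 2048. */
static int16_t wiener_h_intermediate_8bpc(const uint8_t *const src, const int x,
                                          const int16_t fh[8]) {
    const int bitdepth = 8, round_bits_h = 3;
    const int rounding_off_h = 1 << (round_bits_h - 1);
    int sum = 0;
    for (int i = 0; i < 7; i++)
        sum += src[x + i - 3] * fh[i];          /* 7-tap horizontal filter */
    const int sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
    sum = clip_int(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
    sum += 1 << (bitdepth + 6 - round_bits_h);  /* re-add the bias */
    return (int16_t)sum;
}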
+// The 16bpc version calculates things pretty much the same way as the +// reference C version, but with the end result subtracted by +// 1 << (bitdepth + 6 - round_bits_h). +void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], + const pixel *src, ptrdiff_t stride, + const int16_t fh[8], intptr_t w, + int h, enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +// This calculates things slightly differently than the reference C version. +// This version calculates roughly this: +// int32_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += mid[idx] * fv[i]; +// sum = (sum + rounding_off_v) >> round_bits_v; +// This function assumes that the width is a multiple of 8. +void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride, + const int16_t *mid, int w, int h, + const int16_t fv[8], enum LrEdgeFlags edges, + ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX); + +static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + const int16_t (*const filter)[8] = params->filter; + ALIGN_STK_16(int16_t, mid, 68 * 384,); + int mid_stride = (w + 7) & ~7; + + // Horizontal filter + BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride, + filter[0], w, h, edges HIGHBD_TAIL_SUFFIX); + if (edges & LR_HAVE_TOP) + BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride, + filter[0], w, 2, edges + HIGHBD_TAIL_SUFFIX); + if (edges & LR_HAVE_BOTTOM) + BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL, + lpf + 6 * PXSTRIDE(stride), + stride, filter[0], w, 2, edges + HIGHBD_TAIL_SUFFIX); + + // Vertical filter + BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride], + w, h, filter[1], edges, + mid_stride * sizeof(*mid) + HIGHBD_TAIL_SUFFIX); +} +#endif + +void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const ptrdiff_t stride, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, + const int w, const int h, const int strength, + const int bitdepth_max); +void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const int32_t *a, const int16_t *b, + const int w, const int h); + +/* filter with a 3x3 box (radius=1) */ +static void dav1d_sgr_filter1_neon(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, const int h, const int strength, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; + ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,); + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; + + BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges); + if (edges & LR_HAVE_TOP) + BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], + NULL, lpf, stride, w, 2, edges); + + if (edges & LR_HAVE_BOTTOM) + BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); + + dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges); + dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, 
BITDEPTH_MAX); + BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h); +} + +void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const ptrdiff_t stride, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, + const int w, const int h, const int strength, + const int bitdepth_max); +void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const int32_t *a, const int16_t *b, + const int w, const int h); + +/* filter with a 5x5 box (radius=2) */ +static void dav1d_sgr_filter2_neon(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, const int h, const int strength, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; + ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,); + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; + + BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges); + if (edges & LR_HAVE_TOP) + BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], + NULL, lpf, stride, w, 2, edges); + + if (edges & LR_HAVE_BOTTOM) + BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); + + dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); + dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX); + BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h); +} + +void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *t1, const int w, const int h, + const int wt HIGHBD_DECL_SUFFIX); +void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *t1, const int16_t *t2, + const int w, const int h, + const int16_t wt[2] HIGHBD_DECL_SUFFIX); + +static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp, 64 * 384,); + dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf, + w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, + tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX); +} + +static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp, 64 * 384,); + dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf, + w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, + tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX); +} + +static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, 
tmp1, 64 * 384,); + ALIGN_STK_16(int16_t, tmp2, 64 * 384,); + dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf, + w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); + dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf, + w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); + const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 }; + BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride, + tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); +} + +static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + +#if ARCH_AARCH64 + c->wiener[0] = BF(dav1d_wiener_filter7, neon); + c->wiener[1] = BF(dav1d_wiener_filter5, neon); +#else + c->wiener[0] = c->wiener[1] = wiener_filter_neon; +#endif + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = sgr_filter_5x5_neon; + c->sgr[1] = sgr_filter_3x3_neon; + c->sgr[2] = sgr_filter_mix_neon; + } +} diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h new file mode 100644 index 0000000000..06cd533a9b --- /dev/null +++ b/third_party/dav1d/src/arm/mc.h @@ -0,0 +1,114 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "src/mc.h" +#include "src/cpu.h" + +decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); +decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon)); +decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon)); +decl_mc_fn(BF(dav1d_put_8tap_smooth, neon)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon)); +decl_mc_fn(BF(dav1d_put_8tap_sharp, neon)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon)); +decl_mc_fn(BF(dav1d_put_bilin, neon)); + +decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon)); +decl_mct_fn(BF(dav1d_prep_bilin, neon)); + +decl_avg_fn(BF(dav1d_avg, neon)); +decl_w_avg_fn(BF(dav1d_w_avg, neon)); +decl_mask_fn(BF(dav1d_mask, neon)); +decl_blend_fn(BF(dav1d_blend, neon)); +decl_blend_dir_fn(BF(dav1d_blend_h, neon)); +decl_blend_dir_fn(BF(dav1d_blend_v, neon)); + +decl_w_mask_fn(BF(dav1d_w_mask_444, neon)); +decl_w_mask_fn(BF(dav1d_w_mask_422, neon)); +decl_w_mask_fn(BF(dav1d_w_mask_420, neon)); + +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon)); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon)); + +decl_emu_edge_fn(BF(dav1d_emu_edge, neon)); + +static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); + + c->avg = BF(dav1d_avg, neon); + c->w_avg = BF(dav1d_w_avg, neon); + c->mask = BF(dav1d_mask, neon); + c->blend = BF(dav1d_blend, neon); + c->blend_h = BF(dav1d_blend_h, neon); + c->blend_v = BF(dav1d_blend_v, neon); + c->w_mask[0] = BF(dav1d_w_mask_444, neon); + c->w_mask[1] = 
BF(dav1d_w_mask_422, neon); + c->w_mask[2] = BF(dav1d_w_mask_420, neon); + c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); + c->emu_edge = BF(dav1d_emu_edge, neon); +} diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h new file mode 100644 index 0000000000..9db0bf86ae --- /dev/null +++ b/third_party/dav1d/src/arm/msac.h @@ -0,0 +1,52 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_ARM_MSAC_H +#define DAV1D_SRC_ARM_MSAC_H + +unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); +unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); + +#if ARCH_AARCH64 || defined(__ARM_NEON) +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon +#endif + +#endif /* DAV1D_SRC_ARM_MSAC_H */ diff --git a/third_party/dav1d/src/arm/refmvs.h b/third_party/dav1d/src/arm/refmvs.h new file mode 100644 index 0000000000..4c96fc5095 --- /dev/null +++ b/third_party/dav1d/src/arm/refmvs.h @@ -0,0 +1,39 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Two Orioles, LLC + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/refmvs.h" + +decl_splat_mv_fn(dav1d_splat_mv_neon); + +static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->splat_mv = dav1d_splat_mv_neon; +} |
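Every header in this directory ends the same way: a static ALWAYS_INLINE *_dsp_init_arm() helper, called from the generic DSP init, that queries dav1d_get_cpu_flags() and only overwrites the default C function pointers when DAV1D_ARM_CPU_FLAG_NEON is set (itx and the SGR loop-restoration paths additionally gate on the runtime bpc). The BF() macro pastes the compile-time bitdepth into the symbol name, so each bitdepth-templated translation unit binds to its matching assembly. A stripped-down sketch of that dispatch pattern follows; apart from dav1d_get_cpu_flags() and DAV1D_ARM_CPU_FLAG_NEON, every name in it is hypothetical and used only for illustration.

#include <stdint.h>

#include "src/cpu.h"

/* Hypothetical DSP table and implementations; dav1d_get_cpu_flags() and
 * DAV1D_ARM_CPU_FLAG_NEON are the real dav1d API. */
typedef struct ExampleDSPContext {
    void (*do_filter)(uint8_t *dst, int w, int h);
} ExampleDSPContext;

static void do_filter_c(uint8_t *const dst, const int w, const int h) {
    for (int i = 0; i < w * h; i++) dst[i] = 0;  /* portable placeholder */
}

static void do_filter_neon(uint8_t *const dst, const int w, const int h) {
    for (int i = 0; i < w * h; i++) dst[i] = 0;  /* stands in for hand-written asm */
}

static void example_dsp_init(ExampleDSPContext *const c) {
    c->do_filter = do_filter_c;                  /* C fallback is the default */
}

static void example_dsp_init_arm(ExampleDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    /* Keep the C defaults unless NEON was detected at runtime. */
    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

    c->do_filter = do_filter_neon;
}

The early return keeps the C defaults in place on cores without NEON, so a single binary works on both.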