author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
commit    | 26a029d407be480d791972afb5975cf62c9360a6
tree      | f435a8308119effd964b339f76abb83a57c29483 /third_party/dav1d/src/loongarch/mc.S
parent    | Initial commit.
download  | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz, firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/loongarch/mc.S')
-rw-r--r-- | third_party/dav1d/src/loongarch/mc.S | 4758
1 file changed, 4758 insertions, 0 deletions
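The new file adds hand-written LoongArch LSX/LASX versions of dav1d's motion-compensation helpers, including warp_affine_8x8, avg, w_avg, mask and w_mask_420. For orientation, the sketch below gives the scalar 8 bpc semantics that the avg/w_avg/mask entry points compute, following the C prototypes quoted in the file's comments and its shift defines (bpc8_sh = 5, bpcw8_sh = 8, mask_sh = 10). The function names and the iclip_pixel helper are illustrative, not part of the patch; the rounding constants 16/128/512 are simply half of the respective shift, which is what the rounding vssrarni instructions apply.

#include <stdint.h>
#include <stddef.h>

/* clamp an intermediate value to the 8 bpc pixel range */
static inline uint8_t iclip_pixel(int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* avg: dst = (tmp1 + tmp2 + 16) >> 5 per pixel (shift bpc8_sh = 5) */
static void avg_ref(uint8_t *dst, ptrdiff_t stride,
                    const int16_t *t1, const int16_t *t2, int w, int h)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel((t1[x] + t2[x] + 16) >> 5);
        t1 += w; t2 += w; dst += stride;
    } while (--h);
}

/* w_avg: weight in [0,16]; dst = (t1*weight + t2*(16-weight) + 128) >> 8
 * (shift bpcw8_sh = 8) */
static void w_avg_ref(uint8_t *dst, ptrdiff_t stride,
                      const int16_t *t1, const int16_t *t2,
                      int w, int h, int weight)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel((t1[x] * weight +
                                  t2[x] * (16 - weight) + 128) >> 8);
        t1 += w; t2 += w; dst += stride;
    } while (--h);
}

/* mask: per-pixel 0..64 blend mask; dst = (t1*m + t2*(64-m) + 512) >> 10
 * (shift mask_sh = 10) */
static void mask_ref(uint8_t *dst, ptrdiff_t stride,
                     const int16_t *t1, const int16_t *t2,
                     int w, int h, const uint8_t *mask)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel((t1[x] * mask[x] +
                                  t2[x] * (64 - mask[x]) + 512) >> 10);
        t1 += w; t2 += w; mask += w; dst += stride;
    } while (--h);
}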
diff --git a/third_party/dav1d/src/loongarch/mc.S b/third_party/dav1d/src/loongarch/mc.S new file mode 100644 index 0000000000..97887de4a7 --- /dev/null +++ b/third_party/dav1d/src/loongarch/mc.S @@ -0,0 +1,4758 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/loongson_asm.S" + +/* +static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *const abcd, int mx, int my + HIGHBD_DECL_SUFFIX) +*/ +.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3 + vbsrl.v vr2, \in0, \in1 + vbsrl.v vr20, \in0, \in2 + addi.w t4, \in3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr1, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr29, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + vilvl.d vr2, vr20, vr2 + vilvl.d vr1, vr29, vr1 + vmulwev.h.bu.b vr3, vr2, vr1 + vmulwod.h.bu.b vr20, vr2, vr1 + vilvl.d vr2, vr20, vr3 + vhaddw.w.h vr2, vr2, vr2 + vhaddw.d.w vr2, vr2, vr2 + vhaddw.q.d vr2, vr2, vr2 + vilvh.d vr3, vr20, vr3 + vhaddw.w.h vr3, vr3, vr3 + vhaddw.d.w vr3, vr3, vr3 + vhaddw.q.d vr3, vr3, vr3 + vextrins.w \out0, vr2, \out1 + vextrins.w \out2, vr3, \out3 +.endm + +.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1 + add.w \in0, \in0, \in1 + addi.w t6, \in0, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f1, t5, t6 + vsllwil.h.b vr1, vr1, 0 + vmulwev.w.h vr3, \in2, vr1 + vmaddwod.w.h vr3, \in2, vr1 + vhaddw.d.w vr3, vr3, vr3 + vhaddw.q.d vr3, vr3, vr3 + vextrins.w \out0, vr3, \out1 +.endm + +const warp_sh +.rept 2 +.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +.endr +.rept 2 +.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +.endr +endconst + +.macro warp_lsx t, shift +function warp_affine_8x8\t\()_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + la.local t4, warp_sh + ld.h t0, a4, 0 // abcd[0] + 
ld.h t1, a4, 2 // abcd[1] + + alsl.w t2, a3, a3, 1 + addi.w t3, a5, 0 + la.local t5, dav1d_mc_warp_filter + sub.d a2, a2, t2 + addi.d a2, a2, -3 + vld vr0, a2, 0 + vld vr30, t4, 0 + vld vr31, t4, 32 + + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 + + add.w a5, t1, a5 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30 + + vsrarni.h.w vr12, vr4, 3 + vsrarni.h.w vr13, vr5, 3 + vsrarni.h.w vr14, vr6, 3 + vsrarni.h.w vr15, vr7, 3 + vsrarni.h.w vr16, vr8, 3 + vsrarni.h.w vr17, vr9, 3 + vsrarni.h.w vr18, vr10, 3 + vsrarni.h.w vr19, vr11, 3 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 + + add.w a5, a5, t1 + or t3, 
a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20 + + vsrarni.h.w vr21, vr4, 3 + vsrarni.h.w vr22, vr5, 3 + vsrarni.h.w vr23, vr6, 3 + vsrarni.h.w vr24, vr7, 3 + vsrarni.h.w vr25, vr8, 3 + vsrarni.h.w vr26, vr9, 3 + vsrarni.h.w vr27, vr10, 3 + vsrarni.h.w vr28, vr11, 3 + + addi.w t2, a6, 0 // my + ld.h t7, a4, 4 // abcd[2] + ld.h t8, a4, 6 // abcd[3] + +.ifnb \t + slli.d a1, a1, 1 +.endif + + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, 
vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, 
vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc +.endm + +warp_lsx , 11 +warp_lsx t, 7 + +.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3 + xvshuf.b xr2, \in0, \in0, \in2 + + addi.w t4, \in1, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr3, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr4, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr5, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr6, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + xvinsve0.d xr3, xr5, 1 + xvinsve0.d xr3, xr4, 2 + xvinsve0.d xr3, xr6, 3 + + xvmulwev.h.bu.b xr4, xr2, xr3 + xvmulwod.h.bu.b xr5, xr2, xr3 + xvilvl.d xr2, xr5, xr4 + xvilvh.d xr3, xr5, xr4 + xvhaddw.w.h xr2, xr2, xr2 + xvhaddw.w.h xr3, xr3, xr3 + xvhaddw.d.w xr2, xr2, xr2 + xvhaddw.d.w xr3, xr3, xr3 + xvhaddw.q.d xr2, xr2, xr2 + xvhaddw.q.d xr3, xr3, xr3 + + xvextrins.w \out0, xr2, \out1 + xvextrins.w \out2, xr3, \out3 +.endm + +.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1 + add.w \in0, \in0, \in1 + addi.w t6, \in0, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f1, t5, t6 + + add.w t2, t2, t7 + addi.w t6, t2, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f2, t5, t6 + + vilvl.d vr0, vr2, vr1 + vext2xv.h.b xr0, xr0 + xvmulwev.w.h xr3, \in2, xr0 + xvmaddwod.w.h xr3, \in2, xr0 + xvhaddw.d.w xr3, xr3, xr3 + xvhaddw.q.d xr3, xr3, xr3 + xvextrins.w \out0, xr3, \out1 +.endm + +const shuf0 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 +endconst + +.macro 
warp_lasx t, shift +function warp_affine_8x8\t\()_8bpc_lasx + addi.d sp, sp, -16 + ld.h t0, a4, 0 // abcd[0] + ld.h t1, a4, 2 // abcd[1] + fst.d f24, sp, 0 + fst.d f25, sp, 8 + + alsl.w t2, a3, a3, 1 + addi.w t3, a5, 0 + la.local t4, warp_sh + la.local t5, dav1d_mc_warp_filter + sub.d a2, a2, t2 + addi.d a2, a2, -3 + vld vr0, a2, 0 + xvld xr24, t4, 0 + xvld xr25, t4, 32 + la.local t2, shuf0 + xvld xr1, t2, 0 + xvpermi.q xr0, xr0, 0x00 + xvaddi.bu xr9, xr1, 4 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30 + + xvsrarni.h.w xr12, xr7, 3 + xvsrarni.h.w xr13, xr8, 3 + xvsrarni.h.w xr14, xr10, 3 + xvsrarni.h.w xr15, xr11, 3 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10 + 
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20 + + xvsrarni.h.w xr16, xr7, 3 + xvsrarni.h.w xr17, xr8, 3 + xvsrarni.h.w xr18, xr10, 3 + xvsrarni.h.w xr19, xr11, 3 + + addi.w t2, a6, 0 // my + ld.h t7, a4, 4 // abcd[2] + ld.h t8, a4, 6 // abcd[3] + +.ifnb \t + slli.d a1, a1, 1 +.endif + + // y = 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, \shift + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + 
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + fld.d f24, sp, 0 + fld.d f25, sp, 8 + addi.d sp, sp, 16 +endfunc +.endm + +warp_lasx , 11 +warp_lasx t, 7 + +/* +static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, + const int w, int h, + const int weight HIGHBD_DECL_SUFFIX) +*/ + +#define bpc8_sh 5 // sh = intermediate_bits + 1 +#define bpcw8_sh 8 // sh = intermediate_bits + 4 + +#define bpc_sh bpc8_sh +#define bpcw_sh bpcw8_sh + +function avg_8bpc_lsx + addi.d t8, a0, 0 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .AVG_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE + add.d t1, t1, t2 // Get absolute address + jirl $r0, t1, 0 + + .align 3 +.AVG_LSX_JRTABLE: + .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE + +.AVG_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vadd.h vr2, vr0, vr1 + vssrarni.bu.h vr3, vr2, bpc_sh + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .AVG_W4_LSX + b .AVG_END_LSX + +.AVG_W8_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vssrarni.bu.h vr5, vr4, bpc_sh + addi.w a5, a5, -2 + addi.d a2, a2, 32 + vstelm.d vr5, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr5, a0, 0, 1 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .AVG_W8_LSX + b .AVG_END_LSX + +.AVG_W16_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vssrarni.bu.h vr5, vr4, bpc_sh + addi.w a5, a5, 
-1 + addi.d a2, a2, 32 + vst vr5, a0, 0 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .AVG_W16_LSX + b .AVG_END_LSX + +.AVG_W32_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr4, a2, 32 + vld vr6, a2, 48 + vld vr1, a3, 0 + vld vr3, a3, 16 + vld vr5, a3, 32 + vld vr7, a3, 48 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vadd.h vr4, vr4, vr5 + vadd.h vr6, vr6, vr7 + vssrarni.bu.h vr2, vr0, bpc_sh + vssrarni.bu.h vr6, vr4, bpc_sh + addi.w a5, a5, -1 + addi.d a2, a2, 64 + vst vr2, a0, 0 + vst vr6, a0, 16 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .AVG_W32_LSX + b .AVG_END_LSX + +.AVG_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vssrarni.bu.h vr2, vr0, bpc_sh + addi.d a2, a2, 32 + addi.d a3, a3, 32 + vst vr2, a0, 0 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .AVG_W64_LSX + b .AVG_END_LSX + +.AVG_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vssrarni.bu.h vr2, vr0, bpc_sh + addi.d a2, a2, 32 + addi.d a3, a3, 32 + vst vr2, a0, 0 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .AVG_W128_LSX +.AVG_END_LSX: +endfunc + +function avg_8bpc_lasx + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .AVG_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.AVG_LASX_JRTABLE: + .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE + +.AVG_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vadd.h vr0, vr0, vr1 + vssrarni.bu.h vr1, vr0, bpc_sh + vstelm.w vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .AVG_W4_LASX + b .AVG_END_LASX +.AVG_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvadd.h xr2, xr0, xr1 + xvssrarni.bu.h xr1, xr2, bpc_sh + xvstelm.d xr1, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr1, a0, 0, 2 + addi.w a5, a5, -2 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a1, a0 + blt zero, a5, .AVG_W8_LASX + b .AVG_END_LASX +.AVG_W16_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvadd.h xr4, xr0, xr1 + xvadd.h xr5, xr2, xr3 + xvssrarni.bu.h xr5, xr4, bpc_sh + xvpermi.d xr2, xr5, 0xd8 + xvpermi.d xr3, xr5, 0x8d + vst vr2, a0, 0 + vstx vr3, a0, a1 + addi.w a5, a5, -2 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + alsl.d a0, a1, a0, 1 + blt zero, a5, .AVG_W16_LASX + b .AVG_END_LASX +.AVG_W32_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvadd.h xr4, xr0, xr1 + xvadd.h xr5, xr2, xr3 + xvssrarni.bu.h xr5, xr4, bpc_sh + xvpermi.d xr6, xr5, 0xd8 + xvst xr6, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .AVG_W32_LASX + b .AVG_END_LASX +.AVG_W64_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr4, a2, 64 + xvld xr6, a2, 96 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvld xr5, a3, 64 + xvld xr7, a3, 96 + xvadd.h xr0, xr0, xr1 + xvadd.h xr2, xr2, xr3 + xvadd.h xr4, xr4, xr5 + xvadd.h xr6, xr6, xr7 + xvssrarni.bu.h xr2, xr0, bpc_sh + xvssrarni.bu.h xr6, xr4, bpc_sh + xvpermi.d xr1, xr2, 0xd8 + xvpermi.d xr3, xr6, 0xd8 + xvst 
xr1, a0, 0 + xvst xr3, a0, 32 + addi.w a5, a5, -1 + addi.d a2, a2, 128 + addi.d a3, a3, 128 + add.d a0, a0, a1 + blt zero, a5, .AVG_W64_LASX + b .AVG_END_LASX +.AVG_W128_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr4, a2, 64 + xvld xr6, a2, 96 + xvld xr8, a2, 128 + xvld xr10, a2, 160 + xvld xr12, a2, 192 + xvld xr14, a2, 224 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvld xr5, a3, 64 + xvld xr7, a3, 96 + xvld xr9, a3, 128 + xvld xr11, a3, 160 + xvld xr13, a3, 192 + xvld xr15, a3, 224 + xvadd.h xr0, xr0, xr1 + xvadd.h xr2, xr2, xr3 + xvadd.h xr4, xr4, xr5 + xvadd.h xr6, xr6, xr7 + xvadd.h xr8, xr8, xr9 + xvadd.h xr10, xr10, xr11 + xvadd.h xr12, xr12, xr13 + xvadd.h xr14, xr14, xr15 + xvssrarni.bu.h xr2, xr0, bpc_sh + xvssrarni.bu.h xr6, xr4, bpc_sh + xvssrarni.bu.h xr10, xr8, bpc_sh + xvssrarni.bu.h xr14, xr12, bpc_sh + xvpermi.d xr1, xr2, 0xd8 + xvpermi.d xr3, xr6, 0xd8 + xvpermi.d xr5, xr10, 0xd8 + xvpermi.d xr7, xr14, 0xd8 + xvst xr1, a0, 0 + xvst xr3, a0, 32 + xvst xr5, a0, 64 + xvst xr7, a0, 96 + addi.w a5, a5, -1 + addi.d a2, a2, 256 + addi.d a3, a3, 256 + add.d a0, a0, a1 + blt zero, a5, .AVG_W128_LASX +.AVG_END_LASX: +endfunc + +function w_avg_8bpc_lsx + addi.d t8, a0, 0 + li.w t2, 16 + sub.w t2, t2, a6 // 16 - weight + vreplgr2vr.h vr21, a6 + vreplgr2vr.h vr22, t2 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .W_AVG_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.W_AVG_LSX_JRTABLE: + .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE + +.W_AVG_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vmulwev.w.h vr2, vr0, vr21 + vmulwod.w.h vr3, vr0, vr21 + vmaddwev.w.h vr2, vr1, vr22 + vmaddwod.w.h vr3, vr1, vr22 + vssrarni.hu.w vr3, vr2, bpcw_sh + vssrlni.bu.h vr1, vr3, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a1, a0 + blt zero, a5, .W_AVG_W4_LSX + b .W_AVG_END_LSX +.W_AVG_W8_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vmulwev.w.h vr2, vr0, vr21 + vmulwod.w.h vr3, vr0, vr21 + vmaddwev.w.h vr2, vr1, vr22 + vmaddwod.w.h vr3, vr1, vr22 + vssrarni.hu.w vr3, vr2, bpcw_sh + vssrlni.bu.h vr1, vr3, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.d f0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W8_LSX + b .W_AVG_END_LSX +.W_AVG_W16_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W16_LSX + b .W_AVG_END_LSX +.W_AVG_W32_LSX: +.rept 2 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + 
vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W32_LSX + b .W_AVG_END_LSX + +.W_AVG_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W64_LSX + b .W_AVG_END_LSX + +.W_AVG_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W128_LSX +.W_AVG_END_LSX: +endfunc + +function w_avg_8bpc_lasx + addi.d t8, a0, 0 + li.w t2, 16 + sub.w t2, t2, a6 // 16 - weight + xvreplgr2vr.h xr21, a6 + xvreplgr2vr.h xr22, t2 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .W_AVG_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.W_AVG_LASX_JRTABLE: + .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE + +.W_AVG_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + xvpermi.d xr2, xr0, 0xD8 + xvpermi.d xr3, xr1, 0xD8 + xvilvl.h xr4, xr3, xr2 + xvmulwev.w.h xr0, xr4, xr21 + xvmaddwod.w.h xr0, xr4, xr22 + xvssrarni.hu.w xr1, xr0, bpcw_sh + xvssrlni.bu.h xr0, xr1, 0 + fst.s f0, a0, 0 + add.d a0, a0, a1 + xvstelm.w xr0, a0, 0, 4 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a1, a0 + blt zero, a5, .W_AVG_W4_LASX + b .W_AVG_END_LASX + +.W_AVG_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvmulwev.w.h xr2, xr0, xr21 + xvmulwod.w.h xr3, xr0, xr21 + xvmaddwev.w.h xr2, xr1, xr22 + xvmaddwod.w.h xr3, xr1, xr22 + xvssrarni.hu.w xr3, xr2, bpcw_sh + xvssrlni.bu.h xr1, xr3, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvstelm.d xr0, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr0, a0, 0, 2 + addi.w a5, a5, -2 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W8_LASX + b .W_AVG_END_LASX + +.W_AVG_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvmulwev.w.h xr2, xr0, xr21 + xvmulwod.w.h xr3, xr0, xr21 + xvmaddwev.w.h xr2, xr1, xr22 + xvmaddwod.w.h xr3, xr1, xr22 + xvssrarni.hu.w xr3, xr2, bpcw_sh + xvssrlni.bu.h 
xr1, xr3, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvpermi.d xr1, xr0, 0xD8 + vst vr1, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W16_LASX + b .W_AVG_END_LSX + +.W_AVG_W32_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W32_LASX + b .W_AVG_END_LASX + +.W_AVG_W64_LASX: +.rept 2 + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a0, a0, 32 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W64_LASX + b .W_AVG_END_LASX + +.W_AVG_W128_LASX: +.rept 4 + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a0, a0, 32 +.endr + + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W128_LASX +.W_AVG_END_LASX: +endfunc + +#undef bpc_sh +#undef bpcw_sh + +#define mask_sh 10 +/* +static void mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + const uint8_t *mask HIGHBD_DECL_SUFFIX) +*/ +function mask_8bpc_lsx + vldi vr21, 0x440 // 64 + vxor.v vr19, vr19, vr19 + addi.d t8, a0, 0 + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .MASK_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.MASK_LSX_JRTABLE: + .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE + +.MASK_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + fld.d f22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vsub.h vr3, vr21, vr2 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vssrarni.hu.w vr5, vr4, mask_sh + vssrlrni.bu.h vr1, vr5, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + addi.d a2, 
a2, 16 + addi.d a3, a3, 16 + addi.d a6, a6, 8 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W4_LSX + b .MASK_END_LSX +.MASK_W8_LSX: + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + fst.d f0, a0, 0 + add.d a0, a0, a1 + vstelm.d vr0, a0, 0, 1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W8_LSX + b .MASK_END_LSX + +.MASK_W16_LSX: + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W16_LSX + b .MASK_END_LSX +.MASK_W32_LSX: +.rept 2 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W32_LSX + b .MASK_END_LSX +.MASK_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W64_LSX + b .MASK_END_LSX +.MASK_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, 
vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W128_LSX +.MASK_END_LSX: +endfunc + +function mask_8bpc_lasx + xvldi xr21, 0x440 // 64 + xvxor.v xr19, xr19, xr19 + addi.d t8, a0, 0 + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .MASK_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.MASK_LASX_JRTABLE: + .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE + +.MASK_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + fld.d f22, a6, 0 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr14, vr1, vr0 + vilvl.b vr2, vr19, vr22 + vsub.h vr3, vr21, vr2 + xvpermi.q xr14, xr4, 0x20 + vilvl.h vr5, vr3, vr2 + vilvh.h vr15, vr3, vr2 + xvpermi.q xr15, xr5, 0x20 + xvmulwev.w.h xr0, xr14, xr15 + xvmaddwod.w.h xr0, xr14, xr15 + xvssrarni.hu.w xr1, xr0, mask_sh + xvssrlni.bu.h xr2, xr1, 0 + fst.s f2, a0, 0 + add.d a0, a0, a1 + xvstelm.w xr2, a0, 0, 4 + + addi.d a2, a2, 16 + addi.d a3, a3, 16 + addi.d a6, a6, 8 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W4_LASX + b .MASK_END_LASX + +.MASK_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + vld vr22, a6, 0 + + vext2xv.hu.bu xr2, xr22 + xvsub.h xr3, xr21, xr2 + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvssrarni.hu.w xr5, xr4, mask_sh + xvssrlni.bu.h xr1, xr5, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + fst.d f0, a0, 0 + add.d a0, a0, a1 + xvstelm.d xr0, a0, 0, 2 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W8_LASX + b .MASK_END_LASX + +.MASK_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + vld vr22, a6, 0 + + vext2xv.hu.bu xr2, xr22 + xvsub.h xr3, xr21, xr2 + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvssrarni.hu.w xr5, xr4, mask_sh + xvssrlni.bu.h xr1, xr5, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvpermi.d xr1, xr0, 0xD8 + vst vr1, a0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W16_LASX + b .MASK_END_LASX +.MASK_W32_LASX: + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w 
xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W32_LASX + b .MASK_END_LASX + +.MASK_W64_LASX: +.rept 2 + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + addi.d a0, a0, 32 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W64_LASX + b .MASK_END_LASX + +.MASK_W128_LASX: +.rept 4 + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + addi.d a0, a0, 32 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W128_LASX +.MASK_END_LASX: +endfunc + +/* +static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + uint8_t *mask, const int sign, + const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) +*/ +function w_mask_420_8bpc_lsx + addi.d sp, sp, -24 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + vldi vr20, 0x440 + vreplgr2vr.h vr21, a7 + vldi vr22, 0x426 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .WMASK420_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t8, t0, 0 + add.d t1, t1, t8 + jirl $r0, t1, 0 + + .align 3 +.WMASK420_LSX_JRTABLE: + .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE + +.WMASK420_W4_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a3, 0 + vld vr3, a3, 16 + addi.w a5, a5, -4 + + vabsd.h vr4, vr0, vr2 + vabsd.h vr5, vr1, vr3 + vaddi.hu vr4, vr4, 8 + vaddi.hu vr5, vr5, 8 + vsrli.h vr4, vr4, 8 + vsrli.h vr5, vr5, 8 + vadd.h vr4, vr4, vr22 + vadd.h vr5, vr5, vr22 + vmin.hu vr6, vr4, vr20 + vmin.hu vr7, vr5, vr20 + vsub.h vr8, vr20, vr6 + vsub.h vr9, vr20, vr7 + vmulwev.w.h vr4, vr6, vr0 + vmulwod.w.h vr5, vr6, vr0 + vmulwev.w.h vr10, vr7, vr1 + vmulwod.w.h vr11, vr7, vr1 + vmaddwev.w.h vr4, vr8, vr2 + vmaddwod.w.h vr5, vr8, vr2 + vmaddwev.w.h vr10, vr9, 
vr3 + vmaddwod.w.h vr11, vr9, vr3 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvl.w vr2, vr11, vr10 + vilvh.w vr3, vr11, vr10 + vssrarni.hu.w vr1, vr0, 10 + vssrarni.hu.w vr3, vr2, 10 + vssrlni.bu.h vr3, vr1, 0 + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 2 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 3 + add.d a0, a0, a1 + vpickev.h vr0, vr7, vr6 + vpickod.h vr1, vr7, vr6 + vadd.h vr0, vr0, vr1 + vshuf4i.h vr0, vr0, 0xd8 + vhaddw.w.h vr2, vr0, vr0 + vpickev.h vr2, vr2, vr2 + vsub.h vr2, vr2, vr21 + vaddi.hu vr2, vr2, 2 + vssrani.bu.h vr2, vr2, 2 + vstelm.w vr2, a6, 0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W4_LSX + b .END_W420 + +.WMASK420_W8_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a3, 0 + vld vr3, a3, 16 + addi.w a5, a5, -2 + + vabsd.h vr4, vr0, vr2 + vabsd.h vr5, vr1, vr3 + vaddi.hu vr4, vr4, 8 + vaddi.hu vr5, vr5, 8 + vsrli.h vr4, vr4, 8 + vsrli.h vr5, vr5, 8 + vadd.h vr4, vr4, vr22 + vadd.h vr5, vr5, vr22 + vmin.hu vr6, vr4, vr20 + vmin.hu vr7, vr5, vr20 + vsub.h vr8, vr20, vr6 + vsub.h vr9, vr20, vr7 + vmulwev.w.h vr4, vr6, vr0 + vmulwod.w.h vr5, vr6, vr0 + vmulwev.w.h vr10, vr7, vr1 + vmulwod.w.h vr11, vr7, vr1 + vmaddwev.w.h vr4, vr8, vr2 + vmaddwod.w.h vr5, vr8, vr2 + vmaddwev.w.h vr10, vr9, vr3 + vmaddwod.w.h vr11, vr9, vr3 + vssrarni.hu.w vr10, vr4, 10 + vssrarni.hu.w vr11, vr5, 10 + vssrlni.bu.h vr11, vr10, 0 + vshuf4i.w vr0, vr11, 0x4E + vilvl.b vr3, vr0, vr11 + vstelm.d vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr3, a0, 0, 1 + add.d a0, a0, a1 + vpickev.h vr0, vr7, vr6 + vpickod.h vr1, vr7, vr6 + vadd.h vr0, vr0, vr1 + vilvh.d vr2, vr0, vr0 + vadd.h vr2, vr2, vr0 + vsub.h vr2, vr2, vr21 + vaddi.hu vr2, vr2, 2 + vssrani.bu.h vr2, vr2, 2 + vstelm.w vr2, a6, 0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W8_LSX + b .END_W420 + +.WMASK420_W16_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + alsl.d a2, a4, a2, 1 + vld vr2, a2, 0 + vld vr3, a2, 16 + vld vr4, a3, 0 + vld vr5, a3, 16 + alsl.d a3, a4, a3, 1 + vld vr6, a3, 0 + vld vr7, a3, 16 + + vabsd.h vr8, vr0, vr4 + vabsd.h vr9, vr1, vr5 + vabsd.h vr10, vr2, vr6 + vabsd.h vr11, vr3, vr7 + vaddi.hu vr8, vr8, 8 + vaddi.hu vr9, vr9, 8 + vaddi.hu vr10, vr10, 8 + vaddi.hu vr11, vr11, 8 + vsrli.h vr8, vr8, 8 + vsrli.h vr9, vr9, 8 + vsrli.h vr10, vr10, 8 + vsrli.h vr11, vr11, 8 + vadd.h vr8, vr8, vr22 + vadd.h vr9, vr9, vr22 + vadd.h vr10, vr10, vr22 + vadd.h vr11, vr11, vr22 + vmin.hu vr12, vr8, vr20 + vmin.hu vr13, vr9, vr20 + vmin.hu vr14, vr10, vr20 + vmin.hu vr15, vr11, vr20 + vsub.h vr16, vr20, vr12 + vsub.h vr17, vr20, vr13 + vsub.h vr18, vr20, vr14 + vsub.h vr19, vr20, vr15 + vmulwev.w.h vr8, vr12, vr0 + vmulwod.w.h vr9, vr12, vr0 + vmulwev.w.h vr10, vr13, vr1 + vmulwod.w.h vr11, vr13, vr1 + vmulwev.w.h vr23, vr14, vr2 + vmulwod.w.h vr24, vr14, vr2 + vmulwev.w.h vr25, vr15, vr3 + vmulwod.w.h vr26, vr15, vr3 + vmaddwev.w.h vr8, vr16, vr4 + vmaddwod.w.h vr9, vr16, vr4 + vmaddwev.w.h vr10, vr17, vr5 + vmaddwod.w.h vr11, vr17, vr5 + vmaddwev.w.h vr23, vr18, vr6 + vmaddwod.w.h vr24, vr18, vr6 + vmaddwev.w.h vr25, vr19, vr7 + vmaddwod.w.h vr26, vr19, vr7 + vssrarni.hu.w vr10, vr8, 10 + vssrarni.hu.w vr11, vr9, 10 + vssrarni.hu.w vr25, vr23, 10 + vssrarni.hu.w vr26, vr24, 10 + vssrlni.bu.h vr11, vr10, 0 + vssrlni.bu.h vr26, vr25, 0 + vshuf4i.w vr0, vr11, 0x4E + vshuf4i.w vr1, vr26, 0x4E + vilvl.b vr3, vr0, vr11 + vilvl.b vr7, vr1, vr26 + vst vr3, a0, 0 + vstx vr7, a0, 
a1 + vpickev.h vr0, vr13, vr12 + vpickod.h vr1, vr13, vr12 + vpickev.h vr2, vr15, vr14 + vpickod.h vr3, vr15, vr14 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vadd.h vr4, vr4, vr5 + vsub.h vr4, vr4, vr21 + vssrarni.bu.h vr4, vr4, 2 + vstelm.d vr4, a6, 0, 0 + + alsl.d a2, a4, a2, 1 + alsl.d a3, a4, a3, 1 + alsl.d a0, a1, a0, 1 + addi.d a6, a6, 8 + addi.w a5, a5, -2 + blt zero, a5, .WMASK420_W16_LSX + b .END_W420 + +.WMASK420_W32_LSX: +.WMASK420_W64_LSX: +.WMASK420_W128_LSX: + +.LOOP_W32_420_LSX: + add.d t1, a2, zero + add.d t2, a3, zero + add.d t3, a0, zero + add.d t4, a6, zero + alsl.d t5, a4, t1, 1 + alsl.d t6, a4, t2, 1 + or t7, a4, a4 + +.W32_420_LSX: + vld vr0, t1, 0 + vld vr1, t1, 16 + vld vr2, t2, 0 + vld vr3, t2, 16 + vld vr4, t5, 0 + vld vr5, t5, 16 + vld vr6, t6, 0 + vld vr7, t6, 16 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 32 + addi.w t7, t7, -16 + vabsd.h vr8, vr0, vr2 + vabsd.h vr9, vr1, vr3 + vabsd.h vr10, vr4, vr6 + vabsd.h vr11, vr5, vr7 + vaddi.hu vr8, vr8, 8 + vaddi.hu vr9, vr9, 8 + vaddi.hu vr10, vr10, 8 + vaddi.hu vr11, vr11, 8 + vsrli.h vr8, vr8, 8 + vsrli.h vr9, vr9, 8 + vsrli.h vr10, vr10, 8 + vsrli.h vr11, vr11, 8 + vadd.h vr8, vr8, vr22 + vadd.h vr9, vr9, vr22 + vadd.h vr10, vr10, vr22 + vadd.h vr11, vr11, vr22 + vmin.hu vr12, vr8, vr20 + vmin.hu vr13, vr9, vr20 + vmin.hu vr14, vr10, vr20 + vmin.hu vr15, vr11, vr20 + vsub.h vr16, vr20, vr12 + vsub.h vr17, vr20, vr13 + vsub.h vr18, vr20, vr14 + vsub.h vr19, vr20, vr15 + vmulwev.w.h vr8, vr12, vr0 + vmulwod.w.h vr9, vr12, vr0 + vmulwev.w.h vr10, vr13, vr1 + vmulwod.w.h vr11, vr13, vr1 + vmulwev.w.h vr23, vr14, vr4 + vmulwod.w.h vr24, vr14, vr4 + vmulwev.w.h vr25, vr15, vr5 + vmulwod.w.h vr26, vr15, vr5 + vmaddwev.w.h vr8, vr16, vr2 + vmaddwod.w.h vr9, vr16, vr2 + vmaddwev.w.h vr10, vr17, vr3 + vmaddwod.w.h vr11, vr17, vr3 + vmaddwev.w.h vr23, vr18, vr6 + vmaddwod.w.h vr24, vr18, vr6 + vmaddwev.w.h vr25, vr19, vr7 + vmaddwod.w.h vr26, vr19, vr7 + vssrarni.hu.w vr10, vr8, 10 + vssrarni.hu.w vr11, vr9, 10 + vssrarni.hu.w vr25, vr23, 10 + vssrarni.hu.w vr26, vr24, 10 + vssrlni.bu.h vr11, vr10, 0 + vssrlni.bu.h vr26, vr25, 0 + vshuf4i.w vr8, vr11, 0x4E + vshuf4i.w vr9, vr26, 0x4E + vilvl.b vr3, vr8, vr11 + vilvl.b vr7, vr9, vr26 + vst vr3, t3, 0 + vstx vr7, a1, t3 + addi.d t3, t3, 16 + vpickev.h vr8, vr13, vr12 + vpickod.h vr9, vr13, vr12 + vpickev.h vr10, vr15, vr14 + vpickod.h vr11, vr15, vr14 + vadd.h vr8, vr8, vr9 + vadd.h vr10, vr10, vr11 + vadd.h vr12, vr8, vr10 + vsub.h vr12, vr12, vr21 + vssrarni.bu.h vr12, vr12, 2 + vstelm.d vr12, t4, 0, 0 + addi.d t4, t4, 8 + bne t7, zero, .W32_420_LSX + + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + alsl.d a0, a1, a0, 1 + srai.w t8, a4, 1 + add.d a6, a6, t8 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_W32_420_LSX + +.END_W420: + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + addi.d sp, sp, 24 +endfunc + +function w_mask_420_8bpc_lasx + xvldi xr20, 0x440 + xvreplgr2vr.h xr21, a7 + xvldi xr22, 0x426 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .WMASK420_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t8, t0, 0 + add.d t1, t1, t8 + jirl $r0, t1, 0 + + .align 3 +.WMASK420_LASX_JRTABLE: + .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE + +.WMASK420_W4_LASX: + 
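+ // w == 4: one 256-bit load covers 4 rows of tmp1/tmp2; each pass below writes
+ // 4 dst rows and 2 rows of the 2x2-subsampled 4:2:0 mask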
xvld xr0, a2, 0 + xvld xr1, a3, 0 + addi.w a5, a5, -4 + + xvabsd.h xr2, xr0, xr1 + xvaddi.hu xr2, xr2, 8 + xvsrli.h xr2, xr2, 8 + xvadd.h xr2, xr2, xr22 + xvmin.hu xr3, xr2, xr20 + xvsub.h xr4, xr20, xr3 + xvmulwev.w.h xr5, xr3, xr0 + xvmulwod.w.h xr6, xr3, xr0 + xvmaddwev.w.h xr5, xr4, xr1 + xvmaddwod.w.h xr6, xr4, xr1 + xvilvl.w xr7, xr6, xr5 + xvilvh.w xr8, xr6, xr5 + xvssrarni.hu.w xr8, xr7, 10 + xvssrlni.bu.h xr9, xr8, 0 + vstelm.w vr9, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr9, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.w xr9, a0, 0, 4 + add.d a0, a0, a1 + xvstelm.w xr9, a0, 0, 5 + add.d a0, a0, a1 + + xvhaddw.w.h xr3, xr3, xr3 + xvpermi.d xr4, xr3, 0xb1 + xvadd.h xr3, xr3, xr4 + xvpickev.h xr3, xr3, xr3 + xvsub.h xr3, xr3, xr21 + xvssrarni.bu.h xr3, xr3, 2 + vstelm.h vr3, a6, 0, 0 + xvstelm.h xr3, a6, 2, 8 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W4_LASX + b .END_W420_LASX + +.WMASK420_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a2, 32 + xvld xr2, a3, 0 + xvld xr3, a3, 32 + addi.w a5, a5, -4 + + xvabsd.h xr4, xr0, xr2 + xvabsd.h xr5, xr1, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr6, xr4, xr20 + xvmin.hu xr7, xr5, xr20 + xvsub.h xr8, xr20, xr6 + xvsub.h xr9, xr20, xr7 + xvmulwev.w.h xr10, xr6, xr0 + xvmulwod.w.h xr11, xr6, xr0 + xvmulwev.w.h xr12, xr7, xr1 + xvmulwod.w.h xr13, xr7, xr1 + xvmaddwev.w.h xr10, xr8, xr2 + xvmaddwod.w.h xr11, xr8, xr2 + xvmaddwev.w.h xr12, xr9, xr3 + xvmaddwod.w.h xr13, xr9, xr3 + xvssrarni.hu.w xr12, xr10, 10 + xvssrarni.hu.w xr13, xr11, 10 + xvssrlni.bu.h xr13, xr12, 0 + xvshuf4i.w xr1, xr13, 0x4E + xvilvl.b xr17, xr1, xr13 + vstelm.d vr17, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 2 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 3 + add.d a0, a0, a1 + + xvhaddw.w.h xr6, xr6, xr6 + xvhaddw.w.h xr7, xr7, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.q xr9, xr8, 0x01 + vadd.h vr8, vr8, vr9 + vsub.h vr8, vr8, vr21 + vssrarni.bu.h vr8, vr8, 2 + vstelm.d vr8, a6, 0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 8 + blt zero, a5, .WMASK420_W8_LASX + b .END_W420_LASX + +.WMASK420_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a2, 32 + xvld xr2, a3, 0 + xvld xr3, a3, 32 + addi.w a5, a5, -2 + + xvabsd.h xr4, xr0, xr2 + xvabsd.h xr5, xr1, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr4, xr4, xr20 + xvmin.hu xr5, xr5, xr20 + xvsub.h xr6, xr20, xr4 + xvsub.h xr7, xr20, xr5 + xvmulwev.w.h xr8, xr4, xr0 + xvmulwod.w.h xr9, xr4, xr0 + xvmulwev.w.h xr10, xr5, xr1 + xvmulwod.w.h xr11, xr5, xr1 + xvmaddwev.w.h xr8, xr6, xr2 + xvmaddwod.w.h xr9, xr6, xr2 + xvmaddwev.w.h xr10, xr7, xr3 + xvmaddwod.w.h xr11, xr7, xr3 + xvssrarni.hu.w xr10, xr8, 10 + xvssrarni.hu.w xr11, xr9, 10 + xvssrlni.bu.h xr11, xr10, 0 + xvshuf4i.w xr8, xr11, 0x4E + xvilvl.b xr15, xr8, xr11 + xvpermi.d xr16, xr15, 0xd8 + vst vr16, a0, 0 + add.d a0, a0, a1 + xvpermi.q xr16, xr16, 0x01 + vst vr16, a0, 0 + add.d a0, a0, a1 + + xvhaddw.w.h xr4, xr4, xr4 + xvhaddw.w.h xr5, xr5, xr5 + xvadd.h xr4, xr5, xr4 + xvpickev.h xr6, xr4, xr4 + xvpermi.d xr7, xr6, 0x08 + vsub.h vr7, vr7, vr21 + vssrarni.bu.h vr7, vr7, 2 + vstelm.d vr7, a6, 0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 8 + blt zero, a5, .WMASK420_W16_LASX + b .END_W420_LASX + +.WMASK420_W32_LASX: 
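+ // w == 32, 64 and 128 all fall through to the shared loop below; each inner
+ // pass blends 16 pixels of two rows and emits 8 bytes of mask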
+.WMASK420_W64_LASX: +.WMASK420_W128_LASX: + +.LOOP_W32_420_LASX: + add.d t1, a2, zero + add.d t2, a3, zero + add.d t3, a0, zero + add.d t4, a6, zero + alsl.d t5, a4, t1, 1 + alsl.d t6, a4, t2, 1 + or t7, a4, a4 +.W32_420_LASX: + xvld xr0, t1, 0 + xvld xr1, t2, 0 + xvld xr2, t5, 0 + xvld xr3, t6, 0 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 32 + addi.w t7, t7, -16 + xvabsd.h xr4, xr0, xr1 + xvabsd.h xr5, xr2, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr6, xr4, xr20 + xvmin.hu xr7, xr5, xr20 + xvsub.h xr8, xr20, xr6 + xvsub.h xr9, xr20, xr7 + xvmulwev.w.h xr10, xr6, xr0 + xvmulwod.w.h xr11, xr6, xr0 + xvmulwev.w.h xr12, xr7, xr2 + xvmulwod.w.h xr13, xr7, xr2 + xvmaddwev.w.h xr10, xr8, xr1 + xvmaddwod.w.h xr11, xr8, xr1 + xvmaddwev.w.h xr12, xr9, xr3 + xvmaddwod.w.h xr13, xr9, xr3 + xvssrarni.hu.w xr12, xr10, 10 + xvssrarni.hu.w xr13, xr11, 10 + xvssrlni.bu.h xr13, xr12, 0 + xvshuf4i.w xr10, xr13, 0x4E + xvilvl.b xr17, xr10, xr13 + xvpermi.d xr18, xr17, 0x08 + xvpermi.d xr19, xr17, 0x0d + vst vr18, t3, 0 + vstx vr19, t3, a1 + addi.d t3, t3, 16 + + xvhaddw.w.h xr6, xr6, xr6 + xvhaddw.w.h xr7, xr7, xr7 + xvadd.h xr6, xr7, xr6 + xvpickev.h xr7, xr6, xr6 + xvpermi.d xr8, xr7, 0x08 + vsub.h vr9, vr8, vr21 + vssrarni.bu.h vr9, vr9, 2 + vstelm.d vr9, t4, 0, 0 + addi.d t4, t4, 8 + bne t7, zero, .W32_420_LASX + + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + alsl.d a0, a1, a0, 1 + srai.w t8, a4, 1 + add.d a6, a6, t8 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_W32_420_LASX + +.END_W420_LASX: +endfunc + +#undef bpc_sh +#undef bpcw_sh + +.macro vhaddw.d.h in0 + vhaddw.w.h \in0, \in0, \in0 + vhaddw.d.w \in0, \in0, \in0 +.endm +.macro vhaddw.q.w in0 + vhaddw.d.w \in0, \in0, \in0 + vhaddw.q.d \in0, \in0, \in0 +.endm +.macro PUT_H_8W in0 + vbsrl.v vr2, \in0, 1 + vbsrl.v vr3, \in0, 2 + vbsrl.v vr4, \in0, 3 + vbsrl.v vr5, \in0, 4 + vbsrl.v vr6, \in0, 5 + vbsrl.v vr7, \in0, 6 + vbsrl.v vr10, \in0, 7 + vilvl.d vr2, vr2, \in0 + vilvl.d vr3, vr4, vr3 + vilvl.d vr4, vr6, vr5 + vilvl.d vr5, vr10, vr7 + vdp2.h.bu.b \in0, vr2, vr8 + vdp2.h.bu.b vr2, vr3, vr8 + vdp2.h.bu.b vr3, vr4, vr8 + vdp2.h.bu.b vr4, vr5, vr8 + vhaddw.d.h \in0 + vhaddw.d.h vr2 + vhaddw.d.h vr3 + vhaddw.d.h vr4 + vpickev.w \in0, vr2, \in0 + vpickev.w vr2, vr4, vr3 + vpickev.h \in0, vr2, \in0 + vadd.h \in0, \in0, vr9 +.endm +.macro FILTER_8TAP_4W in0 + vbsrl.v vr10, \in0, 1 + vbsrl.v vr11, \in0, 2 + vbsrl.v vr12, \in0, 3 + vilvl.d vr10, vr10, \in0 + vilvl.d vr11, vr12, vr11 + vdp2.h.bu.b vr7, vr10, vr8 + vdp2.h.bu.b vr10, vr11, vr8 + vhaddw.d.h vr7 + vhaddw.d.h vr10 + vpickev.w \in0, vr10, vr7 +.endm +.macro FILTER_8TAP_8W in0 + vbsrl.v vr10, \in0, 1 + vbsrl.v vr11, \in0, 2 + vbsrl.v vr12, \in0, 3 + vbsrl.v vr13, \in0, 4 + vbsrl.v vr14, \in0, 5 + vbsrl.v vr15, \in0, 6 + vbsrl.v vr16, \in0, 7 + vilvl.d vr10, vr10, \in0 + vilvl.d vr11, vr12, vr11 + vilvl.d vr12, vr14, vr13 + vilvl.d vr13, vr16, vr15 + vdp2.h.bu.b vr14, vr10, vr8 + vdp2.h.bu.b vr15, vr11, vr8 + vdp2.h.bu.b vr16, vr12, vr8 + vdp2.h.bu.b vr17, vr13, vr8 + vhaddw.d.h vr14 + vhaddw.d.h vr15 + vhaddw.d.h vr16 + vhaddw.d.h vr17 + vpickev.w vr13, vr15, vr14 + vpickev.w vr14, vr17, vr16 + vpickev.h \in0, vr14, vr13 //x0 ... 
x7 + vsrari.h \in0, \in0, 2 +.endm +.macro FILTER_8TAP_8W_CLIP_STORE + vdp2.w.h vr12, vr0, vr9 + vdp2.w.h vr13, vr1, vr9 + vdp2.w.h vr14, vr2, vr9 + vdp2.w.h vr15, vr3, vr9 + vdp2.w.h vr16, vr4, vr9 + vdp2.w.h vr17, vr5, vr9 + vdp2.w.h vr18, vr6, vr9 + vdp2.w.h vr19, vr7, vr9 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vhaddw.q.w vr15 + vhaddw.q.w vr16 + vhaddw.q.w vr17 + vhaddw.q.w vr18 + vhaddw.q.w vr19 + vpackev.w vr12, vr13, vr12 + vpackev.w vr13, vr15, vr14 + vpackev.d vr12, vr13, vr12 + vpackev.w vr14, vr17, vr16 + vpackev.w vr15, vr19, vr18 + vpackev.d vr13, vr15, vr14 + vssrarni.hu.w vr13, vr12, 10 + vssrani.bu.h vr13, vr13, 0 + vstelm.d vr13, a0, 0, 0 + add.d a0, a0, a1 +.endm +.macro VEXTRINS_Hx8 in0 + vextrins.h vr0, \in0, 0x70 + vextrins.h vr1, \in0, 0x71 + vextrins.h vr2, \in0, 0x72 + vextrins.h vr3, \in0, 0x73 + vextrins.h vr4, \in0, 0x74 + vextrins.h vr5, \in0, 0x75 + vextrins.h vr6, \in0, 0x76 + vextrins.h vr7, \in0, 0x77 +.endm +.macro VBSRL_Vx8 + vbsrl.v vr0, vr0, 2 + vbsrl.v vr1, vr1, 2 + vbsrl.v vr2, vr2, 2 + vbsrl.v vr3, vr3, 2 + vbsrl.v vr4, vr4, 2 + vbsrl.v vr5, vr5, 2 + vbsrl.v vr6, vr6, 2 + vbsrl.v vr7, vr7, 2 +.endm + +.macro PUT_8TAP_8BPC_LSX lable + li.w t0, 4 + la.local t6, dav1d_mc_subpel_filters + slli.d t2, a3, 1 //src_stride*2 + add.d t3, t2, a3 //src_stride*3 + slli.d t4, t2, 1 //src_stride*4 + + bnez a6, .l_\lable\()put_h //mx + bnez a7, .l_\lable\()put_v //my + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_hv0_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_hv0_jtable: + .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable + +.l_\lable\()put_hv0_2w: + vldrepl.h vr0, a2, 0 + add.d a2, a2, a3 + vldrepl.h vr1, a2, 0 + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr1, a0, 0, 0 + add.d a2, a2, a3 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_2w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_4w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_4w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_8w: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_8w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_16w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vst vr0, a0, 0 + vstx vr1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_16w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_32w: + vld vr0, a2, 0 + vld vr1, a2, 16 + add.d a2, a2, a3 + vld vr2, a2, 0 + vld vr3, a2, 16 + vst vr0, a0, 0 + vst vr1, a0, 16 + add.d a0, a0, a1 + vst vr2, a0, 0 + vst vr3, a0, 16 + add.d a2, a2, a3 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_32w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_64w: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + add.d a2, a2, a3 + vld vr4, a2, 0 + vld vr5, a2, 16 + vld vr6, a2, 32 + vld vr7, a2, 
48 + add.d a2, a2, a3 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a0, 32 + vst vr3, a0, 48 + add.d a0, a0, a1 + vst vr4, a0, 0 + vst vr5, a0, 16 + vst vr6, a0, 32 + vst vr7, a0, 48 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_64w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_128w: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + vld vr4, a2, 64 + vld vr5, a2, 80 + vld vr6, a2, 96 + vld vr7, a2, 112 + add.d a2, a2, a3 + vld vr8, a2, 0 + vld vr9, a2, 16 + vld vr10, a2, 32 + vld vr11, a2, 48 + vld vr12, a2, 64 + vld vr13, a2, 80 + vld vr14, a2, 96 + vld vr15, a2, 112 + add.d a2, a2, a3 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a0, 32 + vst vr3, a0, 48 + vst vr4, a0, 64 + vst vr5, a0, 80 + vst vr6, a0, 96 + vst vr7, a0, 112 + add.d a0, a0, a1 + vst vr8, a0, 0 + vst vr9, a0, 16 + vst vr10, a0, 32 + vst vr11, a0, 48 + vst vr12, a0, 64 + vst vr13, a0, 80 + vst vr14, a0, 96 + vst vr15, a0, 112 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_128w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h: + bnez a7, .l_\lable\()put_hv //if(fh) && if (fv) + ld.d t5, sp, 0 //filter_type + andi t1, t5, 3 + blt t0, a4, .l_\lable\()put_h_idx_fh + andi t1, t5, 1 + addi.w t1, t1, 3 + +.l_\lable\()put_h_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + vldrepl.d vr8, t1, 0 + addi.d a2, a2, -3 + li.w t1, 34 + vreplgr2vr.h vr9, t1 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_h_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_h_jtable: + .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable + +.l_\lable\()put_h_2w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr2, vr0, 1 + vilvl.d vr0, vr2, vr0 + vdp2.h.bu.b vr2, vr0, vr8 + vhaddw.w.h vr0, vr2, vr2 + vhaddw.d.w vr0, vr0, vr0 + vbsrl.v vr2, vr1, 1 + vilvl.d vr1, vr2, vr1 + vdp2.h.bu.b vr2, vr1, vr8 + vhaddw.w.h vr1, vr2, vr2 + vhaddw.d.w vr1, vr1, vr1 + vpickev.w vr0, vr1, vr0 + vpickev.h vr0, vr0, vr0 + vadd.h vr0, vr0, vr9 + vssrani.bu.h vr0, vr0, 6 + + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_h_2w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_4w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr2, vr0, 1 + vbsrl.v vr3, vr0, 2 + vbsrl.v vr4, vr0, 3 + vilvl.d vr0, vr2, vr0 //x0 x1 + vilvl.d vr2, vr4, vr3 //x2 x3 + vdp2.h.bu.b vr3, vr0, vr8 + vdp2.h.bu.b vr4, vr2, vr8 + vhaddw.w.h vr0, vr3, vr3 + vhaddw.d.w vr0, vr0, vr0 + vhaddw.w.h vr2, vr4, vr4 + vhaddw.d.w vr2, vr2, vr2 + vpickev.w vr5, vr2, vr0 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + vbsrl.v vr4, vr1, 3 + vilvl.d vr0, vr2, vr1 //x0 x1 + vilvl.d vr2, vr4, vr3 //x2 x3 + vdp2.h.bu.b vr3, vr0, vr8 + vdp2.h.bu.b vr4, vr2, vr8 + vhaddw.w.h vr0, vr3, vr3 + vhaddw.d.w vr0, vr0, vr0 + vhaddw.w.h vr2, vr4, vr4 + vhaddw.d.w vr2, vr2, vr2 + vpickev.w vr6, vr2, vr0 + vpickev.h vr0, vr6, vr5 + vadd.h vr0, vr0, vr9 + vssrani.bu.h vr0, vr0, 6 + + vstelm.w vr0, a0, 0, 0 + add.d 
a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_h_4w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_8w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + PUT_H_8W vr0 + PUT_H_8W vr1 + vssrani.bu.h vr1, vr0, 6 + vstelm.d vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr1, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_h_8w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_16w: +.l_\lable\()put_h_32w: +.l_\lable\()put_h_64w: +.l_\lable\()put_h_128w: + addi.d t0, a2, 0 //src + addi.w t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_h_16w_loop: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + PUT_H_8W vr0 + PUT_H_8W vr1 + vssrani.bu.h vr1, vr0, 6 + vstelm.d vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr1, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_h_16w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.w a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_h_16w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v: + ld.d t1, sp, 0 //filter_type + srli.w t1, t1, 2 + blt t0, a5, .l_\lable\()put_v_idx_fv + andi t1, t1, 1 + addi.w t1, t1, 3 + +.l_\lable\()put_v_idx_fv: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a7, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fv's offset + vldrepl.d vr8, t1, 0 + sub.d a2, a2, t3 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_v_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_v_jtable: + .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable + +.l_\lable\()put_v_2w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t2 + add.d a2, a2, t3 + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + fldx.s f5, a2, t2 + fldx.s f6, a2, t3 + add.d a2, a2, t4 + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 + vilvl.w vr0, vr1, vr0 + +.l_\lable\()put_v_2w_loop: + fld.s f7, a2, 0 //h0 + fldx.s f10, a2, a3 //h1 + add.d a2, a2, t2 + + vextrins.b vr0, vr7, 0x70 + vextrins.b vr0, vr7, 0xf1 + vbsrl.v vr1, vr0, 1 + vextrins.b vr1, vr10, 0x70 + vextrins.b vr1, vr10, 0xf1 + vdp2.h.bu.b vr10, vr0, vr8 + vdp2.h.bu.b vr11, vr1, vr8 + vbsrl.v vr0, vr1, 1 + vhaddw.d.h vr10 + vhaddw.d.h vr11 + vpickev.w vr10, vr11, vr10 + vssrarni.hu.w vr10, vr10, 6 + vssrani.bu.h vr10, vr10, 0 + + vstelm.h vr10, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr10, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_2w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v_4w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t2 + add.d a2, a2, t3 + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + fldx.s f5, a2, t2 + fldx.s f6, a2, t3 + add.d a2, a2, t4 + + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 + vilvl.w vr2, vr1, vr0 + vilvh.w vr3, vr1, vr0 + +.l_\lable\()put_v_4w_loop: + fld.s f7, a2, 0 + fldx.s f10, a2, a3 + add.d a2, a2, t2 + + vextrins.b vr2, vr7, 0x70 + vextrins.b 
vr2, vr7, 0xf1 //x0x1(h0) + vbsrl.v vr4, vr2, 1 + vextrins.b vr4, vr10, 0x70 + vextrins.b vr4, vr10, 0xf1 //x0x1(h1) + vdp2.h.bu.b vr11, vr2, vr8 + vdp2.h.bu.b vr12, vr4, vr8 + vbsrl.v vr2, vr4, 1 + + vextrins.b vr3, vr7, 0x72 + vextrins.b vr3, vr7, 0xf3 //x2x3(h0) + vbsrl.v vr4, vr3, 1 + vextrins.b vr4, vr10, 0x72 + vextrins.b vr4, vr10, 0xf3 //x2x3(h1) + vdp2.h.bu.b vr13, vr3, vr8 + vdp2.h.bu.b vr14, vr4, vr8 + vbsrl.v vr3, vr4, 1 + + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + + vpickev.w vr11, vr13, vr11 + vpickev.w vr12, vr14, vr12 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + vstelm.w vr11, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr11, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_4w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v_8w: +.l_\lable\()put_v_16w: +.l_\lable\()put_v_32w: +.l_\lable\()put_v_64w: +.l_\lable\()put_v_128w: + addi.d t0, a2, 0 //src + addi.d t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_v_8w_loop0: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t2 + add.d a2, a2, t3 + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + fldx.d f5, a2, t2 + fldx.d f6, a2, t3 + add.d a2, a2, t4 + + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr6, vr3, vr2 + vilvh.h vr7, vr3, vr2 + vilvl.w vr0, vr6, vr4 // x0x1 + vilvh.w vr1, vr6, vr4 // x2x3 + vilvl.w vr2, vr7, vr5 // x4x5 + vilvh.w vr3, vr7, vr5 // x6x7 +.l_\lable\()put_v_8w_loop: + fld.d f7, a2, 0 + fldx.d f10, a2, a3 + add.d a2, a2, t2 + //h0 + vextrins.b vr0, vr7, 0x70 + vextrins.b vr0, vr7, 0xf1 + vextrins.b vr1, vr7, 0x72 + vextrins.b vr1, vr7, 0xf3 + vextrins.b vr2, vr7, 0x74 + vextrins.b vr2, vr7, 0xf5 + vextrins.b vr3, vr7, 0x76 + vextrins.b vr3, vr7, 0xf7 + vdp2.h.bu.b vr11, vr0, vr8 + vdp2.h.bu.b vr12, vr1, vr8 + vdp2.h.bu.b vr13, vr2, vr8 + vdp2.h.bu.b vr14, vr3, vr8 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vpickev.w vr11, vr12, vr11 + vpickev.w vr12, vr14, vr13 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + fst.d f11, a0, 0 + add.d a0, a0, a1 + //h1 + vbsrl.v vr0, vr0, 1 + vbsrl.v vr1, vr1, 1 + vbsrl.v vr2, vr2, 1 + vbsrl.v vr3, vr3, 1 + vextrins.b vr0, vr10, 0x70 + vextrins.b vr0, vr10, 0xf1 + vextrins.b vr1, vr10, 0x72 + vextrins.b vr1, vr10, 0xf3 + vextrins.b vr2, vr10, 0x74 + vextrins.b vr2, vr10, 0xf5 + vextrins.b vr3, vr10, 0x76 + vextrins.b vr3, vr10, 0xf7 + vdp2.h.bu.b vr11, vr0, vr8 + vdp2.h.bu.b vr12, vr1, vr8 + vdp2.h.bu.b vr13, vr2, vr8 + vdp2.h.bu.b vr14, vr3, vr8 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vpickev.w vr11, vr12, vr11 + vpickev.w vr12, vr14, vr13 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + fst.d f11, a0, 0 + add.d a0, a0, a1 + vbsrl.v vr0, vr0, 1 + vbsrl.v vr1, vr1, 1 + vbsrl.v vr2, vr2, 1 + vbsrl.v vr3, vr3, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_8w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_v_8w_loop0 + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv: + ld.d t5, sp, 0 //filter_type + andi t1, t5, 3 + blt t0, a4, .l_\lable\()put_hv_idx_fh + andi t1, t5, 1 + addi.w t1, t1, 3 +.l_\lable\()put_hv_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + vldrepl.d vr8, t1, 0 + ld.d t1, sp, 0 //filter_type + srli.w 
t1, t1, 2 + blt t0, a5, .l_\lable\()put_hv_idx_fv + andi t1, t1, 1 + addi.w t1, t1, 3 +.l_\lable\()put_hv_idx_fv: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a7, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fv's offset + vldrepl.d vr9, t1, 0 + vexth.h.b vr9, vr9 + + sub.d a2, a2, t3 + addi.d a2, a2, -3 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_hv_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_hv_jtable: + .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable + +.l_\lable\()put_hv_2w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + + vbsrl.v vr10, vr0, 1 + vbsrl.v vr11, vr1, 1 + vbsrl.v vr12, vr2, 1 + vbsrl.v vr13, vr3, 1 + vbsrl.v vr14, vr4, 1 + vbsrl.v vr15, vr5, 1 + vbsrl.v vr16, vr6, 1 + vilvl.d vr0, vr10, vr0 + vilvl.d vr1, vr11, vr1 + vilvl.d vr2, vr12, vr2 + vilvl.d vr3, vr13, vr3 + vilvl.d vr4, vr14, vr4 + vilvl.d vr5, vr15, vr5 + vilvl.d vr6, vr16, vr6 + vdp2.h.bu.b vr10, vr0, vr8 + vdp2.h.bu.b vr11, vr1, vr8 + vdp2.h.bu.b vr12, vr2, vr8 + vdp2.h.bu.b vr13, vr3, vr8 + vdp2.h.bu.b vr14, vr4, vr8 + vdp2.h.bu.b vr15, vr5, vr8 + vdp2.h.bu.b vr16, vr6, vr8 + vhaddw.d.h vr10 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vhaddw.d.h vr15 + vhaddw.d.h vr16 + + vpackev.w vr10, vr11, vr10 + vpackev.w vr12, vr13, vr12 + vpackod.d vr11, vr12, vr10 + vpackev.d vr10, vr12, vr10 + + vpackev.w vr12, vr15, vr14 + vpackev.w vr16, vr17, vr16 + vpackod.d vr13, vr16, vr12 + vpackev.d vr12, vr16, vr12 + + vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0) + vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1) + vsrari.h vr10, vr10, 2 + vsrari.h vr11, vr11, 2 +.l_\lable\()put_hv_2w_loop: + vld vr7, a2, 0 + vldx vr12, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr1, vr7, 1 + vbsrl.v vr2, vr12, 1 + vilvl.d vr0, vr1, vr7 + vilvl.d vr1, vr2, vr12 + vdp2.h.bu.b vr2, vr0, vr8 + vdp2.h.bu.b vr3, vr1, vr8 + vhaddw.d.h vr2 + vhaddw.d.h vr3 + vpickev.w vr2, vr3, vr2 + vpickev.h vr2, vr2, vr2 + vsrari.h vr2, vr2, 2 + vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7 + vextrins.h vr11, vr2, 0x71 + vbsrl.v vr12, vr10, 2 + vbsrl.v vr13, vr11, 2 + vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8 + vextrins.h vr13, vr2, 0x73 + vdp2.w.h vr0, vr10, vr9 + vdp2.w.h vr1, vr11, vr9 + vdp2.w.h vr2, vr12, vr9 + vdp2.w.h vr3, vr13, vr9 + vhaddw.q.w vr0 + vhaddw.q.w vr1 + vhaddw.q.w vr2 + vhaddw.q.w vr3 + vpackev.w vr0, vr1, vr0 + vpackev.w vr1, vr3, vr2 + vpackev.d vr0, vr1, vr0 + vssrarni.hu.w vr0, vr0, 10 + vssrani.bu.h vr0, vr0, 0 + vbsrl.v vr10, vr12, 2 + vbsrl.v vr11, vr13, 2 + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_hv_2w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv_4w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + FILTER_8TAP_4W vr0 //x0 x1 x2 x3 + FILTER_8TAP_4W vr1 + 
FILTER_8TAP_4W vr2 + FILTER_8TAP_4W vr3 + FILTER_8TAP_4W vr4 + FILTER_8TAP_4W vr5 + FILTER_8TAP_4W vr6 + vpackev.h vr0, vr1, vr0 + vpackev.h vr1, vr3, vr2 + vpackev.h vr2, vr5, vr4 + vpackev.h vr3, vr7, vr6 + vilvl.w vr4, vr1, vr0 + vilvh.w vr5, vr1, vr0 + vilvl.w vr6, vr3, vr2 + vilvh.w vr7, vr3, vr2 + vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 * + vilvh.d vr1, vr6, vr4 + vilvl.d vr2, vr7, vr5 + vilvh.d vr3, vr7, vr5 + vsrari.h vr0, vr0, 2 + vsrari.h vr1, vr1, 2 + vsrari.h vr2, vr2, 2 + vsrari.h vr3, vr3, 2 +.l_\lable\()put_hv_4w_loop: + vld vr4, a2, 0 + vldx vr5, a2, a3 + add.d a2, a2, t2 + FILTER_8TAP_4W vr4 + FILTER_8TAP_4W vr5 + vpickev.h vr4, vr5, vr4 + vsrari.h vr4, vr4, 2 + vextrins.h vr0, vr4, 0x70 + vextrins.h vr1, vr4, 0x71 + vextrins.h vr2, vr4, 0x72 + vextrins.h vr3, vr4, 0x73 + vbsrl.v vr5, vr0, 2 + vbsrl.v vr6, vr1, 2 + vbsrl.v vr7, vr2, 2 + vbsrl.v vr10, vr3, 2 + vextrins.h vr5, vr4, 0x74 + vextrins.h vr6, vr4, 0x75 + vextrins.h vr7, vr4, 0x76 + vextrins.h vr10, vr4, 0x77 + vdp2.w.h vr11, vr0, vr9 + vdp2.w.h vr12, vr1, vr9 + vdp2.w.h vr13, vr2, vr9 + vdp2.w.h vr14, vr3, vr9 + vhaddw.q.w vr11 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vpackev.w vr0, vr12, vr11 + vpackev.w vr1, vr14, vr13 + vpackev.d vr0, vr1, vr0 + vdp2.w.h vr11, vr5, vr9 + vdp2.w.h vr12, vr6, vr9 + vdp2.w.h vr13, vr7, vr9 + vdp2.w.h vr14, vr10, vr9 + vhaddw.q.w vr11 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vpackev.w vr1, vr12, vr11 + vpackev.w vr2, vr14, vr13 + vpackev.d vr1, vr2, vr1 + vssrarni.hu.w vr1, vr0, 10 + vssrani.bu.h vr1, vr1, 0 + vstelm.w vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + add.d a0, a0, a1 + vbsrl.v vr0, vr5, 2 + vbsrl.v vr1, vr6, 2 + vbsrl.v vr2, vr7, 2 + vbsrl.v vr3, vr10, 2 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv_4w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv_8w: +.l_\lable\()put_hv_16w: +.l_\lable\()put_hv_32w: +.l_\lable\()put_hv_64w: +.l_\lable\()put_hv_128w: + addi.d t0, a2, 0 //src + addi.d t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_hv_8w_loop0: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + FILTER_8TAP_8W vr0 + FILTER_8TAP_8W vr1 + FILTER_8TAP_8W vr2 + FILTER_8TAP_8W vr3 + FILTER_8TAP_8W vr4 + FILTER_8TAP_8W vr5 + FILTER_8TAP_8W vr6 + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ + vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17 +.l_\lable\()put_hv_8w_loop: + vld vr20, a2, 0 + vldx vr21, a2, a3 + add.d a2, a2, t2 + FILTER_8TAP_8W vr20 + FILTER_8TAP_8W vr21 + VEXTRINS_Hx8 vr20 + FILTER_8TAP_8W_CLIP_STORE + VBSRL_Vx8 + VEXTRINS_Hx8 vr21 + FILTER_8TAP_8W_CLIP_STORE + VBSRL_Vx8 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv_8w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_hv_8w_loop0 +.l_\lable\()end_put_8tap: +.endm + +function put_8tap_regular_8bpc_lsx + addi.d sp, sp, -16 + st.d zero, sp, 0 + PUT_8TAP_8BPC_LSX 0 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_regular_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 1 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 1 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_regular_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 2 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 2 + addi.d sp, sp, 16 +endfunc + +function put_8tap_regular_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 4 + st.d t0, sp, 0 + 
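+ // the macro reads this value back from the stack: bits 1:0 select the
+ // horizontal filter family, bits 3:2 the vertical one (4 = regular h, smooth v)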
PUT_8TAP_8BPC_LSX 4 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 5 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 5 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 6 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 6 + addi.d sp, sp, 16 +endfunc + +function put_8tap_regular_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 8 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 8 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 9 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 9 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 10 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 10 + addi.d sp, sp, 16 +endfunc + +const shufb1 +.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8 +endconst + +.macro SHUFB in0, in1, tmp, out + xvbsrl.v \tmp, \in0, 2 + xvpermi.q \tmp, \in0, 0x20 + xvshuf.b \out, \tmp, \tmp, \in1 +.endm + +.macro HADDWDH in0 + xvhaddw.w.h \in0, \in0, \in0 + xvhaddw.d.w \in0, \in0, \in0 +.endm + +.macro HADDWQW in0 + xvhaddw.d.w \in0, \in0, \in0 + xvhaddw.q.d \in0, \in0, \in0 +.endm + +.macro PREP_W16_H in0 + xvbsrl.v xr4, \in0, 4 + xvbsrl.v xr5, \in0, 8 + xvpermi.q xr9, \in0, 0x31 + xvpackev.d xr5, xr9, xr5 + xvbsrl.v xr6, xr5, 4 + SHUFB \in0, xr23, xr9, \in0 + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + xvdp2.h.bu.b xr10, \in0, xr22 + xvdp2.h.bu.b xr11, xr4, xr22 + xvdp2.h.bu.b xr12, xr5, xr22 + xvdp2.h.bu.b xr13, xr6, xr22 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h \in0, xr10, 2 +.endm + +.macro PREP_8TAP_8BPC_LASX lable + li.w t0, 4 + la.local t6, dav1d_mc_subpel_filters + la.local t7, shufb1 + xvld xr23, t7, 0 + slli.d t2, a2, 1 //src_stride*2 + add.d t3, t2, a2 //src_stride*3 + slli.d t4, t2, 1 + + bnez a5, .l_\lable\()h //mx + bnez a6, .l_\lable\()v + + clz.w t1, a3 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()prep_hv0_jtable + alsl.d t1, t1, t5, 1 + ld.h t8, t1, 0 + add.d t5, t5, t8 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()prep_hv0_jtable: + .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable + +.l_\lable\()hv0_4w: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + fldx.s f3, a1, t3 + add.d a1, a1, t4 + xvpackev.w xr0, xr1, xr0 + xvpackev.w xr1, xr3, xr2 + xvpermi.q xr0, xr1, 0x02 + xvsllwil.hu.bu xr0, xr0, 4 + xvst xr0, a0, 0 + addi.d a0, a0, 32 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_4w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_8w: + fld.d f0, a1, 0 + fldx.d f1, a1, a2 + fldx.d f2, a1, t2 + fldx.d f3, a1, t3 + add.d a1, a1, t4 + xvpermi.q xr0, xr1, 0x02 + xvpermi.q xr2, xr3, 0x02 + xvsllwil.hu.bu xr0, xr0, 4 + xvsllwil.hu.bu xr2, xr2, 4 + xvst xr0, a0, 0 + xvst xr2, a0, 32 + addi.d a0, a0, 64 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_8w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_16w: + vld vr0, a1, 0 + vldx vr1, a1, a2 + vldx vr2, a1, t2 + vldx vr3, a1, t3 + add.d a1, a1, t4 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + 
vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + xvslli.h xr0, xr0, 4 + xvslli.h xr1, xr1, 4 + xvslli.h xr2, xr2, 4 + xvslli.h xr3, xr3, 4 + xvst xr0, a0, 0 + xvst xr1, a0, 32 + xvst xr2, a0, 64 + xvst xr3, a0, 96 + addi.d a0, a0, 128 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_16w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_32w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvpermi.d xr4, xr0, 0xD8 + xvpermi.d xr5, xr1, 0xD8 + xvpermi.d xr6, xr2, 0xD8 + xvpermi.d xr7, xr3, 0xD8 + xvpermi.d xr10, xr0, 0x32 + xvpermi.d xr11, xr1, 0x32 + xvpermi.d xr12, xr2, 0x32 + xvpermi.d xr13, xr3, 0x32 + xvsllwil.hu.bu xr0, xr4, 4 + xvsllwil.hu.bu xr1, xr5, 4 + xvsllwil.hu.bu xr2, xr6, 4 + xvsllwil.hu.bu xr3, xr7, 4 + xvsllwil.hu.bu xr4, xr10, 4 + xvsllwil.hu.bu xr5, xr11, 4 + xvsllwil.hu.bu xr6, xr12, 4 + xvsllwil.hu.bu xr7, xr13, 4 + xvst xr0, a0, 0 + xvst xr4, a0, 32 + xvst xr1, a0, 64 + xvst xr5, a0, 96 + xvst xr2, a0, 128 + xvst xr6, a0, 160 + xvst xr3, a0, 192 + xvst xr7, a0, 224 + addi.d a0, a0, 256 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_32w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_64w: +.l_\lable\()hv0_128w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 5 + slli.w t7, t7, 6 + addi.d t8, a0, 0 +.l_\lable\()hv0_32_loop: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvpermi.d xr4, xr0, 0xD8 + xvpermi.d xr5, xr1, 0xD8 + xvpermi.d xr6, xr2, 0xD8 + xvpermi.d xr7, xr3, 0xD8 + xvpermi.d xr10, xr0, 0x32 + xvpermi.d xr11, xr1, 0x32 + xvpermi.d xr12, xr2, 0x32 + xvpermi.d xr13, xr3, 0x32 + xvsllwil.hu.bu xr0, xr4, 4 + xvsllwil.hu.bu xr1, xr5, 4 + xvsllwil.hu.bu xr2, xr6, 4 + xvsllwil.hu.bu xr3, xr7, 4 + xvsllwil.hu.bu xr4, xr10, 4 + xvsllwil.hu.bu xr5, xr11, 4 + xvsllwil.hu.bu xr6, xr12, 4 + xvsllwil.hu.bu xr7, xr13, 4 + xvst xr0, a0, 0 + xvst xr4, a0, 32 + add.d t1, a0, t7 + xvst xr1, t1, 0 + xvst xr5, t1, 32 + add.d t1, t1, t7 + xvst xr2, t1, 0 + xvst xr6, t1, 32 + add.d t1, t1, t7 + xvst xr3, t1, 0 + xvst xr7, t1, 32 + add.d a0, t1, t7 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_32_loop + addi.d a1, t0, 32 + addi.d t0, t0, 32 + addi.d a0, t8, 64 + addi.d t8, t8, 64 + addi.d a4, t5, 0 + addi.d a3, a3, -32 + bnez a3, .l_\lable\()hv0_32_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()h: + bnez a6, .l_\lable\()hv //if(fh) && if (fv) + + andi t1, a7, 3 + blt t0, a3, .l_\lable\()h_idx_fh + andi t1, a7, 1 + addi.w t1, t1, 3 +.l_\lable\()h_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a5, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + xvldrepl.d xr22, t1, 0 + + addi.d a1, a1, -3 + clz.w t1, a3 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()prep_h_jtable + alsl.d t1, t1, t5, 1 + ld.h t8, t1, 0 + add.d t5, t5, t8 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()prep_h_jtable: + .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable + +.l_\lable\()h_4w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr12, xr1, xr22 + xvdp2.h.bu.b xr14, xr2, xr22 + 
xvdp2.h.bu.b xr16, xr3, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr14 //h2 + HADDWDH xr16 //h3 + + xvpickev.w xr10, xr12, xr10 + xvpickev.w xr14, xr16, xr14 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr14, xr14, 0xd8 + xvpickev.h xr10, xr14, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvst xr10, a0, 0 + addi.d a0, a0, 32 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()h_4w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_8w: + xvld xr0, a1, 0 + xvldx xr2, a1, a2 + xvldx xr4, a1, t2 + xvldx xr6, a1, t3 + add.d a1, a1, t4 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr10 + SHUFB xr1, xr23, xr9, xr11 + SHUFB xr2, xr23, xr9, xr12 + SHUFB xr3, xr23, xr9, xr13 + SHUFB xr4, xr23, xr9, xr14 + SHUFB xr5, xr23, xr9, xr15 + SHUFB xr6, xr23, xr9, xr16 + SHUFB xr7, xr23, xr9, xr17 + + xvdp2.h.bu.b xr0, xr10, xr22 + xvdp2.h.bu.b xr1, xr11, xr22 + xvdp2.h.bu.b xr2, xr12, xr22 + xvdp2.h.bu.b xr3, xr13, xr22 + xvdp2.h.bu.b xr4, xr14, xr22 + xvdp2.h.bu.b xr5, xr15, xr22 + xvdp2.h.bu.b xr6, xr16, xr22 + xvdp2.h.bu.b xr7, xr17, xr22 + + HADDWDH xr0 + HADDWDH xr1 + HADDWDH xr2 + HADDWDH xr3 + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + + xvpickev.w xr0, xr1, xr0 + xvpickev.w xr2, xr3, xr2 + xvpermi.d xr0, xr0, 0xd8 + xvpermi.d xr2, xr2, 0xd8 + xvpickev.h xr0, xr2, xr0 + xvpermi.d xr0, xr0, 0xd8 + xvsrari.h xr0, xr0, 2 + + xvpickev.w xr4, xr5, xr4 + xvpickev.w xr6, xr7, xr6 + xvpermi.d xr4, xr4, 0xd8 + xvpermi.d xr6, xr6, 0xd8 + xvpickev.h xr4, xr6, xr4 + xvpermi.d xr4, xr4, 0xd8 + xvsrari.h xr4, xr4, 2 + + xvst xr0, a0, 0 + xvst xr4, a0, 32 + addi.d a0, a0, 64 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()h_8w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_16w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + PREP_W16_H xr0 + PREP_W16_H xr1 + PREP_W16_H xr2 + PREP_W16_H xr3 + + xvst xr0, a0, 0 + xvst xr1, a0, 32 + xvst xr2, a0, 64 + xvst xr3, a0, 96 + + addi.d a0, a0, 128 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()h_16w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_32w: +.l_\lable\()h_64w: +.l_\lable\()h_128w: + addi.d t0, a1, 0 //src + addi.d t5, a4, 0 //h + srli.w t7, a3, 4 //w + slli.w t7, t7, 5 //store offset + addi.d t8, a0, 0 //dst +.l_\lable\()h_16_loop: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + PREP_W16_H xr0 + PREP_W16_H xr1 + PREP_W16_H xr2 + PREP_W16_H xr3 + + xvst xr0, a0, 0 + xvstx xr1, a0, t7 + slli.w t1, t7, 1 + xvstx xr2, a0, t1 + add.w t1, t1, t7 + xvstx xr3, a0, t1 + slli.w t1, t7, 2 + add.d a0, a0, t1 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()h_16_loop + + addi.d a1, t0, 16 + addi.d t0, t0, 16 + addi.d a0, t8, 32 + addi.d t8, t8, 32 + addi.d a4, t5, 0 + addi.d a3, a3, -16 + bnez a3, .l_\lable\()h_16_loop + b .l_\lable\()end_pre_8tap +.l_\lable\()hv: + andi t1, a7, 3 + blt t0, a3, .l_\lable\()hv_idx_fh + andi t1, a7, 1 + addi.w t1, t1, 3 +.l_\lable\()hv_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a5, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + xvldrepl.d xr22, t1, 0 + srli.w a7, a7, 2 + blt t0, a4, .l_\lable\()hv_idx_fv + andi a7, a7, 1 + addi.w a7, a7, 3 +.l_\lable\()hv_idx_fv: + addi.w t5, zero, 120 + mul.w a7, a7, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w a7, a7, t5 + add.d a7, t6, a7 //fv's offset + xvldrepl.d xr8, a7, 0 + xvsllwil.h.b xr8, xr8, 0 + + sub.d a1, a1, t3 + 
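+ // 3 rows were stepped back above; now step back 3 columns as well so the
+ // 8-tap windows cover taps -3..+4 around each output sample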
addi.d a1, a1, -3 + beq a3, t0, .l_\lable\()hv_4w + b .l_\lable\()hv_8w +.l_\lable\()hv_4w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvld xr4, a1, 0 + xvldx xr5, a1, a2 + xvldx xr6, a1, t2 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr11, xr1, xr22 + xvdp2.h.bu.b xr12, xr2, xr22 + xvdp2.h.bu.b xr13, xr3, xr22 + + xvdp2.h.bu.b xr14, xr4, xr22 + xvdp2.h.bu.b xr15, xr5, xr22 + xvdp2.h.bu.b xr16, xr6, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr11 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr12 //h2 + HADDWDH xr13 //h3 + + xvpackev.w xr10, xr11, xr10 + xvpackev.w xr12, xr13, xr12 + xvpackev.d xr11, xr12, xr10 + xvpackod.d xr10, xr12, xr10 + xvpickev.h xr11, xr10, xr11 + xvsrari.h xr11, xr11, 2 + + HADDWDH xr14 //h4 + HADDWDH xr15 //h5 + HADDWDH xr16 //h6 + + xvpackev.w xr14, xr15, xr14 + xvpackev.w xr16, xr17, xr16 + xvpackev.d xr17, xr16, xr14 + xvpackod.d xr14, xr16, xr14 + xvpickev.h xr13, xr14, xr17 + xvsrari.h xr13, xr13, 2 + + xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 * + xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 * +.l_\lable\()hv_w4_loop: + xvldx xr0, a1, t3 + add.d a1, a1, t4 + xvld xr1, a1, 0 + xvldx xr2, a1, a2 + xvldx xr3, a1, t2 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr12, xr1, xr22 + xvdp2.h.bu.b xr14, xr2, xr22 + xvdp2.h.bu.b xr16, xr3, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr14 //h2 + HADDWDH xr16 //h3 + + xvpackev.w xr10, xr12, xr10 + xvpackev.w xr14, xr16, xr14 + xvpackev.d xr12, xr14, xr10 + xvpackod.d xr10, xr14, xr10 + xvpickev.h xr12, xr10, xr12 + xvsrari.h xr12, xr12, 2 + + xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2) + xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3) + + xvdp2.w.h xr0, xr18, xr8 + xvdp2.w.h xr2, xr19, xr8 + HADDWQW xr0 + HADDWQW xr2 + xvpackev.w xr0, xr2, xr0 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x71 + xvextrins.h xr19, xr12, 0x75 + xvdp2.w.h xr2, xr18, xr8 + xvdp2.w.h xr4, xr19, xr8 + HADDWQW xr2 + HADDWQW xr4 + xvpackev.w xr2, xr4, xr2 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x72 + xvextrins.h xr19, xr12, 0x76 + xvdp2.w.h xr4, xr18, xr8 + xvdp2.w.h xr9, xr19, xr8 + HADDWQW xr4 + HADDWQW xr9 + xvpackev.w xr4, xr9, xr4 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x73 + xvextrins.h xr19, xr12, 0x77 + xvdp2.w.h xr9, xr18, xr8 + xvdp2.w.h xr11, xr19, xr8 + HADDWQW xr9 + HADDWQW xr11 + xvpackev.w xr9, xr11, xr9 + + xvpackev.d xr0, xr2, xr0 + xvpackev.d xr4, xr9, xr4 + xvsrari.w xr0, xr0, 6 + xvsrari.w xr4, xr4, 6 + xvpermi.d xr0, xr0, 0xd8 + xvpermi.d xr4, xr4, 0xd8 + xvpickev.h xr0, xr4, xr0 + xvpermi.d xr0, xr0, 0xd8 + xvst xr0, a0, 0 + addi.d a0, a0, 32 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv_w4_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()hv_8w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 3 + slli.w t7, t7, 4 // store offset + addi.d t8, a0, 0 +.l_\lable\()hv_8w_loop0: + xvld xr0, a1, 0 + xvldx xr2, a1, a2 + 
xvldx xr4, a1, t2 + xvldx xr6, a1, t3 + + add.d a1, a1, t4 + xvld xr10, a1, 0 + xvldx xr11, a1, a2 + xvldx xr12, a1, t2 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr13 + SHUFB xr1, xr23, xr9, xr14 + SHUFB xr2, xr23, xr9, xr15 + SHUFB xr3, xr23, xr9, xr16 + SHUFB xr4, xr23, xr9, xr17 + SHUFB xr5, xr23, xr9, xr18 + SHUFB xr6, xr23, xr9, xr19 + SHUFB xr7, xr23, xr9, xr20 + + xvdp2.h.bu.b xr0, xr13, xr22 + xvdp2.h.bu.b xr1, xr14, xr22 + xvdp2.h.bu.b xr2, xr15, xr22 + xvdp2.h.bu.b xr3, xr16, xr22 + xvdp2.h.bu.b xr4, xr17, xr22 + xvdp2.h.bu.b xr5, xr18, xr22 + xvdp2.h.bu.b xr6, xr19, xr22 + xvdp2.h.bu.b xr7, xr20, xr22 + + HADDWDH xr0 + HADDWDH xr1 + HADDWDH xr2 + HADDWDH xr3 + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + + xvpackev.w xr0, xr2, xr0 + xvpackev.w xr2, xr6, xr4 + xvpackev.d xr16, xr2, xr0 + xvpackod.d xr0, xr2, xr0 + xvpickev.h xr0, xr0, xr16 + xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 + + xvpackev.w xr1, xr3, xr1 + xvpackev.w xr3, xr7, xr5 + xvpackev.d xr16, xr3, xr1 + xvpackod.d xr1, xr3, xr1 + xvpickev.h xr1, xr1, xr16 + xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 + + xvbsrl.v xr13, xr10, 4 + xvbsrl.v xr14, xr11, 4 + xvbsrl.v xr15, xr12, 4 + + SHUFB xr10, xr23, xr9, xr10 + SHUFB xr13, xr23, xr9, xr13 + SHUFB xr11, xr23, xr9, xr11 + SHUFB xr14, xr23, xr9, xr14 + SHUFB xr12, xr23, xr9, xr12 + SHUFB xr15, xr23, xr9, xr15 + + xvdp2.h.bu.b xr4, xr10, xr22 + xvdp2.h.bu.b xr5, xr13, xr22 + xvdp2.h.bu.b xr6, xr11, xr22 + xvdp2.h.bu.b xr7, xr14, xr22 + xvdp2.h.bu.b xr9, xr12, xr22 + xvdp2.h.bu.b xr10, xr15, xr22 + + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + HADDWDH xr9 + HADDWDH xr10 + + xvpackev.w xr4, xr6, xr4 + xvpackev.w xr9, xr12, xr9 + xvpackev.d xr16, xr9, xr4 + xvpackod.d xr11, xr9, xr4 + xvpickev.h xr2, xr11, xr16 + xvsrari.h xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 * + + xvpackev.w xr5, xr7, xr5 + xvpackev.w xr10, xr12, xr10 + xvpackev.d xr16, xr10, xr5 + xvpackod.d xr11, xr10, xr5 + xvpickev.h xr3, xr11, xr16 + xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 56 * + + xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 * + xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 * + xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 * + xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 * + +.l_\lable\()hv_8w_loop: + xvldx xr0, a1, t3 + add.d a1, a1, t4 + xvld xr2, a1, 0 + xvldx xr4, a1, a2 + xvldx xr6, a1, t2 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + SHUFB xr7, xr23, xr9, xr7 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr11, xr1, xr22 + xvdp2.h.bu.b xr12, xr2, xr22 + xvdp2.h.bu.b xr13, xr3, xr22 + xvdp2.h.bu.b xr14, xr4, xr22 + xvdp2.h.bu.b xr15, xr5, xr22 + xvdp2.h.bu.b xr16, xr6, xr22 + xvdp2.h.bu.b xr17, xr7, xr22 + + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + HADDWDH xr14 + HADDWDH xr15 + HADDWDH xr16 + HADDWDH xr17 + + xvpackev.w xr0, xr12, xr10 + xvpackev.w xr2, xr16, xr14 + xvpackev.d xr9, xr2, xr0 + xvpackod.d xr0, xr2, xr0 + xvpickev.h xr0, xr0, xr9 + xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83 + + xvpackev.w xr1, xr13, 
xr11 + xvpackev.w xr3, xr17, xr15 + xvpackev.d xr9, xr3, xr1 + xvpackod.d xr1, xr3, xr1 + xvpickev.h xr1, xr1, xr9 + xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87 + + xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58) + xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59) + xvextrins.h xr20, xr1, 0x70 + xvextrins.h xr21, xr1, 0x74 + + //h - 1 + xvdp2.w.h xr10, xr18, xr8 + xvdp2.w.h xr11, xr19, xr8 + xvdp2.w.h xr12, xr20, xr8 + xvdp2.w.h xr13, xr21, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * * + xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * * + xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7 + //h - 2 + xvbsrl.v xr4, xr18, 2 + xvbsrl.v xr5, xr19, 2 + xvbsrl.v xr6, xr20, 2 + xvbsrl.v xr7, xr21, 2 + xvextrins.h xr4, xr0, 0x71 + xvextrins.h xr5, xr0, 0x75 + xvextrins.h xr6, xr1, 0x71 + xvextrins.h xr7, xr1, 0x75 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr14, xr11, xr10 + xvpackev.w xr15, xr13, xr12 + xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15 + //h - 3 + xvbsrl.v xr4, xr4, 2 + xvbsrl.v xr5, xr5, 2 + xvbsrl.v xr6, xr6, 2 + xvbsrl.v xr7, xr7, 2 + xvextrins.h xr4, xr0, 0x72 + xvextrins.h xr5, xr0, 0x76 + xvextrins.h xr6, xr1, 0x72 + xvextrins.h xr7, xr1, 0x76 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr15, xr11, xr10 + xvpackev.w xr16, xr13, xr12 + xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23 + //h - 4 + xvbsrl.v xr4, xr4, 2 + xvbsrl.v xr5, xr5, 2 + xvbsrl.v xr6, xr6, 2 + xvbsrl.v xr7, xr7, 2 + xvextrins.h xr4, xr0, 0x73 + xvextrins.h xr5, xr0, 0x77 + xvextrins.h xr6, xr1, 0x73 + xvextrins.h xr7, xr1, 0x77 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr16, xr11, xr10 + xvpackev.w xr17, xr13, xr12 + xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31 + + xvsrari.w xr2, xr2, 6 + xvsrari.w xr14, xr14, 6 + xvsrari.w xr15, xr15, 6 + xvsrari.w xr16, xr16, 6 + + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr14, xr14, 0xd8 + xvpermi.d xr15, xr15, 0xd8 + xvpermi.d xr16, xr16, 0xd8 + xvpickev.h xr2, xr14, xr2 + xvpickev.h xr3, xr16, xr15 + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr3, xr3, 0xd8 + + xvpermi.q xr10, xr2, 0x31 + xvpermi.q xr11, xr3, 0x31 + + vst vr2, a0, 0 + vstx vr10, a0, t7 //32 + slli.w t1, t7, 1 //64 + vstx vr3, a0, t1 + add.w t1, t1, t7 //96 + vstx vr11, a0, t1 + slli.w t1, t7, 2 //128 + add.d a0, a0, t1 + + xvbsrl.v xr18, xr4, 2 + xvbsrl.v xr19, xr5, 2 + xvbsrl.v xr20, xr6, 2 + xvbsrl.v xr21, xr7, 2 + + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv_8w_loop + + addi.d a1, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 16 + addi.d t8, t8, 16 + addi.d a4, t5, 0 + addi.d a3, a3, -8 + bnez a3, .l_\lable\()hv_8w_loop0 + b .l_\lable\()end_pre_8tap +.l_\lable\()v: + + srli.w a7, a7, 2 + blt t0, a4, .l_\lable\()v_idx_fv + andi a7, a7, 1 + addi.w a7, a7, 3 +.l_\lable\()v_idx_fv: + addi.w t5, zero, 120 + mul.w a7, a7, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w a7, a7, t5 + add.d a7, t6, a7 //fv's offset + xvldrepl.d xr8, a7, 0 + + sub.d a1, a1, t3 + beq a3, t0, .l_\lable\()v_4w + blt t0, a3, .l_\lable\()v_8w 
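+
+ // vertical-only prep: seed a 7-row sliding window per column group, then
+ // shift in 4 fresh rows each pass and run the 8-tap filter down the columns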
+.l_\lable\()v_4w: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + add.d a1, a1, t3 + fld.s f3, a1, 0 + fldx.s f4, a1, a2 + fldx.s f5, a1, t2 + fldx.s f6, a1, t3 + + xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 + xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 + xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 + xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 + xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 + xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + xvilvl.w xr2, xr1, xr0 + xvilvh.w xr0, xr1, xr0 + xvpermi.q xr0, xr2, 0x20 + +.l_\lable\()v_4w_loop: + add.d a1, a1, t4 + fld.s f7, a1, 0 //h0 + fldx.s f10, a1, a2 //h1 + fldx.s f11, a1, t2 //h2 + fldx.s f12, a1, t3 //h3 + + xvbsrl.v xr9, xr7, 2 + xvpermi.q xr9, xr7, 0x20 + xvextrins.b xr0, xr9, 0x70 + xvextrins.b xr0, xr9, 0xf1 + + xvbsrl.v xr1, xr0, 1 + xvbsrl.v xr7, xr10, 2 + xvpermi.q xr7, xr10, 0x20 + xvextrins.b xr1, xr7, 0x70 + xvextrins.b xr1, xr7, 0xf1 + + xvbsrl.v xr2, xr1, 1 + xvbsrl.v xr7, xr11, 2 + xvpermi.q xr7, xr11, 0x20 + xvextrins.b xr2, xr7, 0x70 + xvextrins.b xr2, xr7, 0xf1 + + xvbsrl.v xr3, xr2, 1 + xvbsrl.v xr7, xr12, 2 + xvpermi.q xr7, xr12, 0x20 + xvextrins.b xr3, xr7, 0x70 + xvextrins.b xr3, xr7, 0xf1 + xvbsrl.v xr4, xr3, 1 + + xvdp2.h.bu.b xr10, xr0, xr8 + xvdp2.h.bu.b xr11, xr1, xr8 + xvdp2.h.bu.b xr12, xr2, xr8 + xvdp2.h.bu.b xr13, xr3, xr8 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvaddi.bu xr0, xr4, 0 + + xvst xr10, a0, 0 + addi.d a0, a0, 32 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()v_4w_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()v_8w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 2 + slli.w t7, t7, 3 + addi.d t8, a0, 0 +.l_\lable\()v_8w_loop0: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + add.d a1, a1, t3 + fld.s f3, a1, 0 + fldx.s f4, a1, a2 + fldx.s f5, a1, t2 + fldx.s f6, a1, t3 + + xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 + xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 + xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 + xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 + xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 + xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + xvilvl.w xr2, xr1, xr0 + xvilvh.w xr0, xr1, xr0 + xvpermi.q xr0, xr2, 0x20 + +.l_\lable\()v_8w_loop: + add.d a1, a1, t4 + fld.s f7, a1, 0 //h0 + fldx.s f10, a1, a2 //h1 + fldx.s f11, a1, t2 //h2 + fldx.s f12, a1, t3 //h3 + + xvbsrl.v xr9, xr7, 2 + xvpermi.q xr9, xr7, 0x20 + xvextrins.b xr0, xr9, 0x70 + xvextrins.b xr0, xr9, 0xf1 + + xvbsrl.v xr1, xr0, 1 + xvbsrl.v xr7, xr10, 2 + xvpermi.q xr7, xr10, 0x20 + xvextrins.b xr1, xr7, 0x70 + xvextrins.b xr1, xr7, 0xf1 + + xvbsrl.v xr2, xr1, 1 + xvbsrl.v xr7, xr11, 2 + xvpermi.q xr7, xr11, 0x20 + xvextrins.b xr2, xr7, 0x70 + xvextrins.b xr2, xr7, 0xf1 + + xvbsrl.v xr3, xr2, 1 + xvbsrl.v xr7, xr12, 2 + xvpermi.q xr7, xr12, 0x20 + xvextrins.b xr3, xr7, 0x70 + xvextrins.b xr3, xr7, 0xf1 + xvbsrl.v xr4, xr3, 1 + + xvdp2.h.bu.b xr10, xr0, xr8 + xvdp2.h.bu.b xr11, xr1, xr8 + xvdp2.h.bu.b xr12, xr2, xr8 + xvdp2.h.bu.b xr13, xr3, xr8 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d 
xr10, xr10, 0xd8
+ xvsrari.h xr10, xr10, 2
+
+ xvaddi.bu xr0, xr4, 0
+
+ xvstelm.d xr10, a0, 0, 0
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 1
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 2
+ add.d a0, a0, t7
+ xvstelm.d xr10, a0, 0, 3
+ add.d a0, a0, t7
+ addi.w a4, a4, -4
+ bnez a4, .l_\lable\()v_8w_loop
+
+ addi.d a1, t0, 4
+ addi.d t0, t0, 4
+ addi.d a0, t8, 8
+ addi.d t8, t8, 8
+ addi.d a4, t5, 0
+ addi.d a3, a3, -4
+ bnez a3, .l_\lable\()v_8w_loop0
+
+.l_\lable\()end_pre_8tap:
+.endm
+
+function prep_8tap_regular_8bpc_lasx
+ addi.w a7, zero, 0
+ PREP_8TAP_8BPC_LASX 0
+endfunc
+
+function prep_8tap_smooth_regular_8bpc_lasx
+ addi.w a7, zero, 1
+ PREP_8TAP_8BPC_LASX 1
+endfunc
+
+function prep_8tap_sharp_regular_8bpc_lasx
+ addi.w a7, zero, 2
+ PREP_8TAP_8BPC_LASX 2
+endfunc
+
+function prep_8tap_regular_smooth_8bpc_lasx
+ addi.w a7, zero, 4
+ PREP_8TAP_8BPC_LASX 4
+endfunc
+
+function prep_8tap_smooth_8bpc_lasx
+ addi.w a7, zero, 5
+ PREP_8TAP_8BPC_LASX 5
+endfunc
+
+function prep_8tap_sharp_smooth_8bpc_lasx
+ addi.w a7, zero, 6
+ PREP_8TAP_8BPC_LASX 6
+endfunc
+
+function prep_8tap_regular_sharp_8bpc_lasx
+ addi.w a7, zero, 8
+ PREP_8TAP_8BPC_LASX 8
+endfunc
+
+function prep_8tap_smooth_sharp_8bpc_lasx
+ addi.w a7, zero, 9
+ PREP_8TAP_8BPC_LASX 9
+endfunc
+
+function prep_8tap_sharp_8bpc_lasx
+ addi.w a7, zero, 10
+ PREP_8TAP_8BPC_LASX 10
+endfunc