/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/* static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
 *                               const pixel *src, const ptrdiff_t src_stride,
 *                               const int16_t *const abcd, int mx, int my
 *                               HIGHBD_DECL_SUFFIX)
 */

.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3
    vbsrl.v        vr2,  \in0, \in1
    vbsrl.v        vr20, \in0, \in2
    addi.w         t4,   \in3, 512
    srai.w         t4,   t4,   10
    addi.w         t4,   t4,   64
    slli.w         t4,   t4,   3
    vldx           vr1,  t5,   t4
    add.w          t3,   t3,   t0  // tmx += abcd[0]
    addi.w         t4,   t3,   512
    srai.w         t4,   t4,   10
    addi.w         t4,   t4,   64
    slli.w         t4,   t4,   3
    vldx           vr29, t5,   t4
    add.w          t3,   t3,   t0  // tmx += abcd[0]
    vilvl.d        vr2,  vr20, vr2
    vilvl.d        vr1,  vr29, vr1
    vmulwev.h.bu.b vr3,  vr2,  vr1
    vmulwod.h.bu.b vr20, vr2,  vr1
    vilvl.d        vr2,  vr20, vr3
    vhaddw.w.h     vr2,  vr2,  vr2
    vhaddw.d.w     vr2,  vr2,  vr2
    vhaddw.q.d     vr2,  vr2,  vr2
    vilvh.d        vr3,  vr20, vr3
    vhaddw.w.h     vr3,  vr3,  vr3
    vhaddw.d.w     vr3,  vr3,  vr3
    vhaddw.q.d     vr3,  vr3,  vr3
    vextrins.w     \out0, vr2, \out1
    vextrins.w     \out2, vr3, \out3
.endm

.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1
    add.w          \in0, \in0, \in1
    addi.w         t6,   \in0, 512
    srai.w         t6,   t6,   10
    addi.w         t6,   t6,   64
    slli.w         t6,   t6,   3
    fldx.d         f1,   t5,   t6
    vsllwil.h.b    vr1,  vr1,  0
    vmulwev.w.h    vr3,  \in2, vr1
    vmaddwod.w.h   vr3,  \in2, vr1
    vhaddw.d.w     vr3,  vr3,  vr3
    vhaddw.q.d     vr3,  vr3,  vr3
    vextrins.w     \out0, vr3, \out1
.endm

const warp_sh
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
.rept 2
.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.endr
endconst

.macro warp_lsx t, shift
function warp_affine_8x8\t\()_8bpc_lsx
    addi.d         sp,   sp,   -64
    fst.d          f24,  sp,   0
    fst.d          f25,  sp,   8
    fst.d          f26,  sp,   16
    fst.d          f27,  sp,   24
    fst.d          f28,  sp,   32
    fst.d          f29,  sp,   40
    fst.d          f30,  sp,   48
    fst.d          f31,  sp,   56
    la.local       t4,   warp_sh
    ld.h           t0,   a4,   0   // abcd[0]
    ld.h           t1,   a4,   2   // abcd[1]
    alsl.w         t2,   a3,   a3,  1
    addi.w         t3,   a5,   0
    la.local       t5,   dav1d_mc_warp_filter
    sub.d          a2,   a2,   t2
    addi.d         a2,   a2,   -3
    vld            vr0,  a2,   0
    vld            vr30, t4,   0
    vld            vr31, t4,   32
    FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
    FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
    FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
    FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11,
0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 add.w a5, t1, a5 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30 vsrarni.h.w vr12, vr4, 3 vsrarni.h.w vr13, vr5, 3 vsrarni.h.w vr14, vr6, 3 vsrarni.h.w vr15, vr7, 3 vsrarni.h.w vr16, vr8, 3 vsrarni.h.w vr17, vr9, 3 vsrarni.h.w vr18, vr10, 3 vsrarni.h.w vr19, vr11, 3 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, 
a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20 FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20 FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20 FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20 vsrarni.h.w vr21, vr4, 3 vsrarni.h.w vr22, vr5, 3 vsrarni.h.w vr23, vr6, 3 vsrarni.h.w vr24, vr7, 3 vsrarni.h.w vr25, vr8, 3 vsrarni.h.w vr26, vr9, 3 vsrarni.h.w vr27, vr10, 3 vsrarni.h.w vr28, vr11, 3 addi.w t2, a6, 0 // my ld.h t7, a4, 4 // abcd[2] ld.h t8, a4, 6 // abcd[3] .ifnb \t slli.d a1, a1, 1 .endif FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 .ifnb \t vssrarni.h.w vr5, vr4, \shift vst vr5, a0, 0 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fst.d f5, a0, 0 .endif vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 vextrins.h vr30, vr31, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 .ifnb \t vssrarni.h.w vr5, vr4, \shift vstx vr5, a0, a1 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fstx.d f5, a0, a1 .endif vaddi.bu vr31, vr31, 2 vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 vextrins.h vr30, vr31, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 alsl.d a0, a1, a0, 1 .ifnb \t vssrarni.h.w vr5, vr4, \shift vst vr5, a0, 0 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fst.d f5, a0, 0 .endif vaddi.bu vr31, vr31, 2 vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b 
vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 vextrins.h vr30, vr31, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 .ifnb \t vssrarni.h.w vr5, vr4, \shift vstx vr5, a0, a1 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fstx.d f5, a0, a1 .endif vaddi.bu vr31, vr31, 2 vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 vextrins.h vr30, vr31, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 alsl.d a0, a1, a0, 1 .ifnb \t vssrarni.h.w vr5, vr4, \shift vst vr5, a0, 0 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fst.d f5, a0, 0 .endif vaddi.bu vr31, vr31, 2 vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 vextrins.h vr30, vr31, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 .ifnb \t vssrarni.h.w vr5, vr4, \shift vstx vr5, a0, a1 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fstx.d f5, a0, a1 .endif vaddi.bu vr31, vr31, 2 vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 vextrins.h vr30, vr31, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 alsl.d a0, a1, a0, 1 .ifnb \t vssrarni.h.w vr5, vr4, \shift vst vr5, a0, 0 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fst.d f5, a0, 0 .endif vshuf.b vr12, vr21, vr12, vr30 vshuf.b vr13, vr22, vr13, vr30 vshuf.b vr14, vr23, vr14, vr30 vshuf.b vr15, vr24, vr15, vr30 vshuf.b vr16, vr25, vr16, vr30 vshuf.b vr17, vr26, vr17, vr30 vshuf.b vr18, vr27, vr18, vr30 vshuf.b vr19, vr28, vr19, vr30 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 
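/* Reference sketch (hedged: this mirrors the scalar algorithm the SIMD code
 * above implements, not the dav1d C source verbatim). warp_affine_8x8 first
 * builds 15 rows of horizontally filtered 16-bit intermediates, then runs a
 * vertical 8-tap pass over them; both passes fetch their coefficients from
 * dav1d_mc_warp_filter with the ((t + 512) >> 10) + 64 index computed by the
 * addi.w/srai.w/addi.w/slli.w sequences in the macros, stepping the fractional
 * position by abcd[0..3]:
 *
 *     // horizontal pass, matches FILTER_WARP_RND_P_LSX + vsrarni.h.w ..., 3
 *     for (int y = 0; y < 15; y++, mx += abcd[1])
 *         for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
 *             const int8_t *f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
 *             int sum = 0;
 *             for (int k = 0; k < 8; k++)
 *                 sum += f[k] * src[y * src_stride + x + k];
 *             mid[y][x] = (sum + 4) >> 3;
 *         }
 *     // vertical pass, matches FILTER_WARP_CLIP_LSX + the final \shift
 *     for (int y = 0; y < 8; y++, my += abcd[3])
 *         for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
 *             const int8_t *f = dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
 *             int sum = 0;
 *             for (int k = 0; k < 8; k++)
 *                 sum += f[k] * mid[y + k][x];
 *             dst[y * dst_stride + x] = clip_to_pixel((sum + (1 << (sh - 1))) >> sh);
 *         }
 *
 * mid[][] and clip_to_pixel() are illustrative names; src here is the already
 * rewound pointer (a2 after the -3*stride/-3 adjustment). sh is the \shift
 * macro argument: 11 for the pixel-output variant, 7 for the 16-bit "t"
 * (prep) variant, which stores the intermediate without clamping to pixel range.
 */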
FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 .ifnb \t vssrarni.h.w vr5, vr4, \shift vstx vr5, a0, a1 .else vssrarni.hu.w vr5, vr4, \shift vssrlni.bu.h vr5, vr5, 0 fstx.d f5, a0, a1 .endif fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc .endm warp_lsx , 11 warp_lsx t, 7 .macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3 xvshuf.b xr2, \in0, \in0, \in2 addi.w t4, \in1, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr3, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] addi.w t4, t3, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr4, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] addi.w t4, t3, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr5, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] addi.w t4, t3, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr6, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] xvinsve0.d xr3, xr5, 1 xvinsve0.d xr3, xr4, 2 xvinsve0.d xr3, xr6, 3 xvmulwev.h.bu.b xr4, xr2, xr3 xvmulwod.h.bu.b xr5, xr2, xr3 xvilvl.d xr2, xr5, xr4 xvilvh.d xr3, xr5, xr4 xvhaddw.w.h xr2, xr2, xr2 xvhaddw.w.h xr3, xr3, xr3 xvhaddw.d.w xr2, xr2, xr2 xvhaddw.d.w xr3, xr3, xr3 xvhaddw.q.d xr2, xr2, xr2 xvhaddw.q.d xr3, xr3, xr3 xvextrins.w \out0, xr2, \out1 xvextrins.w \out2, xr3, \out3 .endm .macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1 add.w \in0, \in0, \in1 addi.w t6, \in0, 512 srai.w t6, t6, 10 addi.w t6, t6, 64 slli.w t6, t6, 3 fldx.d f1, t5, t6 add.w t2, t2, t7 addi.w t6, t2, 512 srai.w t6, t6, 10 addi.w t6, t6, 64 slli.w t6, t6, 3 fldx.d f2, t5, t6 vilvl.d vr0, vr2, vr1 vext2xv.h.b xr0, xr0 xvmulwev.w.h xr3, \in2, xr0 xvmaddwod.w.h xr3, \in2, xr0 xvhaddw.d.w xr3, xr3, xr3 xvhaddw.q.d xr3, xr3, xr3 xvextrins.w \out0, xr3, \out1 .endm const shuf0 .byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 .byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 endconst .macro warp_lasx t, shift function warp_affine_8x8\t\()_8bpc_lasx addi.d sp, sp, -16 ld.h t0, a4, 0 // abcd[0] ld.h t1, a4, 2 // abcd[1] fst.d f24, sp, 0 fst.d f25, sp, 8 alsl.w t2, a3, a3, 1 addi.w t3, a5, 0 la.local t4, warp_sh la.local t5, dav1d_mc_warp_filter sub.d a2, a2, t2 addi.d a2, a2, -3 vld vr0, a2, 0 xvld xr24, t4, 0 xvld xr25, t4, 32 la.local t2, shuf0 xvld xr1, t2, 0 xvpermi.q xr0, xr0, 0x00 xvaddi.bu xr9, xr1, 4 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, 
xr14, 0x00, xr15, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30 xvsrarni.h.w xr12, xr7, 3 xvsrarni.h.w xr13, xr8, 3 xvsrarni.h.w xr14, xr10, 3 xvsrarni.h.w xr15, xr11, 3 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20 xvsrarni.h.w xr16, xr7, 3 xvsrarni.h.w xr17, xr8, 3 xvsrarni.h.w xr18, xr10, 3 xvsrarni.h.w xr19, xr11, 3 addi.w t2, a6, 0 // my ld.h t7, a4, 4 // abcd[2] ld.h t8, a4, 6 // abcd[3] .ifnb \t slli.d a1, a1, 1 .endif // y = 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, \shift xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 
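/* Each group of four FILTER_WARP_CLIP_LASX invocations below produces one
 * 8-pixel output row of the vertical pass: the 32-bit sums are narrowed with a
 * rounding, saturating shift (\shift = 11, i.e. 7 + intermediate_bits, for the
 * pixel-output variant; 7 for the 16-bit "t"/prep variant) and stored. The
 * xvshuf.b ... xr24 / xvextrins.h xr24, xr25 sequences then, in effect, slide
 * the window of eight 16-bit intermediate rows down by one, pulling the next
 * row in from the spare xr16-xr19 registers, before my is advanced by abcd[3]
 * for the next output row.
 */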
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift alsl.d a0, a1, a0, 1 xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, 11 xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 add.d a0, a0, a1 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift alsl.d a0, a1, a0, 1 xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, 11 xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 add.d a0, a0, a1 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift alsl.d a0, a1, a0, 1 xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, 11 xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 add.d a0, a0, a1 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif fld.d f24, sp, 0 fld.d f25, sp, 8 addi.d sp, sp, 16 endfunc .endm warp_lasx , 11 warp_lasx t, 7 /* static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, const int weight HIGHBD_DECL_SUFFIX) */ #define bpc8_sh 
5 // sh = intermediate_bits + 1 #define bpcw8_sh 8 // sh = intermediate_bits + 4 #define bpc_sh bpc8_sh #define bpcw_sh bpcw8_sh function avg_8bpc_lsx addi.d t8, a0, 0 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .AVG_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE add.d t1, t1, t2 // Get absolute address jirl $r0, t1, 0 .align 3 .AVG_LSX_JRTABLE: .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE .AVG_W4_LSX: vld vr0, a2, 0 vld vr1, a3, 0 vadd.h vr2, vr0, vr1 vssrarni.bu.h vr3, vr2, bpc_sh vstelm.w vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 1 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a0, a1 blt zero, a5, .AVG_W4_LSX b .AVG_END_LSX .AVG_W8_LSX: vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr4, vr0, vr1 vadd.h vr5, vr2, vr3 vssrarni.bu.h vr5, vr4, bpc_sh addi.w a5, a5, -2 addi.d a2, a2, 32 vstelm.d vr5, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr5, a0, 0, 1 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .AVG_W8_LSX b .AVG_END_LSX .AVG_W16_LSX: vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr4, vr0, vr1 vadd.h vr5, vr2, vr3 vssrarni.bu.h vr5, vr4, bpc_sh addi.w a5, a5, -1 addi.d a2, a2, 32 vst vr5, a0, 0 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .AVG_W16_LSX b .AVG_END_LSX .AVG_W32_LSX: vld vr0, a2, 0 vld vr2, a2, 16 vld vr4, a2, 32 vld vr6, a2, 48 vld vr1, a3, 0 vld vr3, a3, 16 vld vr5, a3, 32 vld vr7, a3, 48 vadd.h vr0, vr0, vr1 vadd.h vr2, vr2, vr3 vadd.h vr4, vr4, vr5 vadd.h vr6, vr6, vr7 vssrarni.bu.h vr2, vr0, bpc_sh vssrarni.bu.h vr6, vr4, bpc_sh addi.w a5, a5, -1 addi.d a2, a2, 64 vst vr2, a0, 0 vst vr6, a0, 16 addi.d a3, a3, 64 add.d a0, a0, a1 blt zero, a5, .AVG_W32_LSX b .AVG_END_LSX .AVG_W64_LSX: .rept 4 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr0, vr0, vr1 vadd.h vr2, vr2, vr3 vssrarni.bu.h vr2, vr0, bpc_sh addi.d a2, a2, 32 addi.d a3, a3, 32 vst vr2, a0, 0 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .AVG_W64_LSX b .AVG_END_LSX .AVG_W128_LSX: .rept 8 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr0, vr0, vr1 vadd.h vr2, vr2, vr3 vssrarni.bu.h vr2, vr0, bpc_sh addi.d a2, a2, 32 addi.d a3, a3, 32 vst vr2, a0, 0 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .AVG_W128_LSX .AVG_END_LSX: endfunc function avg_8bpc_lasx clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .AVG_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .AVG_LASX_JRTABLE: .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE .AVG_W4_LASX: vld vr0, a2, 0 vld vr1, a3, 0 vadd.h vr0, vr0, vr1 vssrarni.bu.h vr1, vr0, bpc_sh vstelm.w vr1, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr1, a0, 0, 1 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a0, a1 blt zero, a5, .AVG_W4_LASX b .AVG_END_LASX .AVG_W8_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 xvadd.h xr2, xr0, xr1 xvssrarni.bu.h xr1, xr2, bpc_sh xvstelm.d xr1, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr1, a0, 0, 2 addi.w a5, a5, -2 addi.d a2, a2, 32 addi.d a3, a3, 32 add.d a0, 
a1, a0 blt zero, a5, .AVG_W8_LASX b .AVG_END_LASX .AVG_W16_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvadd.h xr4, xr0, xr1 xvadd.h xr5, xr2, xr3 xvssrarni.bu.h xr5, xr4, bpc_sh xvpermi.d xr2, xr5, 0xd8 xvpermi.d xr3, xr5, 0x8d vst vr2, a0, 0 vstx vr3, a0, a1 addi.w a5, a5, -2 addi.d a2, a2, 64 addi.d a3, a3, 64 alsl.d a0, a1, a0, 1 blt zero, a5, .AVG_W16_LASX b .AVG_END_LASX .AVG_W32_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvadd.h xr4, xr0, xr1 xvadd.h xr5, xr2, xr3 xvssrarni.bu.h xr5, xr4, bpc_sh xvpermi.d xr6, xr5, 0xd8 xvst xr6, a0, 0 addi.w a5, a5, -1 addi.d a2, a2, 64 addi.d a3, a3, 64 add.d a0, a0, a1 blt zero, a5, .AVG_W32_LASX b .AVG_END_LASX .AVG_W64_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr4, a2, 64 xvld xr6, a2, 96 xvld xr1, a3, 0 xvld xr3, a3, 32 xvld xr5, a3, 64 xvld xr7, a3, 96 xvadd.h xr0, xr0, xr1 xvadd.h xr2, xr2, xr3 xvadd.h xr4, xr4, xr5 xvadd.h xr6, xr6, xr7 xvssrarni.bu.h xr2, xr0, bpc_sh xvssrarni.bu.h xr6, xr4, bpc_sh xvpermi.d xr1, xr2, 0xd8 xvpermi.d xr3, xr6, 0xd8 xvst xr1, a0, 0 xvst xr3, a0, 32 addi.w a5, a5, -1 addi.d a2, a2, 128 addi.d a3, a3, 128 add.d a0, a0, a1 blt zero, a5, .AVG_W64_LASX b .AVG_END_LASX .AVG_W128_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr4, a2, 64 xvld xr6, a2, 96 xvld xr8, a2, 128 xvld xr10, a2, 160 xvld xr12, a2, 192 xvld xr14, a2, 224 xvld xr1, a3, 0 xvld xr3, a3, 32 xvld xr5, a3, 64 xvld xr7, a3, 96 xvld xr9, a3, 128 xvld xr11, a3, 160 xvld xr13, a3, 192 xvld xr15, a3, 224 xvadd.h xr0, xr0, xr1 xvadd.h xr2, xr2, xr3 xvadd.h xr4, xr4, xr5 xvadd.h xr6, xr6, xr7 xvadd.h xr8, xr8, xr9 xvadd.h xr10, xr10, xr11 xvadd.h xr12, xr12, xr13 xvadd.h xr14, xr14, xr15 xvssrarni.bu.h xr2, xr0, bpc_sh xvssrarni.bu.h xr6, xr4, bpc_sh xvssrarni.bu.h xr10, xr8, bpc_sh xvssrarni.bu.h xr14, xr12, bpc_sh xvpermi.d xr1, xr2, 0xd8 xvpermi.d xr3, xr6, 0xd8 xvpermi.d xr5, xr10, 0xd8 xvpermi.d xr7, xr14, 0xd8 xvst xr1, a0, 0 xvst xr3, a0, 32 xvst xr5, a0, 64 xvst xr7, a0, 96 addi.w a5, a5, -1 addi.d a2, a2, 256 addi.d a3, a3, 256 add.d a0, a0, a1 blt zero, a5, .AVG_W128_LASX .AVG_END_LASX: endfunc function w_avg_8bpc_lsx addi.d t8, a0, 0 li.w t2, 16 sub.w t2, t2, a6 // 16 - weight vreplgr2vr.h vr21, a6 vreplgr2vr.h vr22, t2 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .W_AVG_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .W_AVG_LSX_JRTABLE: .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE .W_AVG_W4_LSX: vld vr0, a2, 0 vld vr1, a3, 0 vmulwev.w.h vr2, vr0, vr21 vmulwod.w.h vr3, vr0, vr21 vmaddwev.w.h vr2, vr1, vr22 vmaddwod.w.h vr3, vr1, vr22 vssrarni.hu.w vr3, vr2, bpcw_sh vssrlni.bu.h vr1, vr3, 0 vpickod.w vr4, vr2, vr1 vilvl.b vr0, vr4, vr1 fst.s f0, a0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a1, a0 blt zero, a5, .W_AVG_W4_LSX b .W_AVG_END_LSX .W_AVG_W8_LSX: vld vr0, a2, 0 vld vr1, a3, 0 vmulwev.w.h vr2, vr0, vr21 vmulwod.w.h vr3, vr0, vr21 vmaddwev.w.h vr2, vr1, vr22 vmaddwod.w.h vr3, vr1, vr22 vssrarni.hu.w vr3, vr2, bpcw_sh vssrlni.bu.h vr1, vr3, 0 vpickod.w vr4, vr2, vr1 vilvl.b vr0, vr4, vr1 fst.d f0, a0, 0 addi.w a5, a5, -1 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a0, a1 blt zero, a5, .W_AVG_W8_LSX b .W_AVG_END_LSX .W_AVG_W16_LSX: vld vr0, a2, 0 vld vr2, a2, 
16 vld vr1, a3, 0 vld vr3, a3, 16 vmulwev.w.h vr4, vr0, vr21 vmulwod.w.h vr5, vr0, vr21 vmulwev.w.h vr6, vr2, vr21 vmulwod.w.h vr7, vr2, vr21 vmaddwev.w.h vr4, vr1, vr22 vmaddwod.w.h vr5, vr1, vr22 vmaddwev.w.h vr6, vr3, vr22 vmaddwod.w.h vr7, vr3, vr22 vssrarni.hu.w vr6, vr4, bpcw_sh vssrarni.hu.w vr7, vr5, bpcw_sh vssrlrni.bu.h vr7, vr6, 0 vshuf4i.w vr8, vr7, 0x4E vilvl.b vr0, vr8, vr7 vst vr0, a0, 0 addi.w a5, a5, -1 addi.d a2, a2, 32 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .W_AVG_W16_LSX b .W_AVG_END_LSX .W_AVG_W32_LSX: .rept 2 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vmulwev.w.h vr4, vr0, vr21 vmulwod.w.h vr5, vr0, vr21 vmulwev.w.h vr6, vr2, vr21 vmulwod.w.h vr7, vr2, vr21 vmaddwev.w.h vr4, vr1, vr22 vmaddwod.w.h vr5, vr1, vr22 vmaddwev.w.h vr6, vr3, vr22 vmaddwod.w.h vr7, vr3, vr22 vssrarni.hu.w vr6, vr4, bpcw_sh vssrarni.hu.w vr7, vr5, bpcw_sh vssrlrni.bu.h vr7, vr6, 0 vshuf4i.w vr8, vr7, 0x4E vilvl.b vr0, vr8, vr7 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W32_LSX b .W_AVG_END_LSX .W_AVG_W64_LSX: .rept 4 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vmulwev.w.h vr4, vr0, vr21 vmulwod.w.h vr5, vr0, vr21 vmulwev.w.h vr6, vr2, vr21 vmulwod.w.h vr7, vr2, vr21 vmaddwev.w.h vr4, vr1, vr22 vmaddwod.w.h vr5, vr1, vr22 vmaddwev.w.h vr6, vr3, vr22 vmaddwod.w.h vr7, vr3, vr22 vssrarni.hu.w vr6, vr4, bpcw_sh vssrarni.hu.w vr7, vr5, bpcw_sh vssrlrni.bu.h vr7, vr6, 0 vshuf4i.w vr8, vr7, 0x4E vilvl.b vr0, vr8, vr7 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W64_LSX b .W_AVG_END_LSX .W_AVG_W128_LSX: .rept 8 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vmulwev.w.h vr4, vr0, vr21 vmulwod.w.h vr5, vr0, vr21 vmulwev.w.h vr6, vr2, vr21 vmulwod.w.h vr7, vr2, vr21 vmaddwev.w.h vr4, vr1, vr22 vmaddwod.w.h vr5, vr1, vr22 vmaddwev.w.h vr6, vr3, vr22 vmaddwod.w.h vr7, vr3, vr22 vssrarni.hu.w vr6, vr4, bpcw_sh vssrarni.hu.w vr7, vr5, bpcw_sh vssrlrni.bu.h vr7, vr6, 0 vshuf4i.w vr8, vr7, 0x4E vilvl.b vr0, vr8, vr7 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W128_LSX .W_AVG_END_LSX: endfunc function w_avg_8bpc_lasx addi.d t8, a0, 0 li.w t2, 16 sub.w t2, t2, a6 // 16 - weight xvreplgr2vr.h xr21, a6 xvreplgr2vr.h xr22, t2 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .W_AVG_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .W_AVG_LASX_JRTABLE: .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE .W_AVG_W4_LASX: vld vr0, a2, 0 vld vr1, a3, 0 xvpermi.d xr2, xr0, 0xD8 xvpermi.d xr3, xr1, 0xD8 xvilvl.h xr4, xr3, xr2 xvmulwev.w.h xr0, xr4, xr21 xvmaddwod.w.h xr0, xr4, xr22 xvssrarni.hu.w xr1, xr0, bpcw_sh xvssrlni.bu.h xr0, xr1, 0 fst.s f0, a0, 0 add.d a0, a0, a1 xvstelm.w xr0, a0, 0, 4 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a1, a0 blt zero, a5, .W_AVG_W4_LASX b .W_AVG_END_LASX .W_AVG_W8_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 xvmulwev.w.h xr2, xr0, xr21 xvmulwod.w.h xr3, xr0, xr21 xvmaddwev.w.h xr2, xr1, xr22 xvmaddwod.w.h xr3, xr1, xr22 xvssrarni.hu.w xr3, 
xr2, bpcw_sh xvssrlni.bu.h xr1, xr3, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 xvstelm.d xr0, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr0, a0, 0, 2 addi.w a5, a5, -2 addi.d a2, a2, 32 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .W_AVG_W8_LASX b .W_AVG_END_LASX .W_AVG_W16_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 xvmulwev.w.h xr2, xr0, xr21 xvmulwod.w.h xr3, xr0, xr21 xvmaddwev.w.h xr2, xr1, xr22 xvmaddwod.w.h xr3, xr1, xr22 xvssrarni.hu.w xr3, xr2, bpcw_sh xvssrlni.bu.h xr1, xr3, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 xvpermi.d xr1, xr0, 0xD8 vst vr1, a0, 0 addi.w a5, a5, -1 addi.d a2, a2, 32 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .W_AVG_W16_LASX b .W_AVG_END_LSX .W_AVG_W32_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvmulwev.w.h xr4, xr0, xr21 xvmulwod.w.h xr5, xr0, xr21 xvmulwev.w.h xr6, xr2, xr21 xvmulwod.w.h xr7, xr2, xr21 xvmaddwev.w.h xr4, xr1, xr22 xvmaddwod.w.h xr5, xr1, xr22 xvmaddwev.w.h xr6, xr3, xr22 xvmaddwod.w.h xr7, xr3, xr22 xvssrarni.hu.w xr6, xr4, bpcw_sh xvssrarni.hu.w xr7, xr5, bpcw_sh xvssrlni.bu.h xr7, xr6, 0 xvshuf4i.w xr8, xr7, 0x4E xvilvl.b xr9, xr8, xr7 xvpermi.d xr0, xr9, 0xD8 xvst xr0, a0, 0 addi.w a5, a5, -1 addi.d a2, a2, 64 addi.d a3, a3, 64 add.d a0, a0, a1 blt zero, a5, .W_AVG_W32_LASX b .W_AVG_END_LASX .W_AVG_W64_LASX: .rept 2 xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvmulwev.w.h xr4, xr0, xr21 xvmulwod.w.h xr5, xr0, xr21 xvmulwev.w.h xr6, xr2, xr21 xvmulwod.w.h xr7, xr2, xr21 xvmaddwev.w.h xr4, xr1, xr22 xvmaddwod.w.h xr5, xr1, xr22 xvmaddwev.w.h xr6, xr3, xr22 xvmaddwod.w.h xr7, xr3, xr22 xvssrarni.hu.w xr6, xr4, bpcw_sh xvssrarni.hu.w xr7, xr5, bpcw_sh xvssrlni.bu.h xr7, xr6, 0 xvshuf4i.w xr8, xr7, 0x4E xvilvl.b xr9, xr8, xr7 xvpermi.d xr0, xr9, 0xD8 xvst xr0, a0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a0, a0, 32 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W64_LASX b .W_AVG_END_LASX .W_AVG_W128_LASX: .rept 4 xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvmulwev.w.h xr4, xr0, xr21 xvmulwod.w.h xr5, xr0, xr21 xvmulwev.w.h xr6, xr2, xr21 xvmulwod.w.h xr7, xr2, xr21 xvmaddwev.w.h xr4, xr1, xr22 xvmaddwod.w.h xr5, xr1, xr22 xvmaddwev.w.h xr6, xr3, xr22 xvmaddwod.w.h xr7, xr3, xr22 xvssrarni.hu.w xr6, xr4, bpcw_sh xvssrarni.hu.w xr7, xr5, bpcw_sh xvssrlni.bu.h xr7, xr6, 0 xvshuf4i.w xr8, xr7, 0x4E xvilvl.b xr9, xr8, xr7 xvpermi.d xr0, xr9, 0xD8 xvst xr0, a0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a0, a0, 32 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W128_LASX .W_AVG_END_LASX: endfunc #undef bpc_sh #undef bpcw_sh #define mask_sh 10 /* static void mask_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, const uint8_t *mask HIGHBD_DECL_SUFFIX) */ function mask_8bpc_lsx vldi vr21, 0x440 // 64 vxor.v vr19, vr19, vr19 addi.d t8, a0, 0 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .MASK_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .MASK_LSX_JRTABLE: .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE .MASK_W4_LSX: vld vr0, a2, 0 vld vr1, a3, 0 fld.d f22, a6, 0 vilvl.b vr2, vr19, vr22 vsub.h vr3, vr21, vr2 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmaddwev.w.h vr4, vr1, vr3 
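/* The widening even/odd multiply-accumulate pairs used here (and in the avg_*
 * and w_avg_* functions above) implement the three bi-prediction blends of the
 * 16-bit intermediates tmp1/tmp2. A rough C sketch for 8 bpc
 * (intermediate_bits = 4; illustrative, not the dav1d C source verbatim), with
 * the rounding term supplied by the vssrarni round-and-shift instructions:
 *
 *     dst = clip_pixel((t1 + t2                + 16)  >> 5);   // avg,   bpc_sh  = 5
 *     dst = clip_pixel((t1 * w + t2 * (16 - w) + 128) >> 8);   // w_avg, bpcw_sh = 8
 *     dst = clip_pixel((t1 * m + t2 * (64 - m) + 512) >> 10);  // mask,  mask_sh = 10
 *
 * where w is the weight argument (a6 in w_avg), m is the per-pixel 0..64 mask
 * value loaded from a6 in this function, and clip_pixel() stands for clamping
 * to the 0..255 pixel range.
 */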
vmaddwod.w.h vr5, vr1, vr3 vssrarni.hu.w vr5, vr4, mask_sh vssrlrni.bu.h vr1, vr5, 0 vpickod.w vr4, vr2, vr1 vilvl.b vr0, vr4, vr1 fst.s f0, a0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 addi.d a2, a2, 16 addi.d a3, a3, 16 addi.d a6, a6, 8 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W4_LSX b .MASK_END_LSX .MASK_W8_LSX: vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 fst.d f0, a0, 0 add.d a0, a0, a1 vstelm.d vr0, a0, 0, 1 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W8_LSX b .MASK_END_LSX .MASK_W16_LSX: vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 add.d a0, a0, a1 addi.w a5, a5, -1 blt zero, a5, .MASK_W16_LSX b .MASK_END_LSX .MASK_W32_LSX: .rept 2 vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 addi.d a0, a0, 16 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W32_LSX b .MASK_END_LSX .MASK_W64_LSX: .rept 4 vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 addi.d a0, a0, 16 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W64_LSX b .MASK_END_LSX .MASK_W128_LSX: .rept 8 vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, 
vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 addi.d a0, a0, 16 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W128_LSX .MASK_END_LSX: endfunc function mask_8bpc_lasx xvldi xr21, 0x440 // 64 xvxor.v xr19, xr19, xr19 addi.d t8, a0, 0 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .MASK_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .MASK_LASX_JRTABLE: .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE .MASK_W4_LASX: vld vr0, a2, 0 vld vr1, a3, 0 fld.d f22, a6, 0 vilvl.h vr4, vr1, vr0 vilvh.h vr14, vr1, vr0 vilvl.b vr2, vr19, vr22 vsub.h vr3, vr21, vr2 xvpermi.q xr14, xr4, 0x20 vilvl.h vr5, vr3, vr2 vilvh.h vr15, vr3, vr2 xvpermi.q xr15, xr5, 0x20 xvmulwev.w.h xr0, xr14, xr15 xvmaddwod.w.h xr0, xr14, xr15 xvssrarni.hu.w xr1, xr0, mask_sh xvssrlni.bu.h xr2, xr1, 0 fst.s f2, a0, 0 add.d a0, a0, a1 xvstelm.w xr2, a0, 0, 4 addi.d a2, a2, 16 addi.d a3, a3, 16 addi.d a6, a6, 8 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W4_LASX b .MASK_END_LASX .MASK_W8_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 vld vr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvsub.h xr3, xr21, xr2 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvssrarni.hu.w xr5, xr4, mask_sh xvssrlni.bu.h xr1, xr5, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 fst.d f0, a0, 0 add.d a0, a0, a1 xvstelm.d xr0, a0, 0, 2 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W8_LASX b .MASK_END_LASX .MASK_W16_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 vld vr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvsub.h xr3, xr21, xr2 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvssrarni.hu.w xr5, xr4, mask_sh xvssrlni.bu.h xr1, xr5, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 xvpermi.d xr1, xr0, 0xD8 vst vr1, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 add.d a0, a0, a1 addi.w a5, a5, -1 blt zero, a5, .MASK_W16_LASX b .MASK_END_LASX .MASK_W32_LASX: xvld xr0, a2, 0 xvld xr10, a2, 32 xvld xr1, a3, 0 xvld xr11, a3, 32 xvld xr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvpermi.q xr4, xr22, 0x01 vext2xv.hu.bu xr12, xr4 xvsub.h xr3, xr21, xr2 xvsub.h xr13, xr21, xr12 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmulwev.w.h xr14, xr10, xr12 xvmulwod.w.h xr15, xr10, xr12 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvmaddwev.w.h xr14, xr11, xr13 xvmaddwod.w.h xr15, xr11, xr13 xvssrarni.hu.w xr14, xr4, mask_sh xvssrarni.hu.w xr15, xr5, mask_sh xvssrlni.bu.h xr15, xr14, 0 xvshuf4i.w xr6, xr15, 0x4E xvilvl.b xr1, xr6, xr15 xvpermi.d xr0, xr1, 0xD8 xvst xr0, a0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a6, a6, 32 add.d a0, a0, a1 addi.w a5, a5, -1 blt zero, a5, .MASK_W32_LASX b .MASK_END_LASX .MASK_W64_LASX: .rept 2 xvld xr0, a2, 0 xvld xr10, a2, 32 xvld xr1, a3, 0 xvld xr11, a3, 32 xvld xr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvpermi.q xr4, xr22, 0x01 vext2xv.hu.bu xr12, xr4 
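/* Note on the .*_JRTABLE width dispatch used throughout this file (including
 * .MASK_LASX_JRTABLE above): each table holds 16-bit offsets relative to the
 * table label, ordered from w = 128 down to w = 4, and is indexed with
 * clz(w) - 24. A minimal C sketch (illustrative only):
 *
 *     int idx = __builtin_clz(w) - 24;              // 128 -> 0, 64 -> 1, ..., 4 -> 5
 *     int16_t off = ((const int16_t *)jrtable)[idx];
 *     void *target = (char *)jrtable + off;         // then: jirl $r0, target, 0
 */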
xvsub.h xr3, xr21, xr2 xvsub.h xr13, xr21, xr12 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmulwev.w.h xr14, xr10, xr12 xvmulwod.w.h xr15, xr10, xr12 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvmaddwev.w.h xr14, xr11, xr13 xvmaddwod.w.h xr15, xr11, xr13 xvssrarni.hu.w xr14, xr4, mask_sh xvssrarni.hu.w xr15, xr5, mask_sh xvssrlni.bu.h xr15, xr14, 0 xvshuf4i.w xr6, xr15, 0x4E xvilvl.b xr1, xr6, xr15 xvpermi.d xr0, xr1, 0xD8 xvst xr0, a0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a6, a6, 32 addi.d a0, a0, 32 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W64_LASX b .MASK_END_LASX .MASK_W128_LASX: .rept 4 xvld xr0, a2, 0 xvld xr10, a2, 32 xvld xr1, a3, 0 xvld xr11, a3, 32 xvld xr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvpermi.q xr4, xr22, 0x01 vext2xv.hu.bu xr12, xr4 xvsub.h xr3, xr21, xr2 xvsub.h xr13, xr21, xr12 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmulwev.w.h xr14, xr10, xr12 xvmulwod.w.h xr15, xr10, xr12 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvmaddwev.w.h xr14, xr11, xr13 xvmaddwod.w.h xr15, xr11, xr13 xvssrarni.hu.w xr14, xr4, mask_sh xvssrarni.hu.w xr15, xr5, mask_sh xvssrlni.bu.h xr15, xr14, 0 xvshuf4i.w xr6, xr15, 0x4E xvilvl.b xr1, xr6, xr15 xvpermi.d xr0, xr1, 0xD8 xvst xr0, a0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a6, a6, 32 addi.d a0, a0, 32 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W128_LASX .MASK_END_LASX: endfunc /* static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, uint8_t *mask, const int sign, const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) */ function w_mask_420_8bpc_lsx addi.d sp, sp, -24 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 vldi vr20, 0x440 vreplgr2vr.h vr21, a7 vldi vr22, 0x426 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .WMASK420_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t8, t0, 0 add.d t1, t1, t8 jirl $r0, t1, 0 .align 3 .WMASK420_LSX_JRTABLE: .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE .WMASK420_W4_LSX: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a3, 0 vld vr3, a3, 16 addi.w a5, a5, -4 vabsd.h vr4, vr0, vr2 vabsd.h vr5, vr1, vr3 vaddi.hu vr4, vr4, 8 vaddi.hu vr5, vr5, 8 vsrli.h vr4, vr4, 8 vsrli.h vr5, vr5, 8 vadd.h vr4, vr4, vr22 vadd.h vr5, vr5, vr22 vmin.hu vr6, vr4, vr20 vmin.hu vr7, vr5, vr20 vsub.h vr8, vr20, vr6 vsub.h vr9, vr20, vr7 vmulwev.w.h vr4, vr6, vr0 vmulwod.w.h vr5, vr6, vr0 vmulwev.w.h vr10, vr7, vr1 vmulwod.w.h vr11, vr7, vr1 vmaddwev.w.h vr4, vr8, vr2 vmaddwod.w.h vr5, vr8, vr2 vmaddwev.w.h vr10, vr9, vr3 vmaddwod.w.h vr11, vr9, vr3 vilvl.w vr0, vr5, vr4 vilvh.w vr1, vr5, vr4 vilvl.w vr2, vr11, vr10 vilvh.w vr3, vr11, vr10 vssrarni.hu.w vr1, vr0, 10 vssrarni.hu.w vr3, vr2, 10 vssrlni.bu.h vr3, vr1, 0 vstelm.w vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 1 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 2 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 3 add.d a0, a0, a1 vpickev.h vr0, vr7, vr6 vpickod.h vr1, vr7, vr6 vadd.h vr0, vr0, vr1 vshuf4i.h vr0, vr0, 0xd8 vhaddw.w.h vr2, vr0, vr0 vpickev.h vr2, vr2, vr2 vsub.h vr2, vr2, vr21 vaddi.hu vr2, vr2, 2 vssrani.bu.h vr2, vr2, 2 vstelm.w vr2, a6, 0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 4 blt zero, a5, .WMASK420_W4_LSX b 
.END_W420 .WMASK420_W8_LSX: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a3, 0 vld vr3, a3, 16 addi.w a5, a5, -2 vabsd.h vr4, vr0, vr2 vabsd.h vr5, vr1, vr3 vaddi.hu vr4, vr4, 8 vaddi.hu vr5, vr5, 8 vsrli.h vr4, vr4, 8 vsrli.h vr5, vr5, 8 vadd.h vr4, vr4, vr22 vadd.h vr5, vr5, vr22 vmin.hu vr6, vr4, vr20 vmin.hu vr7, vr5, vr20 vsub.h vr8, vr20, vr6 vsub.h vr9, vr20, vr7 vmulwev.w.h vr4, vr6, vr0 vmulwod.w.h vr5, vr6, vr0 vmulwev.w.h vr10, vr7, vr1 vmulwod.w.h vr11, vr7, vr1 vmaddwev.w.h vr4, vr8, vr2 vmaddwod.w.h vr5, vr8, vr2 vmaddwev.w.h vr10, vr9, vr3 vmaddwod.w.h vr11, vr9, vr3 vssrarni.hu.w vr10, vr4, 10 vssrarni.hu.w vr11, vr5, 10 vssrlni.bu.h vr11, vr10, 0 vshuf4i.w vr0, vr11, 0x4E vilvl.b vr3, vr0, vr11 vstelm.d vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr3, a0, 0, 1 add.d a0, a0, a1 vpickev.h vr0, vr7, vr6 vpickod.h vr1, vr7, vr6 vadd.h vr0, vr0, vr1 vilvh.d vr2, vr0, vr0 vadd.h vr2, vr2, vr0 vsub.h vr2, vr2, vr21 vaddi.hu vr2, vr2, 2 vssrani.bu.h vr2, vr2, 2 vstelm.w vr2, a6, 0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 4 blt zero, a5, .WMASK420_W8_LSX b .END_W420 .WMASK420_W16_LSX: vld vr0, a2, 0 vld vr1, a2, 16 alsl.d a2, a4, a2, 1 vld vr2, a2, 0 vld vr3, a2, 16 vld vr4, a3, 0 vld vr5, a3, 16 alsl.d a3, a4, a3, 1 vld vr6, a3, 0 vld vr7, a3, 16 vabsd.h vr8, vr0, vr4 vabsd.h vr9, vr1, vr5 vabsd.h vr10, vr2, vr6 vabsd.h vr11, vr3, vr7 vaddi.hu vr8, vr8, 8 vaddi.hu vr9, vr9, 8 vaddi.hu vr10, vr10, 8 vaddi.hu vr11, vr11, 8 vsrli.h vr8, vr8, 8 vsrli.h vr9, vr9, 8 vsrli.h vr10, vr10, 8 vsrli.h vr11, vr11, 8 vadd.h vr8, vr8, vr22 vadd.h vr9, vr9, vr22 vadd.h vr10, vr10, vr22 vadd.h vr11, vr11, vr22 vmin.hu vr12, vr8, vr20 vmin.hu vr13, vr9, vr20 vmin.hu vr14, vr10, vr20 vmin.hu vr15, vr11, vr20 vsub.h vr16, vr20, vr12 vsub.h vr17, vr20, vr13 vsub.h vr18, vr20, vr14 vsub.h vr19, vr20, vr15 vmulwev.w.h vr8, vr12, vr0 vmulwod.w.h vr9, vr12, vr0 vmulwev.w.h vr10, vr13, vr1 vmulwod.w.h vr11, vr13, vr1 vmulwev.w.h vr23, vr14, vr2 vmulwod.w.h vr24, vr14, vr2 vmulwev.w.h vr25, vr15, vr3 vmulwod.w.h vr26, vr15, vr3 vmaddwev.w.h vr8, vr16, vr4 vmaddwod.w.h vr9, vr16, vr4 vmaddwev.w.h vr10, vr17, vr5 vmaddwod.w.h vr11, vr17, vr5 vmaddwev.w.h vr23, vr18, vr6 vmaddwod.w.h vr24, vr18, vr6 vmaddwev.w.h vr25, vr19, vr7 vmaddwod.w.h vr26, vr19, vr7 vssrarni.hu.w vr10, vr8, 10 vssrarni.hu.w vr11, vr9, 10 vssrarni.hu.w vr25, vr23, 10 vssrarni.hu.w vr26, vr24, 10 vssrlni.bu.h vr11, vr10, 0 vssrlni.bu.h vr26, vr25, 0 vshuf4i.w vr0, vr11, 0x4E vshuf4i.w vr1, vr26, 0x4E vilvl.b vr3, vr0, vr11 vilvl.b vr7, vr1, vr26 vst vr3, a0, 0 vstx vr7, a0, a1 vpickev.h vr0, vr13, vr12 vpickod.h vr1, vr13, vr12 vpickev.h vr2, vr15, vr14 vpickod.h vr3, vr15, vr14 vadd.h vr4, vr0, vr1 vadd.h vr5, vr2, vr3 vadd.h vr4, vr4, vr5 vsub.h vr4, vr4, vr21 vssrarni.bu.h vr4, vr4, 2 vstelm.d vr4, a6, 0, 0 alsl.d a2, a4, a2, 1 alsl.d a3, a4, a3, 1 alsl.d a0, a1, a0, 1 addi.d a6, a6, 8 addi.w a5, a5, -2 blt zero, a5, .WMASK420_W16_LSX b .END_W420 .WMASK420_W32_LSX: .WMASK420_W64_LSX: .WMASK420_W128_LSX: .LOOP_W32_420_LSX: add.d t1, a2, zero add.d t2, a3, zero add.d t3, a0, zero add.d t4, a6, zero alsl.d t5, a4, t1, 1 alsl.d t6, a4, t2, 1 or t7, a4, a4 .W32_420_LSX: vld vr0, t1, 0 vld vr1, t1, 16 vld vr2, t2, 0 vld vr3, t2, 16 vld vr4, t5, 0 vld vr5, t5, 16 vld vr6, t6, 0 vld vr7, t6, 16 addi.d t1, t1, 32 addi.d t2, t2, 32 addi.d t5, t5, 32 addi.d t6, t6, 32 addi.w t7, t7, -16 vabsd.h vr8, vr0, vr2 vabsd.h vr9, vr1, vr3 vabsd.h vr10, vr4, vr6 vabsd.h vr11, vr5, vr7 vaddi.hu vr8, vr8, 8 vaddi.hu vr9, vr9, 8 vaddi.hu vr10, vr10, 8 
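/* w_mask_420 sketch (hedged, 8 bpc; illustrative rather than the dav1d C
 * source verbatim): the per-pixel blend weight is derived from the difference
 * of the two intermediates, which is what the vabsd.h / +8 / >>8 / +38 /
 * min(., 64) sequence around this point computes, and the 4:2:0 mask output
 * averages each 2x2 block of weights:
 *
 *     int m   = imin(38 + ((abs(t1 - t2) + 8) >> 8), 64);
 *     dst     = clip_pixel((t1 * m + t2 * (64 - m) + 512) >> 10);
 *     // for every 2x2 block of m values:
 *     mask[x] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;   // sign = a7, splat in vr21
 *
 * imin() and clip_pixel() are illustrative names for min and clamp-to-0..255.
 */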
vaddi.hu vr11, vr11, 8 vsrli.h vr8, vr8, 8 vsrli.h vr9, vr9, 8 vsrli.h vr10, vr10, 8 vsrli.h vr11, vr11, 8 vadd.h vr8, vr8, vr22 vadd.h vr9, vr9, vr22 vadd.h vr10, vr10, vr22 vadd.h vr11, vr11, vr22 vmin.hu vr12, vr8, vr20 vmin.hu vr13, vr9, vr20 vmin.hu vr14, vr10, vr20 vmin.hu vr15, vr11, vr20 vsub.h vr16, vr20, vr12 vsub.h vr17, vr20, vr13 vsub.h vr18, vr20, vr14 vsub.h vr19, vr20, vr15 vmulwev.w.h vr8, vr12, vr0 vmulwod.w.h vr9, vr12, vr0 vmulwev.w.h vr10, vr13, vr1 vmulwod.w.h vr11, vr13, vr1 vmulwev.w.h vr23, vr14, vr4 vmulwod.w.h vr24, vr14, vr4 vmulwev.w.h vr25, vr15, vr5 vmulwod.w.h vr26, vr15, vr5 vmaddwev.w.h vr8, vr16, vr2 vmaddwod.w.h vr9, vr16, vr2 vmaddwev.w.h vr10, vr17, vr3 vmaddwod.w.h vr11, vr17, vr3 vmaddwev.w.h vr23, vr18, vr6 vmaddwod.w.h vr24, vr18, vr6 vmaddwev.w.h vr25, vr19, vr7 vmaddwod.w.h vr26, vr19, vr7 vssrarni.hu.w vr10, vr8, 10 vssrarni.hu.w vr11, vr9, 10 vssrarni.hu.w vr25, vr23, 10 vssrarni.hu.w vr26, vr24, 10 vssrlni.bu.h vr11, vr10, 0 vssrlni.bu.h vr26, vr25, 0 vshuf4i.w vr8, vr11, 0x4E vshuf4i.w vr9, vr26, 0x4E vilvl.b vr3, vr8, vr11 vilvl.b vr7, vr9, vr26 vst vr3, t3, 0 vstx vr7, a1, t3 addi.d t3, t3, 16 vpickev.h vr8, vr13, vr12 vpickod.h vr9, vr13, vr12 vpickev.h vr10, vr15, vr14 vpickod.h vr11, vr15, vr14 vadd.h vr8, vr8, vr9 vadd.h vr10, vr10, vr11 vadd.h vr12, vr8, vr10 vsub.h vr12, vr12, vr21 vssrarni.bu.h vr12, vr12, 2 vstelm.d vr12, t4, 0, 0 addi.d t4, t4, 8 bne t7, zero, .W32_420_LSX alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 alsl.d a0, a1, a0, 1 srai.w t8, a4, 1 add.d a6, a6, t8 addi.w a5, a5, -2 blt zero, a5, .LOOP_W32_420_LSX .END_W420: fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 addi.d sp, sp, 24 endfunc function w_mask_420_8bpc_lasx xvldi xr20, 0x440 xvreplgr2vr.h xr21, a7 xvldi xr22, 0x426 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .WMASK420_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t8, t0, 0 add.d t1, t1, t8 jirl $r0, t1, 0 .align 3 .WMASK420_LASX_JRTABLE: .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE .WMASK420_W4_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 addi.w a5, a5, -4 xvabsd.h xr2, xr0, xr1 xvaddi.hu xr2, xr2, 8 xvsrli.h xr2, xr2, 8 xvadd.h xr2, xr2, xr22 xvmin.hu xr3, xr2, xr20 xvsub.h xr4, xr20, xr3 xvmulwev.w.h xr5, xr3, xr0 xvmulwod.w.h xr6, xr3, xr0 xvmaddwev.w.h xr5, xr4, xr1 xvmaddwod.w.h xr6, xr4, xr1 xvilvl.w xr7, xr6, xr5 xvilvh.w xr8, xr6, xr5 xvssrarni.hu.w xr8, xr7, 10 xvssrlni.bu.h xr9, xr8, 0 vstelm.w vr9, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr9, a0, 0, 1 add.d a0, a0, a1 xvstelm.w xr9, a0, 0, 4 add.d a0, a0, a1 xvstelm.w xr9, a0, 0, 5 add.d a0, a0, a1 xvhaddw.w.h xr3, xr3, xr3 xvpermi.d xr4, xr3, 0xb1 xvadd.h xr3, xr3, xr4 xvpickev.h xr3, xr3, xr3 xvsub.h xr3, xr3, xr21 xvssrarni.bu.h xr3, xr3, 2 vstelm.h vr3, a6, 0, 0 xvstelm.h xr3, a6, 2, 8 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 4 blt zero, a5, .WMASK420_W4_LASX b .END_W420_LASX .WMASK420_W8_LASX: xvld xr0, a2, 0 xvld xr1, a2, 32 xvld xr2, a3, 0 xvld xr3, a3, 32 addi.w a5, a5, -4 xvabsd.h xr4, xr0, xr2 xvabsd.h xr5, xr1, xr3 xvaddi.hu xr4, xr4, 8 xvaddi.hu xr5, xr5, 8 xvsrli.h xr4, xr4, 8 xvsrli.h xr5, xr5, 8 xvadd.h xr4, xr4, xr22 xvadd.h xr5, xr5, xr22 xvmin.hu xr6, xr4, xr20 xvmin.hu xr7, xr5, xr20 xvsub.h xr8, xr20, xr6 xvsub.h xr9, xr20, xr7 xvmulwev.w.h xr10, xr6, xr0 xvmulwod.w.h 
xr11, xr6, xr0 xvmulwev.w.h xr12, xr7, xr1 xvmulwod.w.h xr13, xr7, xr1 xvmaddwev.w.h xr10, xr8, xr2 xvmaddwod.w.h xr11, xr8, xr2 xvmaddwev.w.h xr12, xr9, xr3 xvmaddwod.w.h xr13, xr9, xr3 xvssrarni.hu.w xr12, xr10, 10 xvssrarni.hu.w xr13, xr11, 10 xvssrlni.bu.h xr13, xr12, 0 xvshuf4i.w xr1, xr13, 0x4E xvilvl.b xr17, xr1, xr13 vstelm.d vr17, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr17, a0, 0, 2 add.d a0, a0, a1 xvstelm.d xr17, a0, 0, 1 add.d a0, a0, a1 xvstelm.d xr17, a0, 0, 3 add.d a0, a0, a1 xvhaddw.w.h xr6, xr6, xr6 xvhaddw.w.h xr7, xr7, xr7 xvpickev.h xr8, xr7, xr6 xvpermi.q xr9, xr8, 0x01 vadd.h vr8, vr8, vr9 vsub.h vr8, vr8, vr21 vssrarni.bu.h vr8, vr8, 2 vstelm.d vr8, a6, 0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a6, a6, 8 blt zero, a5, .WMASK420_W8_LASX b .END_W420_LASX .WMASK420_W16_LASX: xvld xr0, a2, 0 xvld xr1, a2, 32 xvld xr2, a3, 0 xvld xr3, a3, 32 addi.w a5, a5, -2 xvabsd.h xr4, xr0, xr2 xvabsd.h xr5, xr1, xr3 xvaddi.hu xr4, xr4, 8 xvaddi.hu xr5, xr5, 8 xvsrli.h xr4, xr4, 8 xvsrli.h xr5, xr5, 8 xvadd.h xr4, xr4, xr22 xvadd.h xr5, xr5, xr22 xvmin.hu xr4, xr4, xr20 xvmin.hu xr5, xr5, xr20 xvsub.h xr6, xr20, xr4 xvsub.h xr7, xr20, xr5 xvmulwev.w.h xr8, xr4, xr0 xvmulwod.w.h xr9, xr4, xr0 xvmulwev.w.h xr10, xr5, xr1 xvmulwod.w.h xr11, xr5, xr1 xvmaddwev.w.h xr8, xr6, xr2 xvmaddwod.w.h xr9, xr6, xr2 xvmaddwev.w.h xr10, xr7, xr3 xvmaddwod.w.h xr11, xr7, xr3 xvssrarni.hu.w xr10, xr8, 10 xvssrarni.hu.w xr11, xr9, 10 xvssrlni.bu.h xr11, xr10, 0 xvshuf4i.w xr8, xr11, 0x4E xvilvl.b xr15, xr8, xr11 xvpermi.d xr16, xr15, 0xd8 vst vr16, a0, 0 add.d a0, a0, a1 xvpermi.q xr16, xr16, 0x01 vst vr16, a0, 0 add.d a0, a0, a1 xvhaddw.w.h xr4, xr4, xr4 xvhaddw.w.h xr5, xr5, xr5 xvadd.h xr4, xr5, xr4 xvpickev.h xr6, xr4, xr4 xvpermi.d xr7, xr6, 0x08 vsub.h vr7, vr7, vr21 vssrarni.bu.h vr7, vr7, 2 vstelm.d vr7, a6, 0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a6, a6, 8 blt zero, a5, .WMASK420_W16_LASX b .END_W420_LASX .WMASK420_W32_LASX: .WMASK420_W64_LASX: .WMASK420_W128_LASX: .LOOP_W32_420_LASX: add.d t1, a2, zero add.d t2, a3, zero add.d t3, a0, zero add.d t4, a6, zero alsl.d t5, a4, t1, 1 alsl.d t6, a4, t2, 1 or t7, a4, a4 .W32_420_LASX: xvld xr0, t1, 0 xvld xr1, t2, 0 xvld xr2, t5, 0 xvld xr3, t6, 0 addi.d t1, t1, 32 addi.d t2, t2, 32 addi.d t5, t5, 32 addi.d t6, t6, 32 addi.w t7, t7, -16 xvabsd.h xr4, xr0, xr1 xvabsd.h xr5, xr2, xr3 xvaddi.hu xr4, xr4, 8 xvaddi.hu xr5, xr5, 8 xvsrli.h xr4, xr4, 8 xvsrli.h xr5, xr5, 8 xvadd.h xr4, xr4, xr22 xvadd.h xr5, xr5, xr22 xvmin.hu xr6, xr4, xr20 xvmin.hu xr7, xr5, xr20 xvsub.h xr8, xr20, xr6 xvsub.h xr9, xr20, xr7 xvmulwev.w.h xr10, xr6, xr0 xvmulwod.w.h xr11, xr6, xr0 xvmulwev.w.h xr12, xr7, xr2 xvmulwod.w.h xr13, xr7, xr2 xvmaddwev.w.h xr10, xr8, xr1 xvmaddwod.w.h xr11, xr8, xr1 xvmaddwev.w.h xr12, xr9, xr3 xvmaddwod.w.h xr13, xr9, xr3 xvssrarni.hu.w xr12, xr10, 10 xvssrarni.hu.w xr13, xr11, 10 xvssrlni.bu.h xr13, xr12, 0 xvshuf4i.w xr10, xr13, 0x4E xvilvl.b xr17, xr10, xr13 xvpermi.d xr18, xr17, 0x08 xvpermi.d xr19, xr17, 0x0d vst vr18, t3, 0 vstx vr19, t3, a1 addi.d t3, t3, 16 xvhaddw.w.h xr6, xr6, xr6 xvhaddw.w.h xr7, xr7, xr7 xvadd.h xr6, xr7, xr6 xvpickev.h xr7, xr6, xr6 xvpermi.d xr8, xr7, 0x08 vsub.h vr9, vr8, vr21 vssrarni.bu.h vr9, vr9, 2 vstelm.d vr9, t4, 0, 0 addi.d t4, t4, 8 bne t7, zero, .W32_420_LASX alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 alsl.d a0, a1, a0, 1 srai.w t8, a4, 1 add.d a6, a6, t8 addi.w a5, a5, -2 blt zero, a5, .LOOP_W32_420_LASX .END_W420_LASX: endfunc #undef bpc_sh #undef bpcw_sh .macro vhaddw.d.h in0 vhaddw.w.h 
\in0, \in0, \in0 vhaddw.d.w \in0, \in0, \in0 .endm .macro vhaddw.q.w in0 vhaddw.d.w \in0, \in0, \in0 vhaddw.q.d \in0, \in0, \in0 .endm .macro PUT_H_8W in0 vbsrl.v vr2, \in0, 1 vbsrl.v vr3, \in0, 2 vbsrl.v vr4, \in0, 3 vbsrl.v vr5, \in0, 4 vbsrl.v vr6, \in0, 5 vbsrl.v vr7, \in0, 6 vbsrl.v vr10, \in0, 7 vilvl.d vr2, vr2, \in0 vilvl.d vr3, vr4, vr3 vilvl.d vr4, vr6, vr5 vilvl.d vr5, vr10, vr7 vdp2.h.bu.b \in0, vr2, vr8 vdp2.h.bu.b vr2, vr3, vr8 vdp2.h.bu.b vr3, vr4, vr8 vdp2.h.bu.b vr4, vr5, vr8 vhaddw.d.h \in0 vhaddw.d.h vr2 vhaddw.d.h vr3 vhaddw.d.h vr4 vpickev.w \in0, vr2, \in0 vpickev.w vr2, vr4, vr3 vpickev.h \in0, vr2, \in0 vadd.h \in0, \in0, vr9 .endm .macro FILTER_8TAP_4W in0 vbsrl.v vr10, \in0, 1 vbsrl.v vr11, \in0, 2 vbsrl.v vr12, \in0, 3 vilvl.d vr10, vr10, \in0 vilvl.d vr11, vr12, vr11 vdp2.h.bu.b vr7, vr10, vr8 vdp2.h.bu.b vr10, vr11, vr8 vhaddw.d.h vr7 vhaddw.d.h vr10 vpickev.w \in0, vr10, vr7 .endm .macro FILTER_8TAP_8W in0 vbsrl.v vr10, \in0, 1 vbsrl.v vr11, \in0, 2 vbsrl.v vr12, \in0, 3 vbsrl.v vr13, \in0, 4 vbsrl.v vr14, \in0, 5 vbsrl.v vr15, \in0, 6 vbsrl.v vr16, \in0, 7 vilvl.d vr10, vr10, \in0 vilvl.d vr11, vr12, vr11 vilvl.d vr12, vr14, vr13 vilvl.d vr13, vr16, vr15 vdp2.h.bu.b vr14, vr10, vr8 vdp2.h.bu.b vr15, vr11, vr8 vdp2.h.bu.b vr16, vr12, vr8 vdp2.h.bu.b vr17, vr13, vr8 vhaddw.d.h vr14 vhaddw.d.h vr15 vhaddw.d.h vr16 vhaddw.d.h vr17 vpickev.w vr13, vr15, vr14 vpickev.w vr14, vr17, vr16 vpickev.h \in0, vr14, vr13 //x0 ... x7 vsrari.h \in0, \in0, 2 .endm .macro FILTER_8TAP_8W_CLIP_STORE vdp2.w.h vr12, vr0, vr9 vdp2.w.h vr13, vr1, vr9 vdp2.w.h vr14, vr2, vr9 vdp2.w.h vr15, vr3, vr9 vdp2.w.h vr16, vr4, vr9 vdp2.w.h vr17, vr5, vr9 vdp2.w.h vr18, vr6, vr9 vdp2.w.h vr19, vr7, vr9 vhaddw.q.w vr12 vhaddw.q.w vr13 vhaddw.q.w vr14 vhaddw.q.w vr15 vhaddw.q.w vr16 vhaddw.q.w vr17 vhaddw.q.w vr18 vhaddw.q.w vr19 vpackev.w vr12, vr13, vr12 vpackev.w vr13, vr15, vr14 vpackev.d vr12, vr13, vr12 vpackev.w vr14, vr17, vr16 vpackev.w vr15, vr19, vr18 vpackev.d vr13, vr15, vr14 vssrarni.hu.w vr13, vr12, 10 vssrani.bu.h vr13, vr13, 0 vstelm.d vr13, a0, 0, 0 add.d a0, a0, a1 .endm .macro VEXTRINS_Hx8 in0 vextrins.h vr0, \in0, 0x70 vextrins.h vr1, \in0, 0x71 vextrins.h vr2, \in0, 0x72 vextrins.h vr3, \in0, 0x73 vextrins.h vr4, \in0, 0x74 vextrins.h vr5, \in0, 0x75 vextrins.h vr6, \in0, 0x76 vextrins.h vr7, \in0, 0x77 .endm .macro VBSRL_Vx8 vbsrl.v vr0, vr0, 2 vbsrl.v vr1, vr1, 2 vbsrl.v vr2, vr2, 2 vbsrl.v vr3, vr3, 2 vbsrl.v vr4, vr4, 2 vbsrl.v vr5, vr5, 2 vbsrl.v vr6, vr6, 2 vbsrl.v vr7, vr7, 2 .endm .macro PUT_8TAP_8BPC_LSX lable li.w t0, 4 la.local t6, dav1d_mc_subpel_filters slli.d t2, a3, 1 //src_stride*2 add.d t3, t2, a3 //src_stride*3 slli.d t4, t2, 1 //src_stride*4 bnez a6, .l_\lable\()put_h //mx bnez a7, .l_\lable\()put_v //my clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_hv0_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_hv0_jtable: .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable .l_\lable\()put_hv0_2w: vldrepl.h vr0, a2, 0 add.d a2, a2, a3 vldrepl.h vr1, a2, 0 vstelm.h vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr1, a0, 0, 0 add.d a2, a2, 
a3 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_2w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_4w: fld.s f0, a2, 0 fldx.s f1, a2, a3 fst.s f0, a0, 0 fstx.s f1, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_4w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_8w: fld.d f0, a2, 0 fldx.d f1, a2, a3 fst.d f0, a0, 0 fstx.d f1, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_8w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_16w: vld vr0, a2, 0 vldx vr1, a2, a3 vst vr0, a0, 0 vstx vr1, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_16w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_32w: vld vr0, a2, 0 vld vr1, a2, 16 add.d a2, a2, a3 vld vr2, a2, 0 vld vr3, a2, 16 vst vr0, a0, 0 vst vr1, a0, 16 add.d a0, a0, a1 vst vr2, a0, 0 vst vr3, a0, 16 add.d a2, a2, a3 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_32w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_64w: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a2, 32 vld vr3, a2, 48 add.d a2, a2, a3 vld vr4, a2, 0 vld vr5, a2, 16 vld vr6, a2, 32 vld vr7, a2, 48 add.d a2, a2, a3 vst vr0, a0, 0 vst vr1, a0, 16 vst vr2, a0, 32 vst vr3, a0, 48 add.d a0, a0, a1 vst vr4, a0, 0 vst vr5, a0, 16 vst vr6, a0, 32 vst vr7, a0, 48 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_64w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_128w: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a2, 32 vld vr3, a2, 48 vld vr4, a2, 64 vld vr5, a2, 80 vld vr6, a2, 96 vld vr7, a2, 112 add.d a2, a2, a3 vld vr8, a2, 0 vld vr9, a2, 16 vld vr10, a2, 32 vld vr11, a2, 48 vld vr12, a2, 64 vld vr13, a2, 80 vld vr14, a2, 96 vld vr15, a2, 112 add.d a2, a2, a3 vst vr0, a0, 0 vst vr1, a0, 16 vst vr2, a0, 32 vst vr3, a0, 48 vst vr4, a0, 64 vst vr5, a0, 80 vst vr6, a0, 96 vst vr7, a0, 112 add.d a0, a0, a1 vst vr8, a0, 0 vst vr9, a0, 16 vst vr10, a0, 32 vst vr11, a0, 48 vst vr12, a0, 64 vst vr13, a0, 80 vst vr14, a0, 96 vst vr15, a0, 112 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_128w b .l_\lable\()end_put_8tap .l_\lable\()put_h: bnez a7, .l_\lable\()put_hv //if(fh) && if (fv) ld.d t5, sp, 0 //filter_type andi t1, t5, 3 blt t0, a4, .l_\lable\()put_h_idx_fh andi t1, t5, 1 addi.w t1, t1, 3 .l_\lable\()put_h_idx_fh: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset vldrepl.d vr8, t1, 0 addi.d a2, a2, -3 li.w t1, 34 vreplgr2vr.h vr9, t1 clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_h_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_h_jtable: .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable .l_\lable\()put_h_2w: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 vbsrl.v vr2, vr0, 1 vilvl.d vr0, vr2, vr0 vdp2.h.bu.b vr2, vr0, vr8 vhaddw.w.h vr0, vr2, vr2 vhaddw.d.w vr0, vr0, vr0 vbsrl.v vr2, vr1, 1 vilvl.d vr1, vr2, vr1 vdp2.h.bu.b vr2, vr1, vr8 vhaddw.w.h vr1, vr2, vr2 vhaddw.d.w vr1, vr1, vr1 vpickev.w vr0, vr1, vr0 vpickev.h vr0, vr0, vr0 vadd.h vr0, vr0, vr9 vssrani.bu.h vr0, vr0, 6 vstelm.h vr0, a0, 0, 0 add.d a0, a0, 
a1 vstelm.h vr0, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_h_2w b .l_\lable\()end_put_8tap .l_\lable\()put_h_4w: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 vbsrl.v vr2, vr0, 1 vbsrl.v vr3, vr0, 2 vbsrl.v vr4, vr0, 3 vilvl.d vr0, vr2, vr0 //x0 x1 vilvl.d vr2, vr4, vr3 //x2 x3 vdp2.h.bu.b vr3, vr0, vr8 vdp2.h.bu.b vr4, vr2, vr8 vhaddw.w.h vr0, vr3, vr3 vhaddw.d.w vr0, vr0, vr0 vhaddw.w.h vr2, vr4, vr4 vhaddw.d.w vr2, vr2, vr2 vpickev.w vr5, vr2, vr0 vbsrl.v vr2, vr1, 1 vbsrl.v vr3, vr1, 2 vbsrl.v vr4, vr1, 3 vilvl.d vr0, vr2, vr1 //x0 x1 vilvl.d vr2, vr4, vr3 //x2 x3 vdp2.h.bu.b vr3, vr0, vr8 vdp2.h.bu.b vr4, vr2, vr8 vhaddw.w.h vr0, vr3, vr3 vhaddw.d.w vr0, vr0, vr0 vhaddw.w.h vr2, vr4, vr4 vhaddw.d.w vr2, vr2, vr2 vpickev.w vr6, vr2, vr0 vpickev.h vr0, vr6, vr5 vadd.h vr0, vr0, vr9 vssrani.bu.h vr0, vr0, 6 vstelm.w vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 add.d a0, a0, a1 addi.d a5, a5, -2 bnez a5, .l_\lable\()put_h_4w b .l_\lable\()end_put_8tap .l_\lable\()put_h_8w: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 PUT_H_8W vr0 PUT_H_8W vr1 vssrani.bu.h vr1, vr0, 6 vstelm.d vr1, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr1, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_h_8w b .l_\lable\()end_put_8tap .l_\lable\()put_h_16w: .l_\lable\()put_h_32w: .l_\lable\()put_h_64w: .l_\lable\()put_h_128w: addi.d t0, a2, 0 //src addi.w t5, a5, 0 //h addi.d t8, a0, 0 //dst .l_\lable\()put_h_16w_loop: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 PUT_H_8W vr0 PUT_H_8W vr1 vssrani.bu.h vr1, vr0, 6 vstelm.d vr1, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr1, a0, 0, 1 add.d a0, a0, a1 addi.d a5, a5, -2 bnez a5, .l_\lable\()put_h_16w_loop addi.d a2, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 8 addi.d t8, t8, 8 addi.w a5, t5, 0 addi.w a4, a4, -8 bnez a4, .l_\lable\()put_h_16w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_v: ld.d t1, sp, 0 //filter_type srli.w t1, t1, 2 blt t0, a5, .l_\lable\()put_v_idx_fv andi t1, t1, 1 addi.w t1, t1, 3 .l_\lable\()put_v_idx_fv: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a7, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fv's offset vldrepl.d vr8, t1, 0 sub.d a2, a2, t3 clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_v_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_v_jtable: .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable .l_\lable\()put_v_2w: fld.s f0, a2, 0 fldx.s f1, a2, a3 fldx.s f2, a2, t2 add.d a2, a2, t3 fld.s f3, a2, 0 fldx.s f4, a2, a3 fldx.s f5, a2, t2 fldx.s f6, a2, t3 add.d a2, a2, t4 vilvl.b vr0, vr1, vr0 vilvl.b vr1, vr3, vr2 vilvl.b vr2, vr5, vr4 vilvl.b vr3, vr7, vr6 vilvl.h vr0, vr1, vr0 vilvl.h vr1, vr3, vr2 vilvl.w vr0, vr1, vr0 .l_\lable\()put_v_2w_loop: fld.s f7, a2, 0 //h0 fldx.s f10, a2, a3 //h1 add.d a2, a2, t2 vextrins.b vr0, vr7, 0x70 vextrins.b vr0, vr7, 0xf1 vbsrl.v vr1, vr0, 1 vextrins.b vr1, vr10, 0x70 vextrins.b vr1, vr10, 0xf1 vdp2.h.bu.b vr10, vr0, vr8 vdp2.h.bu.b vr11, vr1, vr8 vbsrl.v vr0, vr1, 1 vhaddw.d.h vr10 vhaddw.d.h vr11 vpickev.w vr10, vr11, vr10 vssrarni.hu.w vr10, vr10, 6 vssrani.bu.h vr10, vr10, 0 vstelm.h vr10, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr10, 
a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_v_2w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_v_4w: fld.s f0, a2, 0 fldx.s f1, a2, a3 fldx.s f2, a2, t2 add.d a2, a2, t3 fld.s f3, a2, 0 fldx.s f4, a2, a3 fldx.s f5, a2, t2 fldx.s f6, a2, t3 add.d a2, a2, t4 vilvl.b vr0, vr1, vr0 vilvl.b vr1, vr3, vr2 vilvl.b vr2, vr5, vr4 vilvl.b vr3, vr7, vr6 vilvl.h vr0, vr1, vr0 vilvl.h vr1, vr3, vr2 vilvl.w vr2, vr1, vr0 vilvh.w vr3, vr1, vr0 .l_\lable\()put_v_4w_loop: fld.s f7, a2, 0 fldx.s f10, a2, a3 add.d a2, a2, t2 vextrins.b vr2, vr7, 0x70 vextrins.b vr2, vr7, 0xf1 //x0x1(h0) vbsrl.v vr4, vr2, 1 vextrins.b vr4, vr10, 0x70 vextrins.b vr4, vr10, 0xf1 //x0x1(h1) vdp2.h.bu.b vr11, vr2, vr8 vdp2.h.bu.b vr12, vr4, vr8 vbsrl.v vr2, vr4, 1 vextrins.b vr3, vr7, 0x72 vextrins.b vr3, vr7, 0xf3 //x2x3(h0) vbsrl.v vr4, vr3, 1 vextrins.b vr4, vr10, 0x72 vextrins.b vr4, vr10, 0xf3 //x2x3(h1) vdp2.h.bu.b vr13, vr3, vr8 vdp2.h.bu.b vr14, vr4, vr8 vbsrl.v vr3, vr4, 1 vhaddw.d.h vr11 vhaddw.d.h vr12 vhaddw.d.h vr13 vhaddw.d.h vr14 vpickev.w vr11, vr13, vr11 vpickev.w vr12, vr14, vr12 vpickev.h vr11, vr12, vr11 vssrarni.bu.h vr11, vr11, 6 vstelm.w vr11, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr11, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_v_4w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_v_8w: .l_\lable\()put_v_16w: .l_\lable\()put_v_32w: .l_\lable\()put_v_64w: .l_\lable\()put_v_128w: addi.d t0, a2, 0 //src addi.d t5, a5, 0 //h addi.d t8, a0, 0 //dst .l_\lable\()put_v_8w_loop0: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t2 add.d a2, a2, t3 fld.d f3, a2, 0 fldx.d f4, a2, a3 fldx.d f5, a2, t2 fldx.d f6, a2, t3 add.d a2, a2, t4 vilvl.b vr0, vr1, vr0 vilvl.b vr1, vr3, vr2 vilvl.b vr2, vr5, vr4 vilvl.b vr3, vr7, vr6 vilvl.h vr4, vr1, vr0 vilvh.h vr5, vr1, vr0 vilvl.h vr6, vr3, vr2 vilvh.h vr7, vr3, vr2 vilvl.w vr0, vr6, vr4 // x0x1 vilvh.w vr1, vr6, vr4 // x2x3 vilvl.w vr2, vr7, vr5 // x4x5 vilvh.w vr3, vr7, vr5 // x6x7 .l_\lable\()put_v_8w_loop: fld.d f7, a2, 0 fldx.d f10, a2, a3 add.d a2, a2, t2 //h0 vextrins.b vr0, vr7, 0x70 vextrins.b vr0, vr7, 0xf1 vextrins.b vr1, vr7, 0x72 vextrins.b vr1, vr7, 0xf3 vextrins.b vr2, vr7, 0x74 vextrins.b vr2, vr7, 0xf5 vextrins.b vr3, vr7, 0x76 vextrins.b vr3, vr7, 0xf7 vdp2.h.bu.b vr11, vr0, vr8 vdp2.h.bu.b vr12, vr1, vr8 vdp2.h.bu.b vr13, vr2, vr8 vdp2.h.bu.b vr14, vr3, vr8 vhaddw.d.h vr11 vhaddw.d.h vr12 vhaddw.d.h vr13 vhaddw.d.h vr14 vpickev.w vr11, vr12, vr11 vpickev.w vr12, vr14, vr13 vpickev.h vr11, vr12, vr11 vssrarni.bu.h vr11, vr11, 6 fst.d f11, a0, 0 add.d a0, a0, a1 //h1 vbsrl.v vr0, vr0, 1 vbsrl.v vr1, vr1, 1 vbsrl.v vr2, vr2, 1 vbsrl.v vr3, vr3, 1 vextrins.b vr0, vr10, 0x70 vextrins.b vr0, vr10, 0xf1 vextrins.b vr1, vr10, 0x72 vextrins.b vr1, vr10, 0xf3 vextrins.b vr2, vr10, 0x74 vextrins.b vr2, vr10, 0xf5 vextrins.b vr3, vr10, 0x76 vextrins.b vr3, vr10, 0xf7 vdp2.h.bu.b vr11, vr0, vr8 vdp2.h.bu.b vr12, vr1, vr8 vdp2.h.bu.b vr13, vr2, vr8 vdp2.h.bu.b vr14, vr3, vr8 vhaddw.d.h vr11 vhaddw.d.h vr12 vhaddw.d.h vr13 vhaddw.d.h vr14 vpickev.w vr11, vr12, vr11 vpickev.w vr12, vr14, vr13 vpickev.h vr11, vr12, vr11 vssrarni.bu.h vr11, vr11, 6 fst.d f11, a0, 0 add.d a0, a0, a1 vbsrl.v vr0, vr0, 1 vbsrl.v vr1, vr1, 1 vbsrl.v vr2, vr2, 1 vbsrl.v vr3, vr3, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_v_8w_loop addi.d a2, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 8 addi.d t8, t8, 8 addi.d a5, t5, 0 addi.w a4, a4, -8 bnez a4, .l_\lable\()put_v_8w_loop0 b .l_\lable\()end_put_8tap .l_\lable\()put_hv: ld.d t5, sp, 0 //filter_type 
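// Filter selection for the hv path, done just below: each filter set in
// dav1d_mc_subpel_filters is 15 fractional positions x 8 int8 taps = 120 bytes,
// so the horizontal filter address is table + idx_h * 120 + (mx - 1) * 8 with
// idx_h = (w > 4) ? (filter_type & 3) : 3 + (filter_type & 1), and the vertical
// filter is derived the same way from filter_type >> 2, h and my. The
// horizontal pass keeps 4 extra bits of precision (vsrari.h ..., 2); the
// vertical pass removes them with the final rounding shift by 10
// (vssrarni.hu.w ..., 10).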
andi t1, t5, 3 blt t0, a4, .l_\lable\()put_hv_idx_fh andi t1, t5, 1 addi.w t1, t1, 3 .l_\lable\()put_hv_idx_fh: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset vldrepl.d vr8, t1, 0 ld.d t1, sp, 0 //filter_type srli.w t1, t1, 2 blt t0, a5, .l_\lable\()put_hv_idx_fv andi t1, t1, 1 addi.w t1, t1, 3 .l_\lable\()put_hv_idx_fv: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a7, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fv's offset vldrepl.d vr9, t1, 0 vexth.h.b vr9, vr9 sub.d a2, a2, t3 addi.d a2, a2, -3 clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_hv_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_hv_jtable: .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable .l_\lable\()put_hv_2w: vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t2 add.d a2, a2, t3 vld vr3, a2, 0 vldx vr4, a2, a3 vldx vr5, a2, t2 vldx vr6, a2, t3 add.d a2, a2, t4 vbsrl.v vr10, vr0, 1 vbsrl.v vr11, vr1, 1 vbsrl.v vr12, vr2, 1 vbsrl.v vr13, vr3, 1 vbsrl.v vr14, vr4, 1 vbsrl.v vr15, vr5, 1 vbsrl.v vr16, vr6, 1 vilvl.d vr0, vr10, vr0 vilvl.d vr1, vr11, vr1 vilvl.d vr2, vr12, vr2 vilvl.d vr3, vr13, vr3 vilvl.d vr4, vr14, vr4 vilvl.d vr5, vr15, vr5 vilvl.d vr6, vr16, vr6 vdp2.h.bu.b vr10, vr0, vr8 vdp2.h.bu.b vr11, vr1, vr8 vdp2.h.bu.b vr12, vr2, vr8 vdp2.h.bu.b vr13, vr3, vr8 vdp2.h.bu.b vr14, vr4, vr8 vdp2.h.bu.b vr15, vr5, vr8 vdp2.h.bu.b vr16, vr6, vr8 vhaddw.d.h vr10 vhaddw.d.h vr11 vhaddw.d.h vr12 vhaddw.d.h vr13 vhaddw.d.h vr14 vhaddw.d.h vr15 vhaddw.d.h vr16 vpackev.w vr10, vr11, vr10 vpackev.w vr12, vr13, vr12 vpackod.d vr11, vr12, vr10 vpackev.d vr10, vr12, vr10 vpackev.w vr12, vr15, vr14 vpackev.w vr16, vr17, vr16 vpackod.d vr13, vr16, vr12 vpackev.d vr12, vr16, vr12 vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0) vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1) vsrari.h vr10, vr10, 2 vsrari.h vr11, vr11, 2 .l_\lable\()put_hv_2w_loop: vld vr7, a2, 0 vldx vr12, a2, a3 add.d a2, a2, t2 vbsrl.v vr1, vr7, 1 vbsrl.v vr2, vr12, 1 vilvl.d vr0, vr1, vr7 vilvl.d vr1, vr2, vr12 vdp2.h.bu.b vr2, vr0, vr8 vdp2.h.bu.b vr3, vr1, vr8 vhaddw.d.h vr2 vhaddw.d.h vr3 vpickev.w vr2, vr3, vr2 vpickev.h vr2, vr2, vr2 vsrari.h vr2, vr2, 2 vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7 vextrins.h vr11, vr2, 0x71 vbsrl.v vr12, vr10, 2 vbsrl.v vr13, vr11, 2 vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8 vextrins.h vr13, vr2, 0x73 vdp2.w.h vr0, vr10, vr9 vdp2.w.h vr1, vr11, vr9 vdp2.w.h vr2, vr12, vr9 vdp2.w.h vr3, vr13, vr9 vhaddw.q.w vr0 vhaddw.q.w vr1 vhaddw.q.w vr2 vhaddw.q.w vr3 vpackev.w vr0, vr1, vr0 vpackev.w vr1, vr3, vr2 vpackev.d vr0, vr1, vr0 vssrarni.hu.w vr0, vr0, 10 vssrani.bu.h vr0, vr0, 0 vbsrl.v vr10, vr12, 2 vbsrl.v vr11, vr13, 2 vstelm.h vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr0, a0, 0, 1 add.d a0, a0, a1 addi.d a5, a5, -2 bnez a5, .l_\lable\()put_hv_2w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_hv_4w: vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t2 add.d a2, a2, t3 vld vr3, a2, 0 vldx vr4, a2, a3 vldx vr5, a2, t2 vldx vr6, a2, t3 add.d a2, a2, t4 FILTER_8TAP_4W vr0 //x0 x1 x2 x3 FILTER_8TAP_4W vr1 
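// put_hv_4w setup (continued): horizontally filter the remaining seed rows
// vr2..vr6. Together with vr0/vr1 above they form the 7-row history required
// by the 8-tap vertical filter; the loop below then produces two new
// horizontally filtered rows per iteration and shifts them into this window.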
FILTER_8TAP_4W vr2 FILTER_8TAP_4W vr3 FILTER_8TAP_4W vr4 FILTER_8TAP_4W vr5 FILTER_8TAP_4W vr6 vpackev.h vr0, vr1, vr0 vpackev.h vr1, vr3, vr2 vpackev.h vr2, vr5, vr4 vpackev.h vr3, vr7, vr6 vilvl.w vr4, vr1, vr0 vilvh.w vr5, vr1, vr0 vilvl.w vr6, vr3, vr2 vilvh.w vr7, vr3, vr2 vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 * vilvh.d vr1, vr6, vr4 vilvl.d vr2, vr7, vr5 vilvh.d vr3, vr7, vr5 vsrari.h vr0, vr0, 2 vsrari.h vr1, vr1, 2 vsrari.h vr2, vr2, 2 vsrari.h vr3, vr3, 2 .l_\lable\()put_hv_4w_loop: vld vr4, a2, 0 vldx vr5, a2, a3 add.d a2, a2, t2 FILTER_8TAP_4W vr4 FILTER_8TAP_4W vr5 vpickev.h vr4, vr5, vr4 vsrari.h vr4, vr4, 2 vextrins.h vr0, vr4, 0x70 vextrins.h vr1, vr4, 0x71 vextrins.h vr2, vr4, 0x72 vextrins.h vr3, vr4, 0x73 vbsrl.v vr5, vr0, 2 vbsrl.v vr6, vr1, 2 vbsrl.v vr7, vr2, 2 vbsrl.v vr10, vr3, 2 vextrins.h vr5, vr4, 0x74 vextrins.h vr6, vr4, 0x75 vextrins.h vr7, vr4, 0x76 vextrins.h vr10, vr4, 0x77 vdp2.w.h vr11, vr0, vr9 vdp2.w.h vr12, vr1, vr9 vdp2.w.h vr13, vr2, vr9 vdp2.w.h vr14, vr3, vr9 vhaddw.q.w vr11 vhaddw.q.w vr12 vhaddw.q.w vr13 vhaddw.q.w vr14 vpackev.w vr0, vr12, vr11 vpackev.w vr1, vr14, vr13 vpackev.d vr0, vr1, vr0 vdp2.w.h vr11, vr5, vr9 vdp2.w.h vr12, vr6, vr9 vdp2.w.h vr13, vr7, vr9 vdp2.w.h vr14, vr10, vr9 vhaddw.q.w vr11 vhaddw.q.w vr12 vhaddw.q.w vr13 vhaddw.q.w vr14 vpackev.w vr1, vr12, vr11 vpackev.w vr2, vr14, vr13 vpackev.d vr1, vr2, vr1 vssrarni.hu.w vr1, vr0, 10 vssrani.bu.h vr1, vr1, 0 vstelm.w vr1, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr1, a0, 0, 1 add.d a0, a0, a1 vbsrl.v vr0, vr5, 2 vbsrl.v vr1, vr6, 2 vbsrl.v vr2, vr7, 2 vbsrl.v vr3, vr10, 2 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv_4w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_hv_8w: .l_\lable\()put_hv_16w: .l_\lable\()put_hv_32w: .l_\lable\()put_hv_64w: .l_\lable\()put_hv_128w: addi.d t0, a2, 0 //src addi.d t5, a5, 0 //h addi.d t8, a0, 0 //dst .l_\lable\()put_hv_8w_loop0: vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t2 add.d a2, a2, t3 vld vr3, a2, 0 vldx vr4, a2, a3 vldx vr5, a2, t2 vldx vr6, a2, t3 add.d a2, a2, t4 FILTER_8TAP_8W vr0 FILTER_8TAP_8W vr1 FILTER_8TAP_8W vr2 FILTER_8TAP_8W vr3 FILTER_8TAP_8W vr4 FILTER_8TAP_8W vr5 FILTER_8TAP_8W vr6 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17 .l_\lable\()put_hv_8w_loop: vld vr20, a2, 0 vldx vr21, a2, a3 add.d a2, a2, t2 FILTER_8TAP_8W vr20 FILTER_8TAP_8W vr21 VEXTRINS_Hx8 vr20 FILTER_8TAP_8W_CLIP_STORE VBSRL_Vx8 VEXTRINS_Hx8 vr21 FILTER_8TAP_8W_CLIP_STORE VBSRL_Vx8 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv_8w_loop addi.d a2, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 8 addi.d t8, t8, 8 addi.d a5, t5, 0 addi.w a4, a4, -8 bnez a4, .l_\lable\()put_hv_8w_loop0 .l_\lable\()end_put_8tap: .endm function put_8tap_regular_8bpc_lsx addi.d sp, sp, -16 st.d zero, sp, 0 PUT_8TAP_8BPC_LSX 0 addi.d sp, sp, 16 endfunc function put_8tap_smooth_regular_8bpc_lsx addi.d sp, sp, -16 li.w t0, 1 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 1 addi.d sp, sp, 16 endfunc function put_8tap_sharp_regular_8bpc_lsx addi.d sp, sp, -16 li.w t0, 2 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 2 addi.d sp, sp, 16 endfunc function put_8tap_regular_smooth_8bpc_lsx addi.d sp, sp, -16 li.w t0, 4 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 4 addi.d sp, sp, 16 endfunc function put_8tap_smooth_8bpc_lsx addi.d sp, sp, -16 li.w t0, 5 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 5 addi.d sp, sp, 16 endfunc function put_8tap_sharp_smooth_8bpc_lsx addi.d sp, sp, -16 li.w t0, 6 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 6 addi.d sp, sp, 16 endfunc 
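// The put_8tap_* entry points pass their 2D filter code to PUT_8TAP_8BPC_LSX
// through the stack slot read back with "ld.d t5, sp, 0": the code is
// horizontal_filter | (vertical_filter << 2) with regular = 0, smooth = 1 and
// sharp = 2, hence the constants 0, 1, 2, 4, 5, 6, 8, 9 and 10 used here.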
function put_8tap_regular_sharp_8bpc_lsx addi.d sp, sp, -16 li.w t0, 8 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 8 addi.d sp, sp, 16 endfunc function put_8tap_smooth_sharp_8bpc_lsx addi.d sp, sp, -16 li.w t0, 9 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 9 addi.d sp, sp, 16 endfunc function put_8tap_sharp_8bpc_lsx addi.d sp, sp, -16 li.w t0, 10 st.d t0, sp, 0 PUT_8TAP_8BPC_LSX 10 addi.d sp, sp, 16 endfunc const shufb1 .byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8 endconst .macro SHUFB in0, in1, tmp, out xvbsrl.v \tmp, \in0, 2 xvpermi.q \tmp, \in0, 0x20 xvshuf.b \out, \tmp, \tmp, \in1 .endm .macro HADDWDH in0 xvhaddw.w.h \in0, \in0, \in0 xvhaddw.d.w \in0, \in0, \in0 .endm .macro HADDWQW in0 xvhaddw.d.w \in0, \in0, \in0 xvhaddw.q.d \in0, \in0, \in0 .endm .macro PREP_W16_H in0 xvbsrl.v xr4, \in0, 4 xvbsrl.v xr5, \in0, 8 xvpermi.q xr9, \in0, 0x31 xvpackev.d xr5, xr9, xr5 xvbsrl.v xr6, xr5, 4 SHUFB \in0, xr23, xr9, \in0 SHUFB xr4, xr23, xr9, xr4 SHUFB xr5, xr23, xr9, xr5 SHUFB xr6, xr23, xr9, xr6 xvdp2.h.bu.b xr10, \in0, xr22 xvdp2.h.bu.b xr11, xr4, xr22 xvdp2.h.bu.b xr12, xr5, xr22 xvdp2.h.bu.b xr13, xr6, xr22 HADDWDH xr10 HADDWDH xr11 HADDWDH xr12 HADDWDH xr13 xvpickev.w xr10, xr11, xr10 xvpickev.w xr11, xr13, xr12 xvpermi.d xr10, xr10, 0xd8 xvpermi.d xr11, xr11, 0xd8 xvpickev.h xr10, xr11, xr10 xvpermi.d xr10, xr10, 0xd8 xvsrari.h \in0, xr10, 2 .endm .macro PREP_8TAP_8BPC_LASX lable li.w t0, 4 la.local t6, dav1d_mc_subpel_filters la.local t7, shufb1 xvld xr23, t7, 0 slli.d t2, a2, 1 //src_stride*2 add.d t3, t2, a2 //src_stride*3 slli.d t4, t2, 1 bnez a5, .l_\lable\()h //mx bnez a6, .l_\lable\()v clz.w t1, a3 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()prep_hv0_jtable alsl.d t1, t1, t5, 1 ld.h t8, t1, 0 add.d t5, t5, t8 jirl $r0, t5, 0 .align 3 .l_\lable\()prep_hv0_jtable: .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable .l_\lable\()hv0_4w: fld.s f0, a1, 0 fldx.s f1, a1, a2 fldx.s f2, a1, t2 fldx.s f3, a1, t3 add.d a1, a1, t4 xvpackev.w xr0, xr1, xr0 xvpackev.w xr1, xr3, xr2 xvpermi.q xr0, xr1, 0x02 xvsllwil.hu.bu xr0, xr0, 4 xvst xr0, a0, 0 addi.d a0, a0, 32 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_4w b .l_\lable\()end_pre_8tap .l_\lable\()hv0_8w: fld.d f0, a1, 0 fldx.d f1, a1, a2 fldx.d f2, a1, t2 fldx.d f3, a1, t3 add.d a1, a1, t4 xvpermi.q xr0, xr1, 0x02 xvpermi.q xr2, xr3, 0x02 xvsllwil.hu.bu xr0, xr0, 4 xvsllwil.hu.bu xr2, xr2, 4 xvst xr0, a0, 0 xvst xr2, a0, 32 addi.d a0, a0, 64 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_8w b .l_\lable\()end_pre_8tap .l_\lable\()hv0_16w: vld vr0, a1, 0 vldx vr1, a1, a2 vldx vr2, a1, t2 vldx vr3, a1, t3 add.d a1, a1, t4 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 xvslli.h xr0, xr0, 4 xvslli.h xr1, xr1, 4 xvslli.h xr2, xr2, 4 xvslli.h xr3, xr3, 4 xvst xr0, a0, 0 xvst xr1, a0, 32 xvst xr2, a0, 64 xvst xr3, a0, 96 addi.d a0, a0, 128 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_16w b .l_\lable\()end_pre_8tap .l_\lable\()hv0_32w: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 xvpermi.d xr4, xr0, 0xD8 xvpermi.d xr5, xr1, 0xD8 xvpermi.d xr6, xr2, 0xD8 xvpermi.d xr7, xr3, 0xD8 xvpermi.d xr10, xr0, 0x32 xvpermi.d xr11, xr1, 0x32 xvpermi.d xr12, xr2, 0x32 xvpermi.d xr13, xr3, 
0x32 xvsllwil.hu.bu xr0, xr4, 4 xvsllwil.hu.bu xr1, xr5, 4 xvsllwil.hu.bu xr2, xr6, 4 xvsllwil.hu.bu xr3, xr7, 4 xvsllwil.hu.bu xr4, xr10, 4 xvsllwil.hu.bu xr5, xr11, 4 xvsllwil.hu.bu xr6, xr12, 4 xvsllwil.hu.bu xr7, xr13, 4 xvst xr0, a0, 0 xvst xr4, a0, 32 xvst xr1, a0, 64 xvst xr5, a0, 96 xvst xr2, a0, 128 xvst xr6, a0, 160 xvst xr3, a0, 192 xvst xr7, a0, 224 addi.d a0, a0, 256 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_32w b .l_\lable\()end_pre_8tap .l_\lable\()hv0_64w: .l_\lable\()hv0_128w: addi.d t0, a1, 0 addi.d t5, a4, 0 srli.w t7, a3, 5 slli.w t7, t7, 6 addi.d t8, a0, 0 .l_\lable\()hv0_32_loop: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 xvpermi.d xr4, xr0, 0xD8 xvpermi.d xr5, xr1, 0xD8 xvpermi.d xr6, xr2, 0xD8 xvpermi.d xr7, xr3, 0xD8 xvpermi.d xr10, xr0, 0x32 xvpermi.d xr11, xr1, 0x32 xvpermi.d xr12, xr2, 0x32 xvpermi.d xr13, xr3, 0x32 xvsllwil.hu.bu xr0, xr4, 4 xvsllwil.hu.bu xr1, xr5, 4 xvsllwil.hu.bu xr2, xr6, 4 xvsllwil.hu.bu xr3, xr7, 4 xvsllwil.hu.bu xr4, xr10, 4 xvsllwil.hu.bu xr5, xr11, 4 xvsllwil.hu.bu xr6, xr12, 4 xvsllwil.hu.bu xr7, xr13, 4 xvst xr0, a0, 0 xvst xr4, a0, 32 add.d t1, a0, t7 xvst xr1, t1, 0 xvst xr5, t1, 32 add.d t1, t1, t7 xvst xr2, t1, 0 xvst xr6, t1, 32 add.d t1, t1, t7 xvst xr3, t1, 0 xvst xr7, t1, 32 add.d a0, t1, t7 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_32_loop addi.d a1, t0, 32 addi.d t0, t0, 32 addi.d a0, t8, 64 addi.d t8, t8, 64 addi.d a4, t5, 0 addi.d a3, a3, -32 bnez a3, .l_\lable\()hv0_32_loop b .l_\lable\()end_pre_8tap .l_\lable\()h: bnez a6, .l_\lable\()hv //if(fh) && if (fv) andi t1, a7, 3 blt t0, a3, .l_\lable\()h_idx_fh andi t1, a7, 1 addi.w t1, t1, 3 .l_\lable\()h_idx_fh: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a5, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset xvldrepl.d xr22, t1, 0 addi.d a1, a1, -3 clz.w t1, a3 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()prep_h_jtable alsl.d t1, t1, t5, 1 ld.h t8, t1, 0 add.d t5, t5, t8 jirl $r0, t5, 0 .align 3 .l_\lable\()prep_h_jtable: .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable .l_\lable\()h_4w: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 SHUFB xr0, xr23, xr9, xr0 SHUFB xr1, xr23, xr9, xr1 SHUFB xr2, xr23, xr9, xr2 SHUFB xr3, xr23, xr9, xr3 xvdp2.h.bu.b xr10, xr0, xr22 xvdp2.h.bu.b xr12, xr1, xr22 xvdp2.h.bu.b xr14, xr2, xr22 xvdp2.h.bu.b xr16, xr3, xr22 HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 HADDWDH xr14 //h2 HADDWDH xr16 //h3 xvpickev.w xr10, xr12, xr10 xvpickev.w xr14, xr16, xr14 xvpermi.d xr10, xr10, 0xd8 xvpermi.d xr14, xr14, 0xd8 xvpickev.h xr10, xr14, xr10 xvpermi.d xr10, xr10, 0xd8 xvsrari.h xr10, xr10, 2 xvst xr10, a0, 0 addi.d a0, a0, 32 addi.w a4, a4, -4 bnez a4, .l_\lable\()h_4w b .l_\lable\()end_pre_8tap .l_\lable\()h_8w: xvld xr0, a1, 0 xvldx xr2, a1, a2 xvldx xr4, a1, t2 xvldx xr6, a1, t3 add.d a1, a1, t4 xvbsrl.v xr1, xr0, 4 xvbsrl.v xr3, xr2, 4 xvbsrl.v xr5, xr4, 4 xvbsrl.v xr7, xr6, 4 SHUFB xr0, xr23, xr9, xr10 SHUFB xr1, xr23, xr9, xr11 SHUFB xr2, xr23, xr9, xr12 SHUFB xr3, xr23, xr9, xr13 SHUFB xr4, xr23, xr9, xr14 SHUFB xr5, xr23, xr9, xr15 SHUFB xr6, xr23, xr9, xr16 SHUFB xr7, xr23, xr9, xr17 xvdp2.h.bu.b xr0, xr10, xr22 xvdp2.h.bu.b xr1, 
xr11, xr22 xvdp2.h.bu.b xr2, xr12, xr22 xvdp2.h.bu.b xr3, xr13, xr22 xvdp2.h.bu.b xr4, xr14, xr22 xvdp2.h.bu.b xr5, xr15, xr22 xvdp2.h.bu.b xr6, xr16, xr22 xvdp2.h.bu.b xr7, xr17, xr22 HADDWDH xr0 HADDWDH xr1 HADDWDH xr2 HADDWDH xr3 HADDWDH xr4 HADDWDH xr5 HADDWDH xr6 HADDWDH xr7 xvpickev.w xr0, xr1, xr0 xvpickev.w xr2, xr3, xr2 xvpermi.d xr0, xr0, 0xd8 xvpermi.d xr2, xr2, 0xd8 xvpickev.h xr0, xr2, xr0 xvpermi.d xr0, xr0, 0xd8 xvsrari.h xr0, xr0, 2 xvpickev.w xr4, xr5, xr4 xvpickev.w xr6, xr7, xr6 xvpermi.d xr4, xr4, 0xd8 xvpermi.d xr6, xr6, 0xd8 xvpickev.h xr4, xr6, xr4 xvpermi.d xr4, xr4, 0xd8 xvsrari.h xr4, xr4, 2 xvst xr0, a0, 0 xvst xr4, a0, 32 addi.d a0, a0, 64 addi.d a4, a4, -4 bnez a4, .l_\lable\()h_8w b .l_\lable\()end_pre_8tap .l_\lable\()h_16w: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 PREP_W16_H xr0 PREP_W16_H xr1 PREP_W16_H xr2 PREP_W16_H xr3 xvst xr0, a0, 0 xvst xr1, a0, 32 xvst xr2, a0, 64 xvst xr3, a0, 96 addi.d a0, a0, 128 addi.w a4, a4, -4 bnez a4, .l_\lable\()h_16w b .l_\lable\()end_pre_8tap .l_\lable\()h_32w: .l_\lable\()h_64w: .l_\lable\()h_128w: addi.d t0, a1, 0 //src addi.d t5, a4, 0 //h srli.w t7, a3, 4 //w slli.w t7, t7, 5 //store offset addi.d t8, a0, 0 //dst .l_\lable\()h_16_loop: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 PREP_W16_H xr0 PREP_W16_H xr1 PREP_W16_H xr2 PREP_W16_H xr3 xvst xr0, a0, 0 xvstx xr1, a0, t7 slli.w t1, t7, 1 xvstx xr2, a0, t1 add.w t1, t1, t7 xvstx xr3, a0, t1 slli.w t1, t7, 2 add.d a0, a0, t1 addi.d a4, a4, -4 bnez a4, .l_\lable\()h_16_loop addi.d a1, t0, 16 addi.d t0, t0, 16 addi.d a0, t8, 32 addi.d t8, t8, 32 addi.d a4, t5, 0 addi.d a3, a3, -16 bnez a3, .l_\lable\()h_16_loop b .l_\lable\()end_pre_8tap .l_\lable\()hv: andi t1, a7, 3 blt t0, a3, .l_\lable\()hv_idx_fh andi t1, a7, 1 addi.w t1, t1, 3 .l_\lable\()hv_idx_fh: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a5, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset xvldrepl.d xr22, t1, 0 srli.w a7, a7, 2 blt t0, a4, .l_\lable\()hv_idx_fv andi a7, a7, 1 addi.w a7, a7, 3 .l_\lable\()hv_idx_fv: addi.w t5, zero, 120 mul.w a7, a7, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w a7, a7, t5 add.d a7, t6, a7 //fv's offset xvldrepl.d xr8, a7, 0 xvsllwil.h.b xr8, xr8, 0 sub.d a1, a1, t3 addi.d a1, a1, -3 beq a3, t0, .l_\lable\()hv_4w b .l_\lable\()hv_8w .l_\lable\()hv_4w: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 xvld xr4, a1, 0 xvldx xr5, a1, a2 xvldx xr6, a1, t2 SHUFB xr0, xr23, xr9, xr0 SHUFB xr1, xr23, xr9, xr1 SHUFB xr2, xr23, xr9, xr2 SHUFB xr3, xr23, xr9, xr3 SHUFB xr4, xr23, xr9, xr4 SHUFB xr5, xr23, xr9, xr5 SHUFB xr6, xr23, xr9, xr6 xvdp2.h.bu.b xr10, xr0, xr22 xvdp2.h.bu.b xr11, xr1, xr22 xvdp2.h.bu.b xr12, xr2, xr22 xvdp2.h.bu.b xr13, xr3, xr22 xvdp2.h.bu.b xr14, xr4, xr22 xvdp2.h.bu.b xr15, xr5, xr22 xvdp2.h.bu.b xr16, xr6, xr22 HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 HADDWDH xr11 //h1 mid4 mid5 mid6 mid7 HADDWDH xr12 //h2 HADDWDH xr13 //h3 xvpackev.w xr10, xr11, xr10 xvpackev.w xr12, xr13, xr12 xvpackev.d xr11, xr12, xr10 xvpackod.d xr10, xr12, xr10 xvpickev.h xr11, xr10, xr11 xvsrari.h xr11, xr11, 2 HADDWDH xr14 //h4 HADDWDH xr15 //h5 HADDWDH xr16 //h6 xvpackev.w xr14, xr15, xr14 xvpackev.w xr16, xr17, xr16 xvpackev.d xr17, xr16, xr14 xvpackod.d xr14, xr16, xr14 xvpickev.h xr13, xr14, xr17 xvsrari.h xr13, xr13, 2 xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 * xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 
7 11 15 19 23 27 * .l_\lable\()hv_w4_loop: xvldx xr0, a1, t3 add.d a1, a1, t4 xvld xr1, a1, 0 xvldx xr2, a1, a2 xvldx xr3, a1, t2 SHUFB xr0, xr23, xr9, xr0 SHUFB xr1, xr23, xr9, xr1 SHUFB xr2, xr23, xr9, xr2 SHUFB xr3, xr23, xr9, xr3 xvdp2.h.bu.b xr10, xr0, xr22 xvdp2.h.bu.b xr12, xr1, xr22 xvdp2.h.bu.b xr14, xr2, xr22 xvdp2.h.bu.b xr16, xr3, xr22 HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 HADDWDH xr14 //h2 HADDWDH xr16 //h3 xvpackev.w xr10, xr12, xr10 xvpackev.w xr14, xr16, xr14 xvpackev.d xr12, xr14, xr10 xvpackod.d xr10, xr14, xr10 xvpickev.h xr12, xr10, xr12 xvsrari.h xr12, xr12, 2 xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2) xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3) xvdp2.w.h xr0, xr18, xr8 xvdp2.w.h xr2, xr19, xr8 HADDWQW xr0 HADDWQW xr2 xvpackev.w xr0, xr2, xr0 xvbsrl.v xr18, xr18, 2 xvbsrl.v xr19, xr19, 2 xvextrins.h xr18, xr12, 0x71 xvextrins.h xr19, xr12, 0x75 xvdp2.w.h xr2, xr18, xr8 xvdp2.w.h xr4, xr19, xr8 HADDWQW xr2 HADDWQW xr4 xvpackev.w xr2, xr4, xr2 xvbsrl.v xr18, xr18, 2 xvbsrl.v xr19, xr19, 2 xvextrins.h xr18, xr12, 0x72 xvextrins.h xr19, xr12, 0x76 xvdp2.w.h xr4, xr18, xr8 xvdp2.w.h xr9, xr19, xr8 HADDWQW xr4 HADDWQW xr9 xvpackev.w xr4, xr9, xr4 xvbsrl.v xr18, xr18, 2 xvbsrl.v xr19, xr19, 2 xvextrins.h xr18, xr12, 0x73 xvextrins.h xr19, xr12, 0x77 xvdp2.w.h xr9, xr18, xr8 xvdp2.w.h xr11, xr19, xr8 HADDWQW xr9 HADDWQW xr11 xvpackev.w xr9, xr11, xr9 xvpackev.d xr0, xr2, xr0 xvpackev.d xr4, xr9, xr4 xvsrari.w xr0, xr0, 6 xvsrari.w xr4, xr4, 6 xvpermi.d xr0, xr0, 0xd8 xvpermi.d xr4, xr4, 0xd8 xvpickev.h xr0, xr4, xr0 xvpermi.d xr0, xr0, 0xd8 xvst xr0, a0, 0 addi.d a0, a0, 32 xvbsrl.v xr18, xr18, 2 xvbsrl.v xr19, xr19, 2 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv_w4_loop b .l_\lable\()end_pre_8tap .l_\lable\()hv_8w: addi.d t0, a1, 0 addi.d t5, a4, 0 srli.w t7, a3, 3 slli.w t7, t7, 4 // store offset addi.d t8, a0, 0 .l_\lable\()hv_8w_loop0: xvld xr0, a1, 0 xvldx xr2, a1, a2 xvldx xr4, a1, t2 xvldx xr6, a1, t3 add.d a1, a1, t4 xvld xr10, a1, 0 xvldx xr11, a1, a2 xvldx xr12, a1, t2 xvbsrl.v xr1, xr0, 4 xvbsrl.v xr3, xr2, 4 xvbsrl.v xr5, xr4, 4 xvbsrl.v xr7, xr6, 4 SHUFB xr0, xr23, xr9, xr13 SHUFB xr1, xr23, xr9, xr14 SHUFB xr2, xr23, xr9, xr15 SHUFB xr3, xr23, xr9, xr16 SHUFB xr4, xr23, xr9, xr17 SHUFB xr5, xr23, xr9, xr18 SHUFB xr6, xr23, xr9, xr19 SHUFB xr7, xr23, xr9, xr20 xvdp2.h.bu.b xr0, xr13, xr22 xvdp2.h.bu.b xr1, xr14, xr22 xvdp2.h.bu.b xr2, xr15, xr22 xvdp2.h.bu.b xr3, xr16, xr22 xvdp2.h.bu.b xr4, xr17, xr22 xvdp2.h.bu.b xr5, xr18, xr22 xvdp2.h.bu.b xr6, xr19, xr22 xvdp2.h.bu.b xr7, xr20, xr22 HADDWDH xr0 HADDWDH xr1 HADDWDH xr2 HADDWDH xr3 HADDWDH xr4 HADDWDH xr5 HADDWDH xr6 HADDWDH xr7 xvpackev.w xr0, xr2, xr0 xvpackev.w xr2, xr6, xr4 xvpackev.d xr16, xr2, xr0 xvpackod.d xr0, xr2, xr0 xvpickev.h xr0, xr0, xr16 xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 xvpackev.w xr1, xr3, xr1 xvpackev.w xr3, xr7, xr5 xvpackev.d xr16, xr3, xr1 xvpackod.d xr1, xr3, xr1 xvpickev.h xr1, xr1, xr16 xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 xvbsrl.v xr13, xr10, 4 xvbsrl.v xr14, xr11, 4 xvbsrl.v xr15, xr12, 4 SHUFB xr10, xr23, xr9, xr10 SHUFB xr13, xr23, xr9, xr13 SHUFB xr11, xr23, xr9, xr11 SHUFB xr14, xr23, xr9, xr14 SHUFB xr12, xr23, xr9, xr12 SHUFB xr15, xr23, xr9, xr15 xvdp2.h.bu.b xr4, xr10, xr22 xvdp2.h.bu.b xr5, xr13, xr22 xvdp2.h.bu.b xr6, xr11, xr22 xvdp2.h.bu.b xr7, xr14, xr22 xvdp2.h.bu.b xr9, xr12, xr22 xvdp2.h.bu.b xr10, 
xr15, xr22 HADDWDH xr4 HADDWDH xr5 HADDWDH xr6 HADDWDH xr7 HADDWDH xr9 HADDWDH xr10 xvpackev.w xr4, xr6, xr4 xvpackev.w xr9, xr12, xr9 xvpackev.d xr16, xr9, xr4 xvpackod.d xr11, xr9, xr4 xvpickev.h xr2, xr11, xr16 xvsrari.h xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 * xvpackev.w xr5, xr7, xr5 xvpackev.w xr10, xr12, xr10 xvpackev.d xr16, xr10, xr5 xvpackod.d xr11, xr10, xr5 xvpickev.h xr3, xr11, xr16 xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 56 * xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 * xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 * xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 * xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 * .l_\lable\()hv_8w_loop: xvldx xr0, a1, t3 add.d a1, a1, t4 xvld xr2, a1, 0 xvldx xr4, a1, a2 xvldx xr6, a1, t2 xvbsrl.v xr1, xr0, 4 xvbsrl.v xr3, xr2, 4 xvbsrl.v xr5, xr4, 4 xvbsrl.v xr7, xr6, 4 SHUFB xr0, xr23, xr9, xr0 SHUFB xr1, xr23, xr9, xr1 SHUFB xr2, xr23, xr9, xr2 SHUFB xr3, xr23, xr9, xr3 SHUFB xr4, xr23, xr9, xr4 SHUFB xr5, xr23, xr9, xr5 SHUFB xr6, xr23, xr9, xr6 SHUFB xr7, xr23, xr9, xr7 xvdp2.h.bu.b xr10, xr0, xr22 xvdp2.h.bu.b xr11, xr1, xr22 xvdp2.h.bu.b xr12, xr2, xr22 xvdp2.h.bu.b xr13, xr3, xr22 xvdp2.h.bu.b xr14, xr4, xr22 xvdp2.h.bu.b xr15, xr5, xr22 xvdp2.h.bu.b xr16, xr6, xr22 xvdp2.h.bu.b xr17, xr7, xr22 HADDWDH xr10 HADDWDH xr11 HADDWDH xr12 HADDWDH xr13 HADDWDH xr14 HADDWDH xr15 HADDWDH xr16 HADDWDH xr17 xvpackev.w xr0, xr12, xr10 xvpackev.w xr2, xr16, xr14 xvpackev.d xr9, xr2, xr0 xvpackod.d xr0, xr2, xr0 xvpickev.h xr0, xr0, xr9 xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83 xvpackev.w xr1, xr13, xr11 xvpackev.w xr3, xr17, xr15 xvpackev.d xr9, xr3, xr1 xvpackod.d xr1, xr3, xr1 xvpickev.h xr1, xr1, xr9 xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87 xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58) xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59) xvextrins.h xr20, xr1, 0x70 xvextrins.h xr21, xr1, 0x74 //h - 1 xvdp2.w.h xr10, xr18, xr8 xvdp2.w.h xr11, xr19, xr8 xvdp2.w.h xr12, xr20, xr8 xvdp2.w.h xr13, xr21, xr8 HADDWQW xr10 HADDWQW xr11 HADDWQW xr12 HADDWQW xr13 xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * * xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * * xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7 //h - 2 xvbsrl.v xr4, xr18, 2 xvbsrl.v xr5, xr19, 2 xvbsrl.v xr6, xr20, 2 xvbsrl.v xr7, xr21, 2 xvextrins.h xr4, xr0, 0x71 xvextrins.h xr5, xr0, 0x75 xvextrins.h xr6, xr1, 0x71 xvextrins.h xr7, xr1, 0x75 xvdp2.w.h xr10, xr4, xr8 xvdp2.w.h xr11, xr5, xr8 xvdp2.w.h xr12, xr6, xr8 xvdp2.w.h xr13, xr7, xr8 HADDWQW xr10 HADDWQW xr11 HADDWQW xr12 HADDWQW xr13 xvpackev.w xr14, xr11, xr10 xvpackev.w xr15, xr13, xr12 xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15 //h - 3 xvbsrl.v xr4, xr4, 2 xvbsrl.v xr5, xr5, 2 xvbsrl.v xr6, xr6, 2 xvbsrl.v xr7, xr7, 2 xvextrins.h xr4, xr0, 0x72 xvextrins.h xr5, xr0, 0x76 xvextrins.h xr6, xr1, 0x72 xvextrins.h xr7, xr1, 0x76 xvdp2.w.h xr10, xr4, xr8 xvdp2.w.h xr11, xr5, xr8 xvdp2.w.h xr12, xr6, xr8 xvdp2.w.h xr13, xr7, xr8 HADDWQW xr10 HADDWQW xr11 HADDWQW xr12 HADDWQW xr13 xvpackev.w xr15, xr11, xr10 xvpackev.w xr16, xr13, xr12 xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23 //h - 4 xvbsrl.v xr4, xr4, 2 xvbsrl.v xr5, xr5, 2 xvbsrl.v xr6, xr6, 2 xvbsrl.v xr7, xr7, 2 xvextrins.h xr4, xr0, 0x73 xvextrins.h xr5, xr0, 0x77 xvextrins.h xr6, xr1, 0x73 xvextrins.h xr7, xr1, 
0x77 xvdp2.w.h xr10, xr4, xr8 xvdp2.w.h xr11, xr5, xr8 xvdp2.w.h xr12, xr6, xr8 xvdp2.w.h xr13, xr7, xr8 HADDWQW xr10 HADDWQW xr11 HADDWQW xr12 HADDWQW xr13 xvpackev.w xr16, xr11, xr10 xvpackev.w xr17, xr13, xr12 xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31 xvsrari.w xr2, xr2, 6 xvsrari.w xr14, xr14, 6 xvsrari.w xr15, xr15, 6 xvsrari.w xr16, xr16, 6 xvpermi.d xr2, xr2, 0xd8 xvpermi.d xr14, xr14, 0xd8 xvpermi.d xr15, xr15, 0xd8 xvpermi.d xr16, xr16, 0xd8 xvpickev.h xr2, xr14, xr2 xvpickev.h xr3, xr16, xr15 xvpermi.d xr2, xr2, 0xd8 xvpermi.d xr3, xr3, 0xd8 xvpermi.q xr10, xr2, 0x31 xvpermi.q xr11, xr3, 0x31 vst vr2, a0, 0 vstx vr10, a0, t7 //32 slli.w t1, t7, 1 //64 vstx vr3, a0, t1 add.w t1, t1, t7 //96 vstx vr11, a0, t1 slli.w t1, t7, 2 //128 add.d a0, a0, t1 xvbsrl.v xr18, xr4, 2 xvbsrl.v xr19, xr5, 2 xvbsrl.v xr20, xr6, 2 xvbsrl.v xr21, xr7, 2 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv_8w_loop addi.d a1, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 16 addi.d t8, t8, 16 addi.d a4, t5, 0 addi.d a3, a3, -8 bnez a3, .l_\lable\()hv_8w_loop0 b .l_\lable\()end_pre_8tap .l_\lable\()v: srli.w a7, a7, 2 blt t0, a4, .l_\lable\()v_idx_fv andi a7, a7, 1 addi.w a7, a7, 3 .l_\lable\()v_idx_fv: addi.w t5, zero, 120 mul.w a7, a7, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w a7, a7, t5 add.d a7, t6, a7 //fv's offset xvldrepl.d xr8, a7, 0 sub.d a1, a1, t3 beq a3, t0, .l_\lable\()v_4w blt t0, a3, .l_\lable\()v_8w .l_\lable\()v_4w: fld.s f0, a1, 0 fldx.s f1, a1, a2 fldx.s f2, a1, t2 add.d a1, a1, t3 fld.s f3, a1, 0 fldx.s f4, a1, a2 fldx.s f5, a1, t2 fldx.s f6, a1, t3 xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 xvilvl.w xr2, xr1, xr0 xvilvh.w xr0, xr1, xr0 xvpermi.q xr0, xr2, 0x20 .l_\lable\()v_4w_loop: add.d a1, a1, t4 fld.s f7, a1, 0 //h0 fldx.s f10, a1, a2 //h1 fldx.s f11, a1, t2 //h2 fldx.s f12, a1, t3 //h3 xvbsrl.v xr9, xr7, 2 xvpermi.q xr9, xr7, 0x20 xvextrins.b xr0, xr9, 0x70 xvextrins.b xr0, xr9, 0xf1 xvbsrl.v xr1, xr0, 1 xvbsrl.v xr7, xr10, 2 xvpermi.q xr7, xr10, 0x20 xvextrins.b xr1, xr7, 0x70 xvextrins.b xr1, xr7, 0xf1 xvbsrl.v xr2, xr1, 1 xvbsrl.v xr7, xr11, 2 xvpermi.q xr7, xr11, 0x20 xvextrins.b xr2, xr7, 0x70 xvextrins.b xr2, xr7, 0xf1 xvbsrl.v xr3, xr2, 1 xvbsrl.v xr7, xr12, 2 xvpermi.q xr7, xr12, 0x20 xvextrins.b xr3, xr7, 0x70 xvextrins.b xr3, xr7, 0xf1 xvbsrl.v xr4, xr3, 1 xvdp2.h.bu.b xr10, xr0, xr8 xvdp2.h.bu.b xr11, xr1, xr8 xvdp2.h.bu.b xr12, xr2, xr8 xvdp2.h.bu.b xr13, xr3, xr8 HADDWDH xr10 HADDWDH xr11 HADDWDH xr12 HADDWDH xr13 xvpickev.w xr10, xr11, xr10 xvpickev.w xr11, xr13, xr12 xvpermi.d xr10, xr10, 0xd8 xvpermi.d xr11, xr11, 0xd8 xvpickev.h xr10, xr11, xr10 xvpermi.d xr10, xr10, 0xd8 xvsrari.h xr10, xr10, 2 xvaddi.bu xr0, xr4, 0 xvst xr10, a0, 0 addi.d a0, a0, 32 addi.w a4, a4, -4 bnez a4, .l_\lable\()v_4w_loop b .l_\lable\()end_pre_8tap .l_\lable\()v_8w: addi.d t0, a1, 0 addi.d t5, a4, 0 srli.w t7, a3, 2 slli.w t7, t7, 3 addi.d t8, a0, 0 .l_\lable\()v_8w_loop0: fld.s f0, a1, 0 fldx.s f1, a1, a2 fldx.s f2, a1, t2 add.d a1, a1, t3 fld.s f3, a1, 0 fldx.s f4, a1, a2 fldx.s f5, a1, t2 fldx.s f6, a1, t3 xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 xvilvl.h xr0, 
xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 xvilvl.w xr2, xr1, xr0 xvilvh.w xr0, xr1, xr0 xvpermi.q xr0, xr2, 0x20 .l_\lable\()v_8w_loop: add.d a1, a1, t4 fld.s f7, a1, 0 //h0 fldx.s f10, a1, a2 //h1 fldx.s f11, a1, t2 //h2 fldx.s f12, a1, t3 //h3 xvbsrl.v xr9, xr7, 2 xvpermi.q xr9, xr7, 0x20 xvextrins.b xr0, xr9, 0x70 xvextrins.b xr0, xr9, 0xf1 xvbsrl.v xr1, xr0, 1 xvbsrl.v xr7, xr10, 2 xvpermi.q xr7, xr10, 0x20 xvextrins.b xr1, xr7, 0x70 xvextrins.b xr1, xr7, 0xf1 xvbsrl.v xr2, xr1, 1 xvbsrl.v xr7, xr11, 2 xvpermi.q xr7, xr11, 0x20 xvextrins.b xr2, xr7, 0x70 xvextrins.b xr2, xr7, 0xf1 xvbsrl.v xr3, xr2, 1 xvbsrl.v xr7, xr12, 2 xvpermi.q xr7, xr12, 0x20 xvextrins.b xr3, xr7, 0x70 xvextrins.b xr3, xr7, 0xf1 xvbsrl.v xr4, xr3, 1 xvdp2.h.bu.b xr10, xr0, xr8 xvdp2.h.bu.b xr11, xr1, xr8 xvdp2.h.bu.b xr12, xr2, xr8 xvdp2.h.bu.b xr13, xr3, xr8 HADDWDH xr10 HADDWDH xr11 HADDWDH xr12 HADDWDH xr13 xvpickev.w xr10, xr11, xr10 xvpickev.w xr11, xr13, xr12 xvpermi.d xr10, xr10, 0xd8 xvpermi.d xr11, xr11, 0xd8 xvpickev.h xr10, xr11, xr10 xvpermi.d xr10, xr10, 0xd8 xvsrari.h xr10, xr10, 2 xvaddi.bu xr0, xr4, 0 xvstelm.d xr10, a0, 0, 0 add.d a0, a0, t7 xvstelm.d xr10, a0, 0, 1 add.d a0, a0, t7 xvstelm.d xr10, a0, 0, 2 add.d a0, a0, t7 xvstelm.d xr10, a0, 0, 3 add.d a0, a0, t7 addi.w a4, a4, -4 bnez a4, .l_\lable\()v_8w_loop addi.d a1, t0, 4 addi.d t0, t0, 4 addi.d a0, t8, 8 addi.d t8, t8, 8 addi.d a4, t5, 0 addi.d a3, a3, -4 bnez a3, .l_\lable\()v_8w_loop0 .l_\lable\()end_pre_8tap: .endm function prep_8tap_regular_8bpc_lasx addi.w a7, zero, 0 PREP_8TAP_8BPC_LASX 0 endfunc function prep_8tap_smooth_regular_8bpc_lasx addi.w a7, zero, 1 PREP_8TAP_8BPC_LASX 1 endfunc function prep_8tap_sharp_regular_8bpc_lasx addi.w a7, zero, 2 PREP_8TAP_8BPC_LASX 2 endfunc function prep_8tap_regular_smooth_8bpc_lasx addi.w a7, zero, 4 PREP_8TAP_8BPC_LASX 4 endfunc function prep_8tap_smooth_8bpc_lasx addi.w a7, zero, 5 PREP_8TAP_8BPC_LASX 5 endfunc function prep_8tap_sharp_smooth_8bpc_lasx addi.w a7, zero, 6 PREP_8TAP_8BPC_LASX 6 endfunc function prep_8tap_regular_sharp_8bpc_lasx addi.w a7, zero, 8 PREP_8TAP_8BPC_LASX 8 endfunc function prep_8tap_smooth_sharp_8bpc_lasx addi.w a7, zero, 9 PREP_8TAP_8BPC_LASX 9 endfunc function prep_8tap_sharp_8bpc_lasx addi.w a7, zero, 10 PREP_8TAP_8BPC_LASX 10 endfunc
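/* Argument registers used by PREP_8TAP_8BPC_LASX above, as implied by its
   loads and stores: a0 = int16_t *tmp, a1 = src, a2 = src_stride, a3 = w,
   a4 = h, a5 = mx, a6 = my, a7 = filter_type (horizontal | vertical << 2,
   set up by each prep_8tap_* wrapper). */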