/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

#define REST_UNIT_STRIDE (400)

.macro MADD_HU_BU in0, in1, out0, out1
    vsllwil.hu.bu   vr12, \in0, 0
    vexth.hu.bu     vr13, \in0
    vmadd.h         \out0, vr12, \in1
    vmadd.h         \out1, vr13, \in1
.endm

const wiener_shuf
    .byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst

/*
 * void wiener_filter_h_lsx(int32_t *hor_ptr, uint8_t *tmp_ptr,
 *                          const int16_t filterh[8],
 *                          const int w, const int h)
 */
function wiener_filter_h_8bpc_lsx
    addi.d          sp, sp, -40
    fst.d           f24, sp, 0
    fst.d           f25, sp, 8
    fst.d           f26, sp, 16
    fst.d           f27, sp, 24
    fst.d           f28, sp, 32

    li.w            t7, 1<<14             // clip_limit
    la.local        t1, wiener_shuf
    vld             vr4, t1, 0
    vld             vr14, a2, 0           // filter[0][k]
    vreplvei.h      vr21, vr14, 0
    vreplvei.h      vr22, vr14, 1
    vreplvei.h      vr23, vr14, 2
    vreplvei.h      vr24, vr14, 3
    vreplvei.h      vr25, vr14, 4
    vreplvei.h      vr26, vr14, 5
    vreplvei.h      vr27, vr14, 6
    vreplgr2vr.w    vr0, t7

.WIENER_FILTER_H_H:
    addi.w          a4, a4, -1            // h
    addi.w          t0, a3, 0             // w
    addi.d          t1, a1, 0             // tmp_ptr
    addi.d          t2, a0, 0             // hor_ptr

.WIENER_FILTER_H_W:
    addi.w          t0, t0, -16
    vld             vr5, t1, 0
    vld             vr13, t1, 16
    vsubi.bu        vr14, vr4, 2
    vsubi.bu        vr15, vr4, 1
    vshuf.b         vr6, vr13, vr5, vr14  // 1 ... 8, 9 ... 16
    vshuf.b         vr7, vr13, vr5, vr15  // 2 ... 9, 10 ... 17
    vshuf.b         vr8, vr13, vr5, vr4   // 3 ... 10, 11 ... 18
    vaddi.bu        vr14, vr4, 1
    vaddi.bu        vr15, vr4, 2
    vshuf.b         vr9, vr13, vr5, vr14  // 4 ... 11, 12 ... 19
    vshuf.b         vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20
    vaddi.bu        vr14, vr4, 3
    vshuf.b         vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21
    vsllwil.hu.bu   vr15, vr8, 0          // 3 4 5 6 7 8 9 10
    vexth.hu.bu     vr16, vr8             // 11 12 13 14 15 16 17 18
    vsllwil.wu.hu   vr17, vr15, 0         // 3 4 5 6
    vexth.wu.hu     vr18, vr15            // 7 8 9 10
    vsllwil.wu.hu   vr19, vr16, 0         // 11 12 13 14
    vexth.wu.hu     vr20, vr16            // 15 16 17 18
    vslli.w         vr17, vr17, 7
    vslli.w         vr18, vr18, 7
    vslli.w         vr19, vr19, 7
    vslli.w         vr20, vr20, 7
    vxor.v          vr15, vr15, vr15
    vxor.v          vr14, vr14, vr14

    MADD_HU_BU      vr5, vr21, vr14, vr15
    MADD_HU_BU      vr6, vr22, vr14, vr15
    MADD_HU_BU      vr7, vr23, vr14, vr15
    MADD_HU_BU      vr8, vr24, vr14, vr15
    MADD_HU_BU      vr9, vr25, vr14, vr15
    MADD_HU_BU      vr10, vr26, vr14, vr15
    MADD_HU_BU      vr11, vr27, vr14, vr15

    vsllwil.w.h     vr5, vr14, 0          // 0 1 2 3
    vexth.w.h       vr6, vr14             // 4 5 6 7
    vsllwil.w.h     vr7, vr15, 0          // 8 9 10 11
    vexth.w.h       vr8, vr15             // 12 13 14 15
    vadd.w          vr17, vr17, vr5
    vadd.w          vr18, vr18, vr6
    vadd.w          vr19, vr19, vr7
    vadd.w          vr20, vr20, vr8
    vadd.w          vr17, vr17, vr0
    vadd.w          vr18, vr18, vr0
    vadd.w          vr19, vr19, vr0
    vadd.w          vr20, vr20, vr0
    vsrli.w         vr1, vr0, 1
    vsubi.wu        vr1, vr1, 1
    vxor.v          vr3, vr3, vr3
    vsrari.w        vr17, vr17, 3
    vsrari.w        vr18, vr18, 3
    vsrari.w        vr19, vr19, 3
    vsrari.w        vr20, vr20, 3
    vclip.w         vr17, vr17, vr3, vr1
    vclip.w         vr18, vr18, vr3, vr1
    vclip.w         vr19, vr19, vr3, vr1
    vclip.w         vr20, vr20, vr3, vr1
    vst             vr17, t2, 0
    vst             vr18, t2, 16
    vst             vr19, t2, 32
    vst             vr20, t2, 48
    addi.d          t1, t1, 16
    addi.d          t2, t2, 64
    blt             zero, t0, .WIENER_FILTER_H_W

    addi.d          a1, a1, REST_UNIT_STRIDE
    addi.d          a0, a0, (REST_UNIT_STRIDE << 2)
    bnez            a4, .WIENER_FILTER_H_H

    fld.d           f24, sp, 0
    fld.d           f25, sp, 8
    fld.d           f26, sp, 16
    fld.d           f27, sp, 24
    fld.d           f28, sp, 32
    addi.d          sp, sp, 40
endfunc

.macro APPLY_FILTER in0, in1, in2
    alsl.d          t7, \in0, \in1, 2
    vld             vr10, t7, 0
    vld             vr11, t7, 16
    vld             vr12, t7, 32
    vld             vr13, t7, 48
    vmadd.w         vr14, vr10, \in2
    vmadd.w         vr15, vr11, \in2
    vmadd.w         vr16, vr12, \in2
    vmadd.w         vr17, vr13, \in2
.endm

.macro wiener_filter_v_8bpc_core_lsx
    vreplgr2vr.w    vr14, t6
    vreplgr2vr.w    vr15, t6
    vreplgr2vr.w    vr16, t6
    vreplgr2vr.w    vr17, t6

    addi.w          t7, t2, 0             // j + index k
    mul.w           t7, t7, t8            // (j + index) * REST_UNIT_STRIDE
    add.w           t7, t7, t4            // (j + index) * REST_UNIT_STRIDE + i

    APPLY_FILTER    t7, a2, vr2
    APPLY_FILTER    t8, t7, vr3
    APPLY_FILTER    t8, t7, vr4
    APPLY_FILTER    t8, t7, vr5
    APPLY_FILTER    t8, t7, vr6
    APPLY_FILTER    t8, t7, vr7
    APPLY_FILTER    t8, t7, vr8
    vssrarni.hu.w   vr15, vr14, 11
    vssrarni.hu.w   vr17, vr16, 11
    vssrlni.bu.h    vr17, vr15, 0
.endm

/*
 * void wiener_filter_v_lsx(uint8_t *p, const ptrdiff_t p_stride,
 *                          const int32_t *hor, const int16_t filterv[8],
 *                          const int w, const int h)
 */
function wiener_filter_v_8bpc_lsx
    li.w            t6, -(1 << 18)
    li.w            t8, REST_UNIT_STRIDE

    ld.h            t0, a3, 0
    ld.h            t1, a3, 2
    vreplgr2vr.w    vr2, t0
    vreplgr2vr.w    vr3, t1
    ld.h            t0, a3, 4
    ld.h            t1, a3, 6
    vreplgr2vr.w    vr4, t0
    vreplgr2vr.w    vr5, t1
    ld.h            t0, a3, 8
    ld.h            t1, a3, 10
    vreplgr2vr.w    vr6, t0
    vreplgr2vr.w    vr7, t1
    ld.h            t0, a3, 12
    vreplgr2vr.w    vr8, t0

    andi            t1, a4, 0xf
    sub.w           t0, a4, t1            // w-w%16
    or              t2, zero, zero        // j
    or              t4, zero, zero
    beqz            t0, .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_H:
    andi            t1, a4, 0xf
    add.d           t3, zero, a0          // p
    or              t4, zero, zero        // i

.WIENER_FILTER_V_W:
    wiener_filter_v_8bpc_core_lsx
    mul.w           t5, t2, a1            // j * stride
    add.w           t5, t5, t4            // j * stride + i
    add.d           t3, a0, t5
    addi.w          t4, t4, 16
    vst             vr17, t3, 0
    bne             t0, t4, .WIENER_FILTER_V_W

    beqz            t1, .WIENER_FILTER_V_W_EQ16
    wiener_filter_v_8bpc_core_lsx
    addi.d          t3, t3, 16
    andi            t1, a4, 0xf

.WIENER_FILTER_V_ST_REM:
    vstelm.b        vr17, t3, 0, 0
    vbsrl.v         vr17, vr17, 1
    addi.d          t3, t3, 1
    addi.w          t1, t1, -1
    bnez            t1, .WIENER_FILTER_V_ST_REM

.WIENER_FILTER_V_W_EQ16:
    addi.w          t2, t2, 1
    blt             t2, a5, .WIENER_FILTER_V_H
    b               .WIENER_FILTER_V_END

.WIENER_FILTER_V_W_LT16:
    andi            t1, a4, 0xf
    add.d           t3, zero, a0
    wiener_filter_v_8bpc_core_lsx
    mul.w           t5, t2, a1            // j * stride
    add.d           t3, a0, t5

.WIENER_FILTER_V_ST_REM_1:
    vstelm.b        vr17, t3, 0, 0
    vbsrl.v         vr17, vr17, 1
    addi.d          t3, t3, 1
    addi.w          t1, t1, -1
    bnez            t1, .WIENER_FILTER_V_ST_REM_1

    addi.w          t2, t2, 1
    blt             t2, a5, .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_END:
endfunc
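/*
 * Reference model (documentation only, not assembled): a scalar sketch of
 * what the wiener_filter_h/wiener_filter_v pair above computes for 8 bpc,
 * derived from the vector code. Helper names and loop bounds are
 * illustrative; the assembly processes 16 pixels per iteration and may
 * overshoot w into the padded REST_UNIT_STRIDE buffers.
 *
 *   static int iclip(int v, int lo, int hi) {
 *       return v < lo ? lo : v > hi ? hi : v;
 *   }
 *
 *   // horizontal pass: 7-tap filter, centre tap boosted by 128, biased by
 *   // 1 << 14, rounded by 3 bits and clipped to 13 bits
 *   static void wiener_h_ref(int32_t *hor, const uint8_t *tmp,
 *                            const int16_t fh[8], int w, int h) {
 *       for (int j = 0; j < h; j++) {
 *           for (int i = 0; i < w; i++) {
 *               int sum = (1 << 14) + 128 * tmp[i + 3];
 *               for (int k = 0; k < 7; k++)
 *                   sum += fh[k] * tmp[i + k];
 *               hor[i] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1);
 *           }
 *           tmp += REST_UNIT_STRIDE;
 *           hor += REST_UNIT_STRIDE;
 *       }
 *   }
 *
 *   // vertical pass: 7-tap filter over the intermediate rows, accumulated
 *   // from -(1 << 18), rounded by 11 bits and saturated to a byte
 *   static void wiener_v_ref(uint8_t *p, ptrdiff_t stride, const int32_t *hor,
 *                            const int16_t fv[8], int w, int h) {
 *       for (int j = 0; j < h; j++)
 *           for (int i = 0; i < w; i++) {
 *               int sum = -(1 << 18);
 *               for (int k = 0; k < 7; k++)
 *                   sum += fv[k] * hor[(j + k) * REST_UNIT_STRIDE + i];
 *               p[j * stride + i] = iclip((sum + (1 << 10)) >> 11, 0, 255);
 *           }
 *   }
 */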
/*
 * void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
 *                const int w, const int h)
 */
function boxsum3_h_8bpc_lsx
    addi.d          a2, a2, REST_UNIT_STRIDE
    li.w            t0, 1
    addi.w          a3, a3, -2
    addi.w          a4, a4, -4

.LBS3_H_H:
    alsl.d          t1, t0, a1, 1         // sum_v   *sum_v   = sum + x
    alsl.d          t2, t0, a0, 2         // sumsq_v *sumsq_v = sumsq + x
    add.d           t3, t0, a2            // s
    addi.w          t5, a3, 0

.LBS3_H_W:
    vld             vr0, t3, 0
    vld             vr1, t3, REST_UNIT_STRIDE
    vld             vr2, t3, (REST_UNIT_STRIDE<<1)
    vilvl.b         vr3, vr1, vr0
    vhaddw.hu.bu    vr4, vr3, vr3
    vilvh.b         vr5, vr1, vr0
    vhaddw.hu.bu    vr6, vr5, vr5
    vsllwil.hu.bu   vr7, vr2, 0
    vexth.hu.bu     vr8, vr2
    // sum_v
    vadd.h          vr4, vr4, vr7
    vadd.h          vr6, vr6, vr8
    vst             vr4, t1, REST_UNIT_STRIDE<<1
    vst             vr6, t1, (REST_UNIT_STRIDE<<1)+16
    addi.d          t1, t1, 32
    // sumsq
    vmulwev.h.bu    vr9, vr3, vr3
    vmulwod.h.bu    vr10, vr3, vr3
    vmulwev.h.bu    vr11, vr5, vr5
    vmulwod.h.bu    vr12, vr5, vr5
    vmul.h          vr7, vr7, vr7
    vmul.h          vr8, vr8, vr8
    vaddwev.w.hu    vr13, vr10, vr9
    vaddwod.w.hu    vr14, vr10, vr9
    vilvl.w         vr3, vr14, vr13
    vilvh.w         vr4, vr14, vr13
    vaddwev.w.hu    vr13, vr12, vr11
    vaddwod.w.hu    vr14, vr12, vr11
    vilvl.w         vr15, vr14, vr13
    vilvh.w         vr16, vr14, vr13
    vsllwil.wu.hu   vr9, vr7, 0
    vexth.wu.hu     vr10, vr7
    vsllwil.wu.hu   vr11, vr8, 0
    vexth.wu.hu     vr12, vr8
    vadd.w          vr9, vr9, vr3
    vadd.w          vr10, vr10, vr4
    vadd.w          vr11, vr11, vr15
    vadd.w          vr12, vr12, vr16
    vst             vr9, t2, REST_UNIT_STRIDE<<2
    vst             vr10, t2, (REST_UNIT_STRIDE<<2)+16
    vst             vr11, t2, (REST_UNIT_STRIDE<<2)+32
    vst             vr12, t2, (REST_UNIT_STRIDE<<2)+48
    addi.d          t2, t2, 64
    addi.w          t5, t5, -16
    addi.d          t3, t3, 16
    blt             zero, t5, .LBS3_H_W

    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.d          a2, a2, REST_UNIT_STRIDE
    addi.d          a4, a4, -1
    blt             zero, a4, .LBS3_H_H

.LBS3_H_END:
endfunc

/*
 * void boxsum3_v(int32_t *sumsq, coef *sum, const int w, const int h)
 */
function boxsum3_v_8bpc_lsx
    addi.d          a0, a0, (REST_UNIT_STRIDE<<2)
    addi.d          a1, a1, (REST_UNIT_STRIDE<<1)
    addi.w          a3, a3, -4
    addi.w          a2, a2, -4

.LBS3_V_H:
    sub.w           t3, a2, zero
    addi.d          t0, a0, 4
    addi.d          t1, a1, 2
    addi.d          t5, a0, 8
    addi.d          t6, a1, 4
    vld             vr0, t1, 0            // a  0 1 2 3 4 5 6 7
    vld             vr1, t1, 2            // b  1 2 3 4 5 6 7 8
    vld             vr2, t1, 4            // c  2 3 4 5 6 7 8 9
    vld             vr3, t0, 0            // a2 0 1 2 3
    vld             vr4, t0, 4            // b2 1 2 3 4
    vld             vr5, t0, 8            // c2 2 3 4 5
    vld             vr6, t0, 16           // 3 4 5 6
    vld             vr7, t0, 20           // 4 5 6 7
    vld             vr8, t0, 24           // 5 6 7 8
    vadd.h          vr9, vr0, vr1
    vadd.h          vr9, vr9, vr2
    vadd.w          vr10, vr3, vr4
    vadd.w          vr10, vr10, vr5
    vadd.w          vr11, vr6, vr7
    vadd.w          vr11, vr11, vr8
    vpickve2gr.h    t7, vr2, 6
    vpickve2gr.w    t8, vr8, 2
    vst             vr9, t6, 0
    vst             vr10, t5, 0
    vst             vr11, t5, 16
    addi.d          t1, t1, 16
    addi.d          t0, t0, 32
    addi.d          t5, t5, 32
    addi.d          t6, t6, 16
    addi.d          t3, t3, -8
    ble             t3, zero, .LBS3_V_H0

.LBS3_V_W8:
    vld             vr0, t1, 0            // a  0 1 2 3 4 5 6 7
    vld             vr1, t1, 2            // b  1 2 3 4 5 6 7 8
    vld             vr2, t1, 4            // c  2 3 4 5 6 7 8 9
    vld             vr3, t0, 0            // a2 0 1 2 3
    vld             vr4, t0, 4            // b2 1 2 3 4
    vld             vr5, t0, 8            // c2 2 3 4 5
    vld             vr6, t0, 16           // 3 4 5 6
    vld             vr7, t0, 20           // 4 5 6 7
    vld             vr8, t0, 24           // 5 6 7 8
    vinsgr2vr.h     vr0, t7, 0
    vinsgr2vr.w     vr3, t8, 0
    vpickve2gr.h    t7, vr2, 6
    vpickve2gr.w    t8, vr8, 2
    vadd.h          vr9, vr0, vr1
    vadd.w          vr10, vr3, vr4
    vadd.w          vr11, vr6, vr7
    vadd.h          vr9, vr9, vr2
    vadd.w          vr10, vr10, vr5
    vadd.w          vr11, vr11, vr8
    vst             vr9, t6, 0
    vst             vr10, t5, 0
    vst             vr11, t5, 16
    addi.d          t3, t3, -8
    addi.d          t1, t1, 16
    addi.d          t0, t0, 32
    addi.d          t5, t5, 32
    addi.d          t6, t6, 16
    blt             zero, t3, .LBS3_V_W8

.LBS3_V_H0:
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.w          a3, a3, -1
    bnez            a3, .LBS3_V_H

.LBS3_V_END:
endfunc
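/*
 * Reference model (documentation only, not assembled): the idea behind the
 * boxsum3_h/boxsum3_v pair above. The first pass accumulates three vertically
 * adjacent source pixels (and their squares), the second pass adds each entry
 * to its left and right neighbours, producing the 3x3 box sums used by the
 * self-guided filter. This scalar sketch is centred on (x, y) and ignores the
 * exact edge rows/columns and row offsets the vector code applies inside the
 * padded REST_UNIT_STRIDE buffers.
 *
 *   static void boxsum3_ref(int32_t *sumsq, int16_t *sum,
 *                           const uint8_t *src, int w, int h) {
 *       // vertical pass: three vertically adjacent pixels per output
 *       for (int y = 1; y < h - 1; y++)
 *           for (int x = 0; x < w; x++) {
 *               const int a = src[(y - 1) * REST_UNIT_STRIDE + x];
 *               const int b = src[ y      * REST_UNIT_STRIDE + x];
 *               const int c = src[(y + 1) * REST_UNIT_STRIDE + x];
 *               sum  [y * REST_UNIT_STRIDE + x] = a + b + c;
 *               sumsq[y * REST_UNIT_STRIDE + x] = a * a + b * b + c * c;
 *           }
 *       // horizontal pass: add the left and right neighbour, in place,
 *       // carrying the original left value across the overwrite
 *       for (int y = 1; y < h - 1; y++) {
 *           int16_t *s = sum   + y * REST_UNIT_STRIDE;
 *           int32_t *q = sumsq + y * REST_UNIT_STRIDE;
 *           int s_prev = s[0], q_prev = q[0];
 *           for (int x = 1; x < w - 1; x++) {
 *               const int s_cur = s[x], q_cur = q[x];
 *               s[x] = s_prev + s_cur + s[x + 1];
 *               q[x] = q_prev + q_cur + q[x + 1];
 *               s_prev = s_cur;
 *               q_prev = q_cur;
 *           }
 *       }
 *   }
 */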
/*
 * boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
 *                           const int w, const int h, const unsigned s)
 */
function boxsum3_sgf_h_8bpc_lsx
    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a0, a0, 12            // AA
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.d          a1, a1, 6             // BB
    la.local        t8, dav1d_sgr_x_by_x
    li.w            t6, 455
    vreplgr2vr.w    vr20, t6
    li.w            t6, 255
    vreplgr2vr.w    vr22, t6
    vaddi.wu        vr21, vr22, 1         // 256
    vreplgr2vr.w    vr6, a4
    vldi            vr19, 0x809
    addi.w          a2, a2, 2             // w + 2
    addi.w          a3, a3, 2             // h + 2

.LBS3SGF_H_H:
    addi.w          t2, a2, 0
    addi.d          t0, a0, -4
    addi.d          t1, a1, -2

.LBS3SGF_H_W:
    addi.w          t2, t2, -8
    vld             vr0, t0, 0            // AA[i]
    vld             vr1, t0, 16
    vld             vr2, t1, 0            // BB[i]
    vmul.w          vr4, vr0, vr19        // a * n
    vmul.w          vr5, vr1, vr19        // a * n
    vsllwil.w.h     vr9, vr2, 0
    vexth.w.h       vr10, vr2
    vmsub.w         vr4, vr9, vr9         // p
    vmsub.w         vr5, vr10, vr10       // p
    vmaxi.w         vr4, vr4, 0
    vmaxi.w         vr5, vr5, 0           // p
    vmul.w          vr4, vr4, vr6         // p * s
    vmul.w          vr5, vr5, vr6         // p * s
    vsrlri.w        vr4, vr4, 20
    vsrlri.w        vr5, vr5, 20          // z
    vmin.w          vr4, vr4, vr22
    vmin.w          vr5, vr5, vr22
    vpickve2gr.w    t6, vr4, 0
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 0
    vpickve2gr.w    t6, vr4, 1
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 1
    vpickve2gr.w    t6, vr4, 2
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 2
    vpickve2gr.w    t6, vr4, 3
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 3
    vpickve2gr.w    t6, vr5, 0
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 0
    vpickve2gr.w    t6, vr5, 1
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 1
    vpickve2gr.w    t6, vr5, 2
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 2
    vpickve2gr.w    t6, vr5, 3
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 3            // x
    vmul.w          vr9, vr7, vr9         // x * BB[i]
    vmul.w          vr10, vr8, vr10
    vmul.w          vr9, vr9, vr20        // x * BB[i] * sgr_one_by_x
    vmul.w          vr10, vr10, vr20
    vsrlri.w        vr9, vr9, 12
    vsrlri.w        vr10, vr10, 12
    vsub.w          vr7, vr21, vr7
    vsub.w          vr8, vr21, vr8
    vpickev.h       vr8, vr8, vr7
    vst             vr9, t0, 0
    vst             vr10, t0, 16
    vst             vr8, t1, 0
    addi.d          t0, t0, 32
    addi.d          t1, t1, 16
    blt             zero, t2, .LBS3SGF_H_W

    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.w          a3, a3, -1
    bnez            a3, .LBS3SGF_H_H
endfunc

/*
 * boxsum3_selfguided_filter(coef *dst, pixel *src, int32_t *sumsq, coef *sum,
 *                           const int w, const int h)
 */
function boxsum3_sgf_v_8bpc_lsx
    addi.d          a1, a1, (3*REST_UNIT_STRIDE+3) // src
    addi.d          a2, a2, REST_UNIT_STRIDE<<2
    addi.d          a2, a2, (REST_UNIT_STRIDE<<2)+12
    addi.d          a3, a3, REST_UNIT_STRIDE<<2
    addi.d          a3, a3, 6

.LBS3SGF_V_H:
    // A int32_t *sumsq
    addi.d          t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
    addi.d          t1, a2, 0                      // sumsq
    addi.d          t2, a2, REST_UNIT_STRIDE<<2    // +stride
    addi.d          t6, a1, 0
    addi.w          t7, a4, 0
    addi.d          t8, a0, 0
    // B coef *sum
    addi.d          t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
    addi.d          t4, a3, 0
    addi.d          t5, a3, REST_UNIT_STRIDE<<1

.LBS3SGF_V_W:
    vld             vr0, t0, 0            // P[i - REST_UNIT_STRIDE]
    vld             vr1, t0, 16
    vld             vr2, t1, -4           // P[i-1]
    vld             vr3, t1, 12
    vld             vr4, t2, 0            // P[i + REST_UNIT_STRIDE]
    vld             vr5, t2, 16
    vld             vr6, t1, 0            // p[i]
    vld             vr7, t1, 16
    vld             vr8, t1, 4            // p[i+1]
    vld             vr9, t1, 20
    vld             vr10, t0, -4          // P[i - 1 - REST_UNIT_STRIDE]
    vld             vr11, t0, 12
    vld             vr12, t2, -4          // P[i - 1 + REST_UNIT_STRIDE]
    vld             vr13, t2, 12
    vld             vr14, t0, 4           // P[i + 1 - REST_UNIT_STRIDE]
    vld             vr15, t0, 20
    vld             vr16, t2, 4           // P[i + 1 + REST_UNIT_STRIDE]
    vld             vr17, t2, 20
    vadd.w          vr0, vr2, vr0
    vadd.w          vr4, vr6, vr4
    vadd.w          vr0, vr0, vr8
    vadd.w          vr20, vr0, vr4
    vslli.w         vr20, vr20, 2         // 0 1 2 3
    vadd.w          vr0, vr1, vr3
    vadd.w          vr4, vr5, vr7
    vadd.w          vr0, vr0, vr9
    vadd.w          vr21, vr0, vr4
    vslli.w         vr21, vr21, 2         // 4 5 6 7
    vadd.w          vr12, vr10, vr12
    vadd.w          vr16, vr14, vr16
    vadd.w          vr22, vr12, vr16
    vslli.w         vr23, vr22, 1
    vadd.w          vr22, vr23, vr22
    vadd.w          vr11, vr11, vr13
    vadd.w          vr15, vr15, vr17
    vadd.w          vr0, vr11, vr15
    vslli.w         vr23, vr0, 1
    vadd.w          vr23, vr23, vr0
    vadd.w          vr20, vr20, vr22      // b
    vadd.w          vr21, vr21, vr23
    // B coef *sum
    vld             vr0, t3, 0            // P[i - REST_UNIT_STRIDE]
    vld             vr1, t4, -2           // p[i - 1]
    vld             vr2, t4, 0            // p[i]
    vld             vr3, t4, 2            // p[i + 1]
    vld             vr4, t5, 0            // P[i + REST_UNIT_STRIDE]
    vld             vr5, t3, -2           // P[i - 1 - REST_UNIT_STRIDE]
    vld             vr6, t5, -2           // P[i - 1 + REST_UNIT_STRIDE]
    vld             vr7, t3, 2            // P[i + 1 - REST_UNIT_STRIDE]
    vld             vr8, t5, 2            // P[i + 1 + REST_UNIT_STRIDE]
    vaddwev.w.h     vr9, vr0, vr1
    vaddwod.w.h     vr10, vr0, vr1
    vaddwev.w.h     vr11, vr2, vr3
    vaddwod.w.h     vr12, vr2, vr3
    vadd.w          vr9, vr11, vr9
    vadd.w          vr10, vr12, vr10
    vilvl.w         vr11, vr10, vr9       // 0 1 2 3
    vilvh.w         vr12, vr10, vr9       // 4 5 6 7
    vsllwil.w.h     vr0, vr4, 0
    vexth.w.h       vr1, vr4
    vadd.w          vr0, vr11, vr0
    vadd.w          vr1, vr12, vr1
    vslli.w         vr0, vr0, 2
    vslli.w         vr1, vr1, 2
    vaddwev.w.h     vr9, vr5, vr6
    vaddwod.w.h     vr10, vr5, vr6
    vaddwev.w.h     vr11, vr7, vr8
    vaddwod.w.h     vr12, vr7, vr8
    vadd.w          vr9, vr11, vr9
    vadd.w          vr10, vr12, vr10
    vilvl.w         vr13, vr10, vr9
    vilvh.w         vr14, vr10, vr9
    vslli.w         vr15, vr13, 1
    vslli.w         vr16, vr14, 1
    vadd.w          vr15, vr13, vr15      // a
    vadd.w          vr16, vr14, vr16
    vadd.w          vr22, vr0, vr15
    vadd.w          vr23, vr1, vr16
    vld             vr0, t6, 0            // src
    vsllwil.hu.bu   vr0, vr0, 0
    vsllwil.wu.hu   vr1, vr0, 0
    vexth.wu.hu     vr2, vr0
    vmadd.w         vr20, vr22, vr1
    vmadd.w         vr21, vr23, vr2
    vssrlrni.h.w    vr21, vr20, 9
    vst             vr21, t8, 0
    addi.d          t8, t8, 16
    addi.d          t0, t0, 32
    addi.d          t1, t1, 32
    addi.d          t2, t2, 32
    addi.d          t3, t3, 16
    addi.d          t4, t4, 16
    addi.d          t5, t5, 16
    addi.d          t6, t6, 8
    addi.w          t7, t7, -8
    blt             zero, t7, .LBS3SGF_V_W

    addi.w          a5, a5, -1
    addi.d          a0, a0, 384*2
    addi.d          a1, a1, REST_UNIT_STRIDE
    addi.d          a3, a3, REST_UNIT_STRIDE<<1
    addi.d          a2, a2, REST_UNIT_STRIDE<<2
    bnez            a5, .LBS3SGF_V_H
endfunc

#define FILTER_OUT_STRIDE (384)

/*
 * sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride, const int16_t *dst,
 *                  const int w1, const int w, const int h)
 */
function sgr_3x3_finish_8bpc_lsx
    vreplgr2vr.w    vr3, a3               // w1
    andi            t4, a4, 0x7
    sub.w           t5, a4, t4
    beq             zero, t5, .LSGR3X3_REM

.LSGR3X3_H:
    addi.d          t0, a0, 0
    addi.d          t1, a2, 0
    addi.w          t2, t5, 0
    andi            t4, a4, 0x7

.LSGR3X3_W:
    vld             vr0, t0, 0
    vld             vr1, t1, 0
    vsllwil.hu.bu   vr2, vr0, 4           // u 8 h
    vsllwil.wu.hu   vr4, vr2, 0           // p
    vexth.wu.hu     vr5, vr2              // p
    vslli.w         vr6, vr4, 7
    vslli.w         vr7, vr5, 7
    vsllwil.w.h     vr8, vr1, 0           // dst
    vexth.w.h       vr9, vr1              // dst
    vsub.w          vr8, vr8, vr4
    vsub.w          vr9, vr9, vr5
    vmadd.w         vr6, vr8, vr3         // v 0 - 3
    vmadd.w         vr7, vr9, vr3         // v 4 - 7
    vssrarni.hu.w   vr7, vr6, 11
    vssrlni.bu.h    vr7, vr7, 0
    vstelm.d        vr7, t0, 0, 0
    addi.d          t0, t0, 8
    addi.d          t1, t1, 16
    addi.d          t2, t2, -8
    bne             zero, t2, .LSGR3X3_W

    beq             t4, zero, .LSGR3X3_NOREM
    vld             vr0, t0, 0
    vld             vr1, t1, 0
    vsllwil.hu.bu   vr2, vr0, 4           // u 8 h
    vsllwil.wu.hu   vr4, vr2, 0           // p
    vexth.wu.hu     vr5, vr2              // p
    vslli.w         vr6, vr4, 7
    vslli.w         vr7, vr5, 7
    vsllwil.w.h     vr8, vr1, 0           // dst
    vexth.w.h       vr9, vr1              // dst
    vsub.w          vr8, vr8, vr4
    vsub.w          vr9, vr9, vr5
    vmadd.w         vr6, vr8, vr3         // v 0 - 3
    vmadd.w         vr7, vr9, vr3         // v 4 - 7
    vssrarni.hu.w   vr7, vr6, 11
    vssrlni.bu.h    vr7, vr7, 0

.LSGR3X3_ST:
    vstelm.b        vr7, t0, 0, 0
    addi.d          t0, t0, 1
    vbsrl.v         vr7, vr7, 1
    addi.w          t4, t4, -1
    bnez            t4, .LSGR3X3_ST

.LSGR3X3_NOREM:
    addi.w          a5, a5, -1
    add.d           a0, a0, a1
    addi.d          a2, a2, (FILTER_OUT_STRIDE<<1)
    bnez            a5, .LSGR3X3_H
    b               .LSGR3X3_END

.LSGR3X3_REM:
    andi            t4, a4, 0x7
    addi.d          t0, a0, 0
    vld             vr0, t0, 0
    vld             vr1, a2, 0
    vsllwil.hu.bu   vr2, vr0, 4           // u 8 h
    vsllwil.wu.hu   vr4, vr2, 0           // p
    vexth.wu.hu     vr5, vr2              // p
    vslli.w         vr6, vr4, 7
    vslli.w         vr7, vr5, 7
    vsllwil.w.h     vr8, vr1, 0           // dst
    vexth.w.h       vr9, vr1              // dst
    vsub.w          vr8, vr8, vr4
    vsub.w          vr9, vr9, vr5
    vmadd.w         vr6, vr8, vr3         // v 0 - 3
    vmadd.w         vr7, vr9, vr3         // v 4 - 7
    vssrarni.hu.w   vr7, vr6, 11
    vssrlni.bu.h    vr7, vr7, 0

.LSGR3X3_REM_ST:
    vstelm.b        vr7, t0, 0, 0
    addi.d          t0, t0, 1
    vbsrl.v         vr7, vr7, 1
    addi.w          t4, t4, -1
    bnez            t4, .LSGR3X3_REM_ST

    addi.w          a5, a5, -1
    add.d           a0, a0, a1
    addi.d          a2, a2, (FILTER_OUT_STRIDE<<1)
    bnez            a5, .LSGR3X3_REM

.LSGR3X3_END:
endfunc
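/*
 * Reference model (documentation only, not assembled): the per-pixel blend
 * performed by sgr_3x3_finish above, written out in scalar C as derived from
 * the vector code. dst holds the self-guided filter output, one row every
 * FILTER_OUT_STRIDE elements, and w1 is the sgr weight.
 *
 *   static void sgr_3x3_finish_ref(uint8_t *p, ptrdiff_t stride,
 *                                  const int16_t *dst, int w1, int w, int h) {
 *       for (int j = 0; j < h; j++) {
 *           for (int i = 0; i < w; i++) {
 *               const int u = p[i] << 4;
 *               const int v = (u << 7) + w1 * (dst[i] - u);
 *               const int o = (v + (1 << 10)) >> 11;
 *               p[i] = o < 0 ? 0 : o > 255 ? 255 : o;
 *           }
 *           p   += stride;
 *           dst += FILTER_OUT_STRIDE;
 *       }
 *   }
 */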
/*
 * void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
 *              const int w, const int h)
 */
function boxsum5_h_8bpc_lsx
    addi.w          a4, a4, -4
    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    li.w            t6, 1

.LBOXSUM5_H_H:
    addi.w          t3, a3, 0
    addi.d          t2, a2, 0
    addi.d          t0, a0, 0
    addi.d          t1, a1, 0

.LBOXSUM5_H_W:
    vld             vr0, t2, 0                    // a
    vld             vr1, t2, REST_UNIT_STRIDE     // b
    vld             vr2, t2, REST_UNIT_STRIDE<<1  // c
    vld             vr3, t2, REST_UNIT_STRIDE*3   // d
    vld             vr4, t2, REST_UNIT_STRIDE<<2  // e
    vilvl.b         vr5, vr1, vr0
    vilvh.b         vr6, vr1, vr0
    vilvl.b         vr7, vr3, vr2
    vilvh.b         vr8, vr3, vr2
    // sum_v
    vhaddw.hu.bu    vr9, vr5, vr5         // 0 1 2 3 4 5 6 7
    vhaddw.hu.bu    vr10, vr6, vr6        // 8 9 10 11 12 13 14 15 a+b
    vhaddw.hu.bu    vr11, vr7, vr7
    vhaddw.hu.bu    vr12, vr8, vr8
    vadd.h          vr9, vr9, vr11
    vadd.h          vr10, vr10, vr12      // a + b + c + d
    vsllwil.hu.bu   vr11, vr4, 0
    vexth.hu.bu     vr12, vr4
    vadd.h          vr9, vr9, vr11
    vadd.h          vr10, vr10, vr12
    vst             vr9, t1, 0
    vst             vr10, t1, 16
    addi.d          t1, t1, 32
    // sumsq
    vmulwev.h.bu    vr9, vr5, vr5         // a*a 0 1 2 3 4 5 6 7
    vmulwev.h.bu    vr10, vr6, vr6        // a*a 8 9 10 11 12 13 14 15
    vmulwod.h.bu    vr13, vr5, vr5        // b*b 0 1 2 3 4 5 6 7
    vmulwod.h.bu    vr14, vr6, vr6        // b*b 8 9 10 11 12 13 14 15
    vmulwev.h.bu    vr15, vr7, vr7        // c*c 0 1 2 3 4 5 6 7
    vmulwev.h.bu    vr16, vr8, vr8        // c*c 8 9 10 11 12 13 14 15
    vmulwod.h.bu    vr17, vr7, vr7        // d*d 0 1 2 3 4 5 6 7
    vmulwod.h.bu    vr18, vr8, vr8        // d*d 8 9 10 11 12 13 14 15
    vaddwev.w.hu    vr5, vr9, vr13        // 0 2 4 6
    vaddwod.w.hu    vr6, vr9, vr13        // 1 3 5 7
    vaddwev.w.hu    vr7, vr10, vr14       // 8 10 12 14
    vaddwod.w.hu    vr8, vr10, vr14       // 9 11 13 15 a + b
    vaddwev.w.hu    vr19, vr15, vr17      // 0 2 4 6
    vaddwod.w.hu    vr20, vr15, vr17      // 1 3 5 7
    vaddwev.w.hu    vr21, vr16, vr18      // 8 10 12 14
    vaddwod.w.hu    vr22, vr16, vr18      // 9 11 13 15 c + d
    vadd.w          vr5, vr5, vr19
    vadd.w          vr6, vr6, vr20
    vadd.w          vr7, vr7, vr21
    vadd.w          vr8, vr8, vr22
    vilvl.w         vr19, vr6, vr5
    vilvh.w         vr20, vr6, vr5
    vilvl.w         vr21, vr8, vr7
    vilvh.w         vr22, vr8, vr7
    vmul.h          vr11, vr11, vr11
    vmul.h          vr12, vr12, vr12
    vsllwil.wu.hu   vr0, vr11, 0
    vexth.wu.hu     vr1, vr11
    vsllwil.wu.hu   vr2, vr12, 0
    vexth.wu.hu     vr3, vr12
    vadd.w          vr19, vr19, vr0
    vadd.w          vr20, vr20, vr1
    vadd.w          vr21, vr21, vr2
    vadd.w          vr22, vr22, vr3
    vst             vr19, t0, 0
    vst             vr20, t0, 16
    vst             vr21, t0, 32
    vst             vr22, t0, 48
    addi.d          t0, t0, 64
    addi.d          t2, t2, 16
    addi.w          t3, t3, -16
    blt             zero, t3, .LBOXSUM5_H_W

    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.d          a2, a2, REST_UNIT_STRIDE
    addi.d          a4, a4, -1
    bnez            a4, .LBOXSUM5_H_H
endfunc

/*
 * void boxsum5_v(int32_t *sumsq, coef *sum, const int w, const int h)
 */
function boxsum5_v_8bpc_lsx
    addi.d          a0, a0, (REST_UNIT_STRIDE<<2)
    addi.d          a1, a1, (REST_UNIT_STRIDE<<1)
    addi.w          a3, a3, -4
    addi.w          a2, a2, -4

.LBOXSUM5_V_H:
    addi.w          t3, a2, 0
    addi.d          t0, a0, 0
    addi.d          t1, a1, 0
    addi.d          t2, a0, 8
    addi.d          t3, a1, 4
    addi.d          t4, a2, 0
    vld             vr0, t1, 0            // a 0 1 2 3 4 5 6 7
    vld             vr1, t1, 2            // b 1 2 3 4 5 6 7 8
    vld             vr2, t1, 4            // c 2
    vld             vr3, t1, 6            // d 3
    vld             vr4, t1, 8            // e 4 5 6 7 8 9 10 11
    vadd.h          vr5, vr0, vr1
    vadd.h          vr6, vr2, vr3
    vpickve2gr.w    t5, vr4, 2
    vadd.h          vr5, vr5, vr6
    vadd.h          vr5, vr5, vr4
    vst             vr5, t3, 0
    vld             vr0, t0, 0            // 0 1 2 3 a
    vld             vr1, t0, 4            // 1 2 3 4 b
    vld             vr2, t0, 8            // 2 3 4 5 c
    vld             vr3, t0, 12           // 3 4 5 6 d
    vld             vr4, t0, 16           // 4 5 6 7 e a
    vld             vr5, t0, 20           // 5 6 7 8 b
    vld             vr6, t0, 24           // 6 7 8 9 c
    vld             vr7, t0, 28           // 7 8 9 10 d
    vld             vr8, t0, 32           // 8 9 10 11 e
    vadd.w          vr9, vr0, vr1
    vadd.w          vr10, vr2, vr3
    vadd.w          vr9, vr9, vr10
    vadd.w          vr9, vr9, vr4
    vadd.w          vr10, vr4, vr5
    vadd.w          vr11, vr6, vr7
    vadd.w          vr10, vr10, vr8
    vadd.w          vr10, vr10, vr11
    vst             vr9, t2, 0
    vst             vr10, t2, 16
    addi.d          t3, t3, 16
    addi.d          t1, t1, 16
    addi.d          t0, t0, 32
    addi.d          t2, t2, 32
    addi.w          t4, t4, -8
    ble             t4, zero, .LBOXSUM5_V_H1

.LBOXSUM5_V_W:
    vld             vr0, t1, 0            // a 0 1 2 3 4 5 6 7
    vld             vr1, t1, 2            // b 1 2 3 4 5 6 7 8
    vld             vr2, t1, 4            // c 2
    vld             vr3, t1, 6            // d 3
    vld             vr4, t1, 8            // e 4 5 6 7 8 9 10 11
    vinsgr2vr.w     vr0, t5, 0
    vpickve2gr.w    t5, vr4, 2
    vextrins.h      vr1, vr0, 0x01
    vadd.h          vr5, vr0, vr1
    vadd.h          vr6, vr2, vr3
    vadd.h          vr5, vr5, vr6
    vadd.h          vr5, vr5, vr4
    vst             vr5, t3, 0
    vaddi.hu        vr0, vr8, 0           // 8 9 10 11 a
    vld             vr1, t0, 4            // 9 10 11 12 b
    vld             vr2, t0, 8            // 10 11 12 13 c
    vld             vr3, t0, 12           // 14 15 16 17 d
    vld             vr4, t0, 16           // 15 16 17 18 e a
    vld             vr5, t0, 20           // 16 17 18 19 b
    vld             vr6, t0, 24           // 17 18 19 20 c
    vld             vr7, t0, 28           // 18 19 20 21 d
    vld             vr8, t0, 32           // 19 20 21 22 e
    vextrins.w      vr1, vr0, 0x01
    vadd.w          vr9, vr0, vr1
    vadd.w          vr10, vr2, vr3
    vadd.w          vr9, vr9, vr10
    vadd.w          vr9, vr9, vr4
    vadd.w          vr10, vr4, vr5
    vadd.w          vr11, vr6, vr7
    vadd.w          vr10, vr10, vr8
    vadd.w          vr10, vr10, vr11
    vst             vr9, t2, 0
    vst             vr10, t2, 16
    addi.d          t3, t3, 16
    addi.d          t1, t1, 16
    addi.d          t0, t0, 32
    addi.d          t2, t2, 32
    addi.w          t4, t4, -8
    blt             zero, t4, .LBOXSUM5_V_W

.LBOXSUM5_V_H1:
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.w          a3, a3, -1
    bnez            a3, .LBOXSUM5_V_H
endfunc

/*
 * selfguided_filter(int32_t *sumsq, coef *sum, const int w, const int h,
 *                   const unsigned s)
 */
function boxsum5_sgf_h_8bpc_lsx
    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a0, a0, 12            // AA
    addi.d          a1, a1, REST_UNIT_STRIDE<<1
    addi.d          a1, a1, 6             // BB
    la.local        t8, dav1d_sgr_x_by_x
    li.w            t6, 164
    vreplgr2vr.w    vr20, t6
    li.w            t6, 255
    vreplgr2vr.w    vr22, t6
    vaddi.wu        vr21, vr22, 1         // 256
    vreplgr2vr.w    vr6, a4
    vldi            vr19, 0x819
    addi.w          a2, a2, 2             // w + 2
    addi.w          a3, a3, 2             // h + 2

.LBS5SGF_H_H:
    addi.w          t2, a2, 0
    addi.d          t0, a0, -4
    addi.d          t1, a1, -2

.LBS5SGF_H_W:
    vld             vr0, t0, 0            // AA[i]
    vld             vr1, t0, 16
    vld             vr2, t1, 0            // BB[i]
    vmul.w          vr4, vr0, vr19        // a * n
    vmul.w          vr5, vr1, vr19        // a * n
    vsllwil.w.h     vr9, vr2, 0
    vexth.w.h       vr10, vr2
    vmsub.w         vr4, vr9, vr9         // p
    vmsub.w         vr5, vr10, vr10       // p
    vmaxi.w         vr4, vr4, 0
    vmaxi.w         vr5, vr5, 0           // p
    vmul.w          vr4, vr4, vr6         // p * s
    vmul.w          vr5, vr5, vr6         // p * s
    vsrlri.w        vr4, vr4, 20
    vsrlri.w        vr5, vr5, 20          // z
    vmin.w          vr4, vr4, vr22
    vmin.w          vr5, vr5, vr22
    // load table data
    vpickve2gr.w    t6, vr4, 0
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 0
    vpickve2gr.w    t6, vr4, 1
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 1
    vpickve2gr.w    t6, vr4, 2
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 2
    vpickve2gr.w    t6, vr4, 3
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr7, t7, 3
    vpickve2gr.w    t6, vr5, 0
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 0
    vpickve2gr.w    t6, vr5, 1
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 1
    vpickve2gr.w    t6, vr5, 2
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 2
    vpickve2gr.w    t6, vr5, 3
    ldx.bu          t7, t8, t6
    vinsgr2vr.w     vr8, t7, 3            // x
    vmul.w          vr9, vr7, vr9         // x * BB[i]
    vmul.w          vr10, vr8, vr10
    vmul.w          vr9, vr9, vr20        // x * BB[i] * sgr_one_by_x
    vmul.w          vr10, vr10, vr20
    vsrlri.w        vr9, vr9, 12
    vsrlri.w        vr10, vr10, 12
    vsub.w          vr7, vr21, vr7
    vsub.w          vr8, vr21, vr8
    vpickev.h       vr8, vr8, vr7
    vst             vr9, t0, 0
    vst             vr10, t0, 16
    vst             vr8, t1, 0
    addi.d          t0, t0, 32
    addi.d          t1, t1, 16
    addi.w          t2, t2, -8
    blt             zero, t2, .LBS5SGF_H_W

    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a0, a0, REST_UNIT_STRIDE<<2
    addi.d          a1, a1, REST_UNIT_STRIDE<<2
    addi.w          a3, a3, -2
    blt             zero, a3, .LBS5SGF_H_H
endfunc
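/*
 * Reference model (documentation only, not assembled): the per-element step
 * shared by boxsum3_sgf_h (n = 9, sgr_one_by_x = 455, vldi 0x809) and
 * boxsum5_sgf_h (n = 25, sgr_one_by_x = 164, vldi 0x819) above, as derived
 * from the vector code. AA holds the box sums of squares, BB the box sums,
 * s is the sgr strength and dav1d_sgr_x_by_x is the shared lookup table.
 *
 *   const int a  = AA[i], b = BB[i];
 *   const int pp = a * n - b * b;
 *   const int p  = pp < 0 ? 0 : pp;
 *   const unsigned z = ((unsigned)p * s + (1 << 19)) >> 20;
 *   const int x  = dav1d_sgr_x_by_x[z > 255 ? 255 : z];
 *   AA[i] = (x * b * sgr_one_by_x + (1 << 11)) >> 12;
 *   BB[i] = 256 - x;
 */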
/*
 * selfguided_filter(coef *dst, pixel *src, int32_t *sumsq, coef *sum,
 *                   const int w, const int h)
 */
function boxsum5_sgf_v_8bpc_lsx
    addi.d          a1, a1, 3*REST_UNIT_STRIDE+3       // src
    addi.d          a2, a2, (2*REST_UNIT_STRIDE+3)<<1  // A
    addi.d          a2, a2, (2*REST_UNIT_STRIDE+3)<<1
    addi.d          a3, a3, (2*REST_UNIT_STRIDE+3)<<1  // B
    addi.w          a5, a5, -1
    vldi            vr10, 0x806
    vldi            vr11, 0x805
    vldi            vr22, 0x406

.LBS5SGF_V_H:
    addi.d          t0, a0, 0
    addi.d          t1, a1, 0
    addi.d          t2, a2, 0
    addi.d          t3, a3, 0
    addi.w          t4, a4, 0
    addi.d          t5, a0, 384*2
    addi.d          t6, a1, REST_UNIT_STRIDE
    addi.d          t7, a2, REST_UNIT_STRIDE<<2
    addi.d          t8, a3, REST_UNIT_STRIDE<<1  // B

.LBS5SGF_V_W:
    // a
    vld             vr0, t3, -REST_UNIT_STRIDE*2
    vld             vr1, t3, REST_UNIT_STRIDE*2
    vld             vr2, t3, (-REST_UNIT_STRIDE-1)*2
    vld             vr3, t3, (REST_UNIT_STRIDE-1)*2
    vld             vr4, t3, (1-REST_UNIT_STRIDE)*2
    vld             vr5, t3, (1+REST_UNIT_STRIDE)*2
    vaddwev.w.h     vr6, vr0, vr1
    vaddwod.w.h     vr7, vr0, vr1
    vmul.w          vr6, vr6, vr10
    vmul.w          vr7, vr7, vr10
    vaddwev.w.h     vr8, vr2, vr3
    vaddwod.w.h     vr9, vr2, vr3
    vaddwev.w.h     vr12, vr4, vr5
    vaddwod.w.h     vr13, vr4, vr5
    vadd.w          vr8, vr8, vr12
    vadd.w          vr9, vr9, vr13
    vmadd.w         vr6, vr8, vr11
    vmadd.w         vr7, vr9, vr11
    vilvl.w         vr18, vr7, vr6
    vilvh.w         vr19, vr7, vr6
    // b
    vld             vr0, t2, -REST_UNIT_STRIDE*4
    vld             vr1, t2, -REST_UNIT_STRIDE*4+16
    vld             vr2, t2, REST_UNIT_STRIDE*4
    vld             vr3, t2, REST_UNIT_STRIDE*4+16
    vld             vr4, t2, (-REST_UNIT_STRIDE-1)*4
    vld             vr5, t2, (-REST_UNIT_STRIDE-1)*4+16
    vld             vr8, t2, (REST_UNIT_STRIDE-1)*4
    vld             vr9, t2, (REST_UNIT_STRIDE-1)*4+16
    vld             vr12, t2, (1-REST_UNIT_STRIDE)*4
    vld             vr13, t2, (1-REST_UNIT_STRIDE)*4+16
    vld             vr14, t2, (1+REST_UNIT_STRIDE)*4
    vld             vr15, t2, (1+REST_UNIT_STRIDE)*4+16
    vadd.w          vr0, vr0, vr2         // 0 1 2 3
    vadd.w          vr1, vr1, vr3         // 4 5 6 7
    vmul.w          vr20, vr0, vr10
    vmul.w          vr21, vr1, vr10
    vadd.w          vr4, vr4, vr8         // 0 1 2 3
    vadd.w          vr5, vr5, vr9         // 4 5 6 7
    vadd.w          vr12, vr12, vr14
    vadd.w          vr13, vr13, vr15
    vadd.w          vr12, vr12, vr4
    vadd.w          vr13, vr13, vr5
    vmadd.w         vr20, vr12, vr11
    vmadd.w         vr21, vr13, vr11
    vld             vr2, t1, 0
    vsllwil.hu.bu   vr2, vr2, 0
    vsllwil.wu.hu   vr3, vr2, 0
    vexth.wu.hu     vr4, vr2
    vmadd.w         vr20, vr18, vr3
    vmadd.w         vr21, vr19, vr4
    vssrlrni.h.w    vr21, vr20, 9
    vst             vr21, t0, 0
    addi.d          t1, t1, 8
    addi.d          t2, t2, 32
    addi.d          t3, t3, 16
    // a
    vld             vr0, t8, 0
    vld             vr1, t8, -2
    vld             vr2, t8, 2
    vmulwev.w.h     vr3, vr0, vr22
    vmulwod.w.h     vr4, vr0, vr22
    vaddwev.w.h     vr5, vr1, vr2
    vaddwod.w.h     vr6, vr1, vr2
    vmadd.w         vr3, vr5, vr11
    vmadd.w         vr4, vr6, vr11
    vilvl.w         vr19, vr4, vr3
    vilvh.w         vr20, vr4, vr3
    // b
    vld             vr0, t7, 0
    vld             vr1, t7, -4
    vld             vr2, t7, 4
    vld             vr5, t7, 16
    vld             vr6, t7, 12
    vld             vr7, t7, 20
    vmul.w          vr8, vr0, vr10
    vmul.w          vr9, vr5, vr10
    vadd.w          vr12, vr1, vr2
    vadd.w          vr13, vr6, vr7
    vmadd.w         vr8, vr12, vr11
    vmadd.w         vr9, vr13, vr11
    vld             vr2, t6, 0
    vsllwil.hu.bu   vr2, vr2, 0
    vsllwil.wu.hu   vr3, vr2, 0
    vexth.wu.hu     vr4, vr2
    vmadd.w         vr8, vr19, vr3
    vmadd.w         vr9, vr20, vr4
    vssrlrni.h.w    vr9, vr8, 8
    vst             vr9, t0, 384*2
    addi.d          t0, t0, 16
    addi.d          t8, t8, 16
    addi.d          t7, t7, 32
    addi.d          t6, t6, 8
    addi.w          t4, t4, -8
    blt             zero, t4, .LBS5SGF_V_W

    addi.w          a5, a5, -2
    addi.d          a0, a0, 384*4                 // dst
    addi.d          a1, a1, REST_UNIT_STRIDE<<1   // src
    addi.d          a2, a2, REST_UNIT_STRIDE<<2   //
    addi.d          a2, a2, REST_UNIT_STRIDE<<2
    addi.d          a3, a3, REST_UNIT_STRIDE<<2   //
    blt             zero, a5, .LBS5SGF_V_H
    bnez            a5, .LBS5SGF_END

.LBS5SGF_V_W1:
    // a
    vld             vr0, a3, -REST_UNIT_STRIDE*2
    vld             vr1, a3, REST_UNIT_STRIDE*2
    vld             vr2, a3, (-REST_UNIT_STRIDE-1)*2
    vld             vr3, a3, (REST_UNIT_STRIDE-1)*2
    vld             vr4, a3, (1-REST_UNIT_STRIDE)*2
    vld             vr5, a3, (1+REST_UNIT_STRIDE)*2
    vaddwev.w.h     vr6, vr0, vr1
    vaddwod.w.h     vr7, vr0, vr1
    vmul.w          vr6, vr6, vr10
    vmul.w          vr7, vr7, vr10
    vaddwev.w.h     vr8, vr2, vr3
    vaddwod.w.h     vr9, vr2, vr3
    vaddwev.w.h     vr12, vr4, vr5
    vaddwod.w.h     vr13, vr4, vr5
    vadd.w          vr8, vr8, vr12
    vadd.w          vr9, vr9, vr13
    vmadd.w         vr6, vr8, vr11
    vmadd.w         vr7, vr9, vr11
    vilvl.w         vr18, vr7, vr6
    vilvh.w         vr19, vr7, vr6
    // b
    vld             vr0, a2, -REST_UNIT_STRIDE*4
    vld             vr1, a2, -REST_UNIT_STRIDE*4+16
    vld             vr2, a2, REST_UNIT_STRIDE*4
    vld             vr3, a2, REST_UNIT_STRIDE*4+16
    vld             vr4, a2, (-REST_UNIT_STRIDE-1)*4
    vld             vr5, a2, (-REST_UNIT_STRIDE-1)*4+16
    vld             vr8, a2, (REST_UNIT_STRIDE-1)*4
    vld             vr9, a2, (REST_UNIT_STRIDE-1)*4+16
    vld             vr12, a2, (1-REST_UNIT_STRIDE)*4
    vld             vr13, a2, (1-REST_UNIT_STRIDE)*4+16
    vld             vr14, a2, (1+REST_UNIT_STRIDE)*4
    vld             vr15, a2, (1+REST_UNIT_STRIDE)*4+16
    vadd.w          vr0, vr0, vr2         // 0 1 2 3
    vadd.w          vr1, vr1, vr3         // 4 5 6 7
    vmul.w          vr20, vr0, vr10
    vmul.w          vr21, vr1, vr10
    vadd.w          vr4, vr4, vr8         // 0 1 2 3
    vadd.w          vr5, vr5, vr9         // 4 5 6 7
    vadd.w          vr12, vr12, vr14
    vadd.w          vr13, vr13, vr15
    vadd.w          vr12, vr12, vr4
    vadd.w          vr13, vr13, vr5
    vmadd.w         vr20, vr12, vr11
    vmadd.w         vr21, vr13, vr11
    vld             vr2, a1, 0
    vsllwil.hu.bu   vr2, vr2, 0
    vsllwil.wu.hu   vr3, vr2, 0
    vexth.wu.hu     vr4, vr2
    vmadd.w         vr20, vr18, vr3
    vmadd.w         vr21, vr19, vr4
    vssrlrni.h.w    vr21, vr20, 9
    vst             vr21, a0, 0
    addi.d          a3, a3, 16
    addi.d          a2, a2, 32
    addi.d          a1, a1, 8
    addi.d          a0, a0, 16
    addi.w          a4, a4, -8
    blt             zero, a4, .LBS5SGF_V_W1

.LBS5SGF_END:
endfunc

/*
 * void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
 *                               const int16_t *dst0, const int16_t *dst1,
 *                               const int w0, const int w1,
 *                               const int w, const int h);
 */
function sgr_mix_finish_8bpc_lsx
    vreplgr2vr.w    vr3, a4               // w0
    vreplgr2vr.w    vr13, a5              // w1
    andi            t4, a6, 0x7
    sub.w           t5, a6, t4
    beq             zero, t5, .LSGRMIX_REM

.LSGRMIX_H:
    addi.d          t0, a0, 0
    addi.d          t1, a2, 0             // dst0
    addi.d          t3, a3, 0             // dst1
    addi.w          t2, t5, 0
    andi            t4, a6, 0x7

.LSGRMIX_W:
    vld             vr0, t0, 0
    vld             vr1, t1, 0
    vld             vr10, t3, 0
    vsllwil.hu.bu   vr2, vr0, 4           // u 8 h
    vsllwil.wu.hu   vr4, vr2, 0           // u 0 1 2 3
    vexth.wu.hu     vr5, vr2              // u 4 5 6 7
    vslli.w         vr6, vr4, 7
    vslli.w         vr7, vr5, 7
    vsllwil.w.h     vr8, vr1, 0           // dst0
    vexth.w.h       vr9, vr1              // dst0
    vsub.w          vr8, vr8, vr4
    vsub.w          vr9, vr9, vr5
    vmadd.w         vr6, vr8, vr3         // v 0 - 3
    vmadd.w         vr7, vr9, vr3         // v 4 - 7
    vsllwil.w.h     vr11, vr10, 0         // dst1
    vexth.w.h       vr12, vr10            // dst1
    vsub.w          vr11, vr11, vr4
    vsub.w          vr12, vr12, vr5
    vmadd.w         vr6, vr11, vr13
    vmadd.w         vr7, vr12, vr13
    vssrarni.hu.w   vr7, vr6, 11
    vssrlni.bu.h    vr7, vr7, 0
    vstelm.d        vr7, t0, 0, 0
    addi.d          t0, t0, 8
    addi.d          t1, t1, 16
    addi.d          t3, t3, 16
    addi.d          t2, t2, -8
    bne             zero, t2, .LSGRMIX_W

    beq             t4, zero, .LSGRMIX_W8
    vld             vr0, t0, 0
    vld             vr1, t1, 0
    vld             vr10, t3, 0
    vsllwil.hu.bu   vr2, vr0, 4           // u 8 h
    vsllwil.wu.hu   vr4, vr2, 0           // p
    vexth.wu.hu     vr5, vr2              // p
    vslli.w         vr6, vr4, 7
    vslli.w         vr7, vr5, 7
    vsllwil.w.h     vr8, vr1, 0           // dst
    vexth.w.h       vr9, vr1              // dst
    vsub.w          vr8, vr8, vr4
    vsub.w          vr9, vr9, vr5
    vmadd.w         vr6, vr8, vr3         // v 0 - 3
    vmadd.w         vr7, vr9, vr3         // v 4 - 7
    vsllwil.w.h     vr11, vr10, 0         // dst1
    vexth.w.h       vr12, vr10            // dst1
    vsub.w          vr11, vr11, vr4
    vsub.w          vr12, vr12, vr5
    vmadd.w         vr6, vr11, vr13
    vmadd.w         vr7, vr12, vr13
    vssrarni.hu.w   vr7, vr6, 11
    vssrlni.bu.h    vr7, vr7, 0

.LSGRMIX_ST:
    vstelm.b        vr7, t0, 0, 0
    addi.d          t0, t0, 1
    vbsrl.v         vr7, vr7, 1
    addi.w          t4, t4, -1
    bnez            t4, .LSGRMIX_ST

.LSGRMIX_W8:
    addi.w          a7, a7, -1
    add.d           a0, a0, a1
    addi.d          a2, a2, (FILTER_OUT_STRIDE<<1)
    addi.d          a3, a3, (FILTER_OUT_STRIDE<<1)
    bnez            a7, .LSGRMIX_H
    b               .LSGR_MIX_END
.LSGRMIX_REM:
    andi            t4, a6, 0x7
    vld             vr0, a0, 0
    vld             vr1, a2, 0
    vld             vr10, a3, 0
    vsllwil.hu.bu   vr2, vr0, 4           // u 8 h
    vsllwil.wu.hu   vr4, vr2, 0           // p
    vexth.wu.hu     vr5, vr2              // p
    vslli.w         vr6, vr4, 7
    vslli.w         vr7, vr5, 7
    vsllwil.w.h     vr8, vr1, 0           // dst
    vexth.w.h       vr9, vr1              // dst
    vsub.w          vr8, vr8, vr4
    vsub.w          vr9, vr9, vr5
    vmadd.w         vr6, vr8, vr3         // v 0 - 3
    vmadd.w         vr7, vr9, vr3         // v 4 - 7
    vsllwil.w.h     vr11, vr10, 0         // dst1
    vexth.w.h       vr12, vr10            // dst1
    vsub.w          vr11, vr11, vr4
    vsub.w          vr12, vr12, vr5
    vmadd.w         vr6, vr11, vr13
    vmadd.w         vr7, vr12, vr13
    vssrarni.hu.w   vr7, vr6, 11
    vssrlni.bu.h    vr7, vr7, 0
    addi.d          t0, a0, 0

.LSGRMIX_REM_ST:
    vstelm.b        vr7, t0, 0, 0
    addi.d          t0, t0, 1
    vbsrl.v         vr7, vr7, 1
    addi.w          t4, t4, -1
    bnez            t4, .LSGRMIX_REM_ST

    addi.w          a7, a7, -1
    add.d           a0, a0, a1
    addi.d          a2, a2, (FILTER_OUT_STRIDE<<1)
    addi.d          a3, a3, (FILTER_OUT_STRIDE<<1)
    bnez            a7, .LSGRMIX_REM

.LSGR_MIX_END:
endfunc
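/*
 * Reference model (documentation only, not assembled): the per-pixel blend
 * performed by sgr_mix_finish above, combining both self-guided passes with
 * their weights w0 and w1, as derived from the vector code. It differs from
 * sgr_3x3_finish only in the second weighted term.
 *
 *   const int u = p[i] << 4;
 *   const int v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
 *   const int o = (v + (1 << 10)) >> 11;
 *   p[i] = o < 0 ? 0 : o > 255 ? 255 : o;
 */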