/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
 * void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
 *                                 coef *const coeff, const int eob HIGHBD_DECL_SUFFIX)
 */
function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    vld vr0, a2, 0
    vld vr2, a2, 16
    vreplgr2vr.h vr20, zero
    vsrai.h vr0, vr0, 2         // coef >> 2
    vsrai.h vr2, vr2, 2
    vst vr20, a2, 0
    // first 1D WHT pass
    vpickod.d vr1, vr0, vr0
    vpickod.d vr3, vr2, vr2
    vadd.h vr4, vr0, vr1
    vsub.h vr5, vr2, vr3
    vsub.h vr6, vr4, vr5
    vsrai.h vr6, vr6, 1
    vsub.h vr0, vr6, vr3
    vsub.h vr2, vr6, vr1
    vsub.h vr1, vr4, vr0
    vadd.h vr3, vr5, vr2
    vst vr20, a2, 16
    // transpose 4x4
    vilvl.h vr4, vr0, vr1
    vilvl.h vr5, vr3, vr2
    vilvl.w vr0, vr5, vr4
    vilvh.w vr2, vr5, vr4
    vilvh.d vr1, vr0, vr0
    vilvh.d vr3, vr2, vr2
    // second 1D WHT pass
    vadd.h vr4, vr0, vr1
    vsub.h vr5, vr2, vr3
    vsub.h vr6, vr4, vr5
    vsrai.h vr6, vr6, 1
    vsub.h vr0, vr6, vr3
    vsub.h vr2, vr6, vr1
    vsub.h vr1, vr4, vr0
    vadd.h vr3, vr5, vr2
    // load dst rows, widen, add residual, clip and store
    vld vr4, a0, 0
    vldx vr5, a0, a1
    alsl.d t0, a1, a0, 1
    vld vr6, t0, 0
    vldx vr7, t0, a1
    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.hu.bu vr6, vr6, 0
    vsllwil.hu.bu vr7, vr7, 0
    vilvl.d vr1, vr0, vr1
    vilvl.d vr2, vr3, vr2
    vilvl.d vr4, vr5, vr4
    vilvl.d vr6, vr7, vr6
    vadd.h vr1, vr1, vr4
    vadd.h vr2, vr2, vr6
    vssrani.bu.h vr2, vr1, 0
    vstelm.w vr2, a0, 0, 0
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 1
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 2
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 3
endfunc

const idct_coeffs, align=4
    // idct4
    .word 2896, 2896*8, 1567, 3784
    // idct8
    .word 799, 4017, 3406, 2276
    // idct16
    .word 401, 4076, 3166, 2598
    .word 1931, 3612, 3920, 1189
    // idct32
    .word 201, 4091, 3035, 2751
    .word 1751, 3703, 3857, 1380
    .word 995, 3973, 3513, 2106
    .word 2440, 3290, 4052, 601
endconst

.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vld \in0, \src, \start
    vld \in1, \src, \start+(\stride*1)
    vld \in2, \src, \start+(\stride*2)
    vld \in3, \src, \start+(\stride*3)
    vld \in4, \src, \start+(\stride*4)
    vld \in5, \src, \start+(\stride*5)
    vld \in6, \src, \start+(\stride*6)
    vld \in7, \src, \start+(\stride*7)
.endm

.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vst \in0, \src, \start
    vst \in1, \src, \start+(\stride*1)
    vst \in2, \src,
\start+(\stride*2) vst \in3, \src, \start+(\stride*3) vst \in4, \src, \start+(\stride*4) vst \in5, \src, \start+(\stride*5) vst \in6, \src, \start+(\stride*6) vst \in7, \src, \start+(\stride*7) .endm .macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15 vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 vld \in8, \src, \start+(\stride*8) vld \in9, \src, \start+(\stride*9) vld \in10, \src, \start+(\stride*10) vld \in11, \src, \start+(\stride*11) vld \in12, \src, \start+(\stride*12) vld \in13, \src, \start+(\stride*13) vld \in14, \src, \start+(\stride*14) vld \in15, \src, \start+(\stride*15) .endm .macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15 vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 vst \in8, \src, \start+(\stride*8) vst \in9, \src, \start+(\stride*9) vst \in10, \src, \start+(\stride*10) vst \in11, \src, \start+(\stride*11) vst \in12, \src, \start+(\stride*12) vst \in13, \src, \start+(\stride*13) vst \in14, \src, \start+(\stride*14) vst \in15, \src, \start+(\stride*15) .endm .macro DST_ADD_W4 in0, in1, in2, in3, in4, in5 vilvl.w vr10, \in1, \in0 // 0 1 2 3 4 5 6 7 x ... vilvl.w vr12, \in3, \in2 // 8 9 10 11 12 13 14 15 x ... vsllwil.hu.bu vr10, vr10, 0 vsllwil.hu.bu vr12, vr12, 0 vadd.h vr10, \in4, vr10 vadd.h vr12, \in5, vr12 vssrani.bu.h vr12, vr10, 0 vstelm.w vr12, a0, 0, 0 add.d t8, a0, a1 vstelm.w vr12, t8, 0, 1 vstelm.w vr12, t2, 0, 2 add.d t8, t2, a1 vstelm.w vr12, t8, 0, 3 .endm .macro VLD_DST_ADD_W4 in0, in1 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1 .endm .macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1 vexth.w.h vr4, \in0 // in1 vexth.w.h vr5, \in1 // in3 vmul.w vr6, vr4, \in4 vmul.w vr7, vr4, \in5 vmadd.w vr6, vr5, \in5 // t3 vmsub.w vr7, vr5, \in4 // t2 vsllwil.w.h vr4, \in2, 0 // in0 vsllwil.w.h vr5, \in3, 0 // in2 vmul.w vr9, vr4, \in6 vmul.w vr10, vr4, \in7 vmadd.w vr9, vr5, \in7 // t0 vmsub.w vr10, vr5, \in6 // t1 vssrarni.h.w vr10, vr9, 12 // t0 t1 vssrarni.h.w vr7, vr6, 12 // t3 t2 vsadd.h \out0, vr10, vr7 // 0 4 8 12 1 5 9 13 c[0] c[1] vssub.h \out1, vr10, vr7 // 3 7 11 15 2 6 10 14 c[3] c[2] .endm .macro inv_dct_dct_4x4_lsx la.local t0, idct_coeffs vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 vldrepl.w vr2, t0, 8 // 1567 vldrepl.w vr3, t0, 12 // 3784 vldrepl.w vr8, t0, 0 // 2896 dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12 vreplgr2vr.h vr15, zero vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15 vst vr15, a2, 0 vst vr15, a2, 16 vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 vshuf4i.d vr14, vr14, 0x01 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro identity_4x4_lsx in0, in1, in2, in3, out0 vsllwil.w.h vr2, \in0, 0 vexth.w.h vr3, \in1 vmul.w vr4, vr2, \in2 vmul.w vr5, vr3, \in2 vssrarni.h.w vr5, vr4, 12 vsadd.h \out0, vr5, \in3 .endm .macro inv_identity_identity_4x4_lsx vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 vreplgr2vr.h 
vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 identity_4x4_lsx vr0, vr0, vr20, vr0, vr6 identity_4x4_lsx vr1, vr1, vr20, vr1, vr7 vsrari.h vr6, vr6, 4 vsrari.h vr7, vr7, 4 vilvh.d vr8, vr6, vr6 vilvh.d vr9, vr7, vr7 vilvl.h vr4, vr8, vr6 vilvl.h vr5, vr9, vr7 vilvl.w vr6, vr5, vr4 vilvh.w vr7, vr5, vr4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr6, vr7 .endm const iadst4_coeffs, align=4 .word 1321, 3803, 2482, 3344 endconst .macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3 vsub.w vr6, \in0, \in2 // in0-in2 vmul.w vr7, \in0, vr20 // in0*1321 vmadd.w vr7, \in2, vr21 // in0*1321+in2*3803 vmadd.w vr7, \in3, vr22 // in0*1321+in2*3803+in3*2482 vmul.w vr8, \in1, vr23 // in1*3344 vadd.w vr6, vr6, \in3 // in0-in2+in3 vmul.w vr9, \in0, vr22 // in0*2482 vmsub.w vr9, \in2, vr20 // in2*1321 vmsub.w vr9, \in3, vr21 // in0*2482-in2*1321-in3*3803 vadd.w vr5, vr7, vr9 vmul.w \out2, vr6, vr23 // out[2] 8 9 10 11 vadd.w \out0, vr7, vr8 // out[0] 0 1 2 3 vadd.w \out1, vr9, vr8 // out[1] 4 5 6 7 vsub.w \out3, vr5, vr8 // out[3] 12 13 14 15 .endm .macro inv_adst_dct_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 vssrarni.h.w vr13, vr11, 12 vssrarni.h.w vr14, vr12, 12 vreplgr2vr.h vr15, zero la.local t0, idct_coeffs vst vr15, a2, 0 vst vr15, a2, 16 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14 vshuf4i.d vr14, vr14, 0x01 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro inv_adst_adst_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 vsrari.w vr11, vr11, 12 vsrari.w vr13, vr13, 12 vsrari.w vr12, vr12, 12 vsrari.w vr14, vr14, 12 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14 vssrarni.h.w vr13, vr11, 12 vssrarni.h.w vr14, vr12, 12 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro inv_dct_adst_4x4_lsx la.local t0, idct_coeffs vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14 vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 vsllwil.w.h vr2, vr11, 0 // in0 vexth.w.h vr3, vr11 // in1 vsllwil.w.h vr4, vr12, 0 // in2 vexth.w.h vr5, vr12 // in3 la.local t0, iadst4_coeffs vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 
2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14 vssrarni.h.w vr13, vr11, 12 vssrarni.h.w vr14, vr12, 12 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro inv_dct_flipadst_4x4_lsx la.local t0, idct_coeffs vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14 vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 vsllwil.w.h vr2, vr11, 0 // in0 vexth.w.h vr3, vr11 // in1 vsllwil.w.h vr4, vr12, 0 // in2 vexth.w.h vr5, vr12 // in3 la.local t0, iadst4_coeffs vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14 vssrarni.h.w vr11, vr12, 12 // 0 1 2 3 4 5 6 7 vssrarni.h.w vr13, vr14, 12 // 8 9 10 11 12 13 14 15 vsrari.h vr11, vr11, 4 vsrari.h vr13, vr13, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr11 .endm .macro inv_flipadst_adst_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 vsrari.w vr0, vr0, 12 vsrari.w vr1, vr1, 12 vsrari.w vr2, vr2, 12 vsrari.w vr3, vr3, 12 vilvl.w vr4, vr0, vr1 vilvh.w vr5, vr0, vr1 vilvl.w vr6, vr2, vr3 vilvh.w vr7, vr2, vr3 vilvl.d vr11, vr4, vr6 vilvh.d vr12, vr4, vr6 vilvl.d vr13, vr5, vr7 vilvh.d vr14, vr5, vr7 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14 vssrarni.h.w vr13, vr11, 12 vssrarni.h.w vr14, vr12, 12 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro inv_adst_flipadst_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 vsrari.w vr11, vr11, 12 vsrari.w vr12, vr12, 12 vsrari.w vr13, vr13, 12 vsrari.w vr14, vr14, 12 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14 vssrarni.h.w vr11, vr12, 12 vssrarni.h.w vr13, vr14, 12 vsrari.h vr11, vr11, 4 vsrari.h vr13, vr13, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr11 .endm .macro inv_flipadst_dct_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 vilvl.w vr4, vr0, vr1 vilvh.w vr5, vr0, vr1 vilvl.w vr6, vr2, vr3 vilvh.w 
vr7, vr2, vr3 vilvl.d vr11, vr4, vr6 vilvh.d vr12, vr4, vr6 vilvl.d vr13, vr5, vr7 vilvh.d vr14, vr5, vr7 vssrarni.h.w vr12, vr11, 12 vssrarni.h.w vr14, vr13, 12 vreplgr2vr.h vr15, zero la.local t0, idct_coeffs vst vr15, a2, 0 vst vr15, a2, 16 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14 vshuf4i.d vr14, vr14, 0x01 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro inv_flipadst_flipadst_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 vilvl.w vr4, vr0, vr1 vilvh.w vr5, vr0, vr1 vilvl.w vr6, vr2, vr3 vilvh.w vr7, vr2, vr3 vilvl.d vr11, vr4, vr6 vilvh.d vr12, vr4, vr6 vilvl.d vr13, vr5, vr7 vilvh.d vr14, vr5, vr7 vsrari.w vr11, vr11, 12 vsrari.w vr12, vr12, 12 vsrari.w vr13, vr13, 12 vsrari.w vr14, vr14, 12 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14 vssrarni.h.w vr11, vr12, 12 vssrarni.h.w vr13, vr14, 12 vsrari.h vr11, vr11, 4 vsrari.h vr13, vr13, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr11 .endm .macro inv_dct_identity_4x4_lsx la.local t0, idct_coeffs vld vr0, a2, 0 vld vr1, a2, 16 vldrepl.w vr2, t0, 8 // 1567 vldrepl.w vr3, t0, 12 // 3784 vldrepl.w vr8, t0, 0 // 2896 dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12 vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15 vreplgr2vr.h vr15, zero li.w t0, 1697 vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 vilvl.h vr10, vr5, vr4 // 0 1 2 3 4 5 6 7 vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 vst vr15, a2, 0 vst vr15, a2, 16 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr10, vr10, vr20, vr10, vr6 identity_4x4_lsx vr12, vr12, vr20, vr12, vr7 vsrari.h vr11, vr6, 4 vsrari.h vr13, vr7, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr11, vr13 .endm .macro inv_identity_dct_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 vreplgr2vr.h vr15, zero vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 vilvl.h vr13, vr5, vr4 // 0 1 2 3 4 5 6 7 vilvh.h vr14, vr5, vr4 // 8 9 10 11 12 13 14 15 vst vr15, a2, 0 vst vr15, a2, 16 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14 vshuf4i.d vr14, vr14, 0x01 vsrari.h vr13, vr13, 4 vsrari.h vr14, vr14, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr14 .endm .macro inv_flipadst_identity_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13 vssrarni.h.w vr12, vr13, 12 vssrarni.h.w vr10, vr11, 12 vilvl.h vr4, vr10, vr12 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr10, vr12 // 1 3 5 7 9 11 13 15 vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 vilvh.h vr13, 
vr5, vr4 // 8 9 10 11 12 13 14 15 vreplgr2vr.h vr15, zero li.w t0, 1697 vst vr15, a2, 0 vst vr15, a2, 16 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr11, vr11, vr20, vr11, vr6 identity_4x4_lsx vr13, vr13, vr20, vr13, vr7 vsrari.h vr11, vr6, 4 vsrari.h vr13, vr7, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr11, vr13 .endm .macro inv_identity_flipadst_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 vilvl.h vr4, vr1, vr0 vilvh.h vr5, vr1, vr0 vilvl.h vr11, vr5, vr4 vilvh.h vr13, vr5, vr4 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr11, 0 // in0 vexth.w.h vr3, vr11 // in1 vsllwil.w.h vr4, vr13, 0 // in2 vexth.w.h vr5, vr13 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 vssrarni.h.w vr0, vr1, 12 // 8 9 10 11 12 13 14 15 vssrarni.h.w vr2, vr3, 12 // 0 1 2 3 4 5 6 7 vsrari.h vr11, vr0, 4 vsrari.h vr13, vr2, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr13, vr11 .endm .macro inv_identity_adst_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 vilvl.h vr4, vr1, vr0 vilvh.h vr5, vr1, vr0 vilvl.h vr11, vr5, vr4 vilvh.h vr13, vr5, vr4 vreplgr2vr.h vr15, zero vst vr15, a2, 0 vst vr15, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr11, 0 // in0 vexth.w.h vr3, vr11 // in1 vsllwil.w.h vr4, vr13, 0 // in2 vexth.w.h vr5, vr13 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 vssrarni.h.w vr1, vr0, 12 vssrarni.h.w vr3, vr2, 12 vsrari.h vr11, vr1, 4 vsrari.h vr13, vr3, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr11, vr13 .endm .macro inv_adst_identity_4x4_lsx vld vr0, a2, 0 vld vr1, a2, 16 la.local t0, iadst4_coeffs vsllwil.w.h vr2, vr0, 0 // in0 vexth.w.h vr3, vr0 // in1 vsllwil.w.h vr4, vr1, 0 // in2 vexth.w.h vr5, vr1 // in3 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 vssrarni.h.w vr13, vr11, 12 vssrarni.h.w vr14, vr12, 12 vreplgr2vr.h vr15, zero li.w t0, 1697 vst vr15, a2, 0 vst vr15, a2, 16 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr13, vr13, vr20, vr13, vr6 identity_4x4_lsx vr14, vr14, vr20, vr14, vr7 vsrari.h vr11, vr6, 4 vsrari.h vr13, vr7, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr11, vr13 .endm .macro fun4x4 type1, type2 function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx .ifc \type1\()_\type2, dct_dct bnez a3, .LLL vldi vr0, 0x8b5 // 181 ld.h t2, a2, 0 // dc st.h zero, a2, 0 vreplgr2vr.w vr1, t2 vldi vr3, 0x880 // 128 vmul.w vr2, vr0, vr1 vld vr10, a0, 0 vsrari.w vr2, vr2, 8 vldx vr11, a0, a1 vmadd.w vr3, vr2, vr0 alsl.d t2, a1, a0, 1 vssrarni.h.w vr3, vr3, 12 vld vr12, t2, 0 vldx vr13, t2, a1 DST_ADD_W4 vr10, vr11, vr12, vr13, vr3, vr3 b .IDST_\type1\()_\type2\()_4X4_END .LLL: .endif inv_\type1\()_\type2\()_4x4_lsx .IDST_\type1\()_\type2\()_4X4_END: endfunc .endm fun4x4 dct, dct fun4x4 identity, identity fun4x4 adst, dct fun4x4 dct, adst fun4x4 adst, adst fun4x4 dct, flipadst fun4x4 flipadst, adst fun4x4 adst, flipadst fun4x4 flipadst, dct fun4x4 flipadst, flipadst fun4x4 dct, 
identity fun4x4 identity, dct fun4x4 flipadst, identity fun4x4 identity, flipadst fun4x4 identity, adst fun4x4 adst, identity function inv_txfm_add_dct_dct_4x8_8bpc_lsx bnez a3, .NO_HAS_DCONLY_4x8 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 vld vr10, a0, 0 vmul.w vr2, vr2, vr0 vldx vr11, a0, a1 vsrari.w vr2, vr2, 8 alsl.d t2, a1, a0, 1 vmadd.w vr5, vr2, vr0 vld vr12, t2, 0 vssrarni.h.w vr5, vr5, 12 vldx vr13, t2, a1 DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 VLD_DST_ADD_W4 vr5, vr5 b .DCT_DCT_4x8_END .NO_HAS_DCONLY_4x8: // sh=8 sw=4 la.local t0, idct_coeffs vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 vld vr20, a2, 32 // 16 17 18 19 20 21 22 23 in2 vld vr21, a2, 48 // 24 25 26 27 28 29 30 31 in3 vldrepl.w vr2, t0, 8 // 1567 vldrepl.w vr3, t0, 12 // 3784 vldrepl.w vr8, t0, 0 // 2896 .macro DCT4_4Wx8H_1D_LSX // in1 in3 vsllwil.w.h vr4, vr1, 0 // in1 vsllwil.w.h vr5, vr21, 0 // in3 vmul.w vr4, vr4, vr8 vmul.w vr5, vr5, vr8 vsrari.w vr4, vr4, 12 vsrari.w vr5, vr5, 12 vmul.w vr6, vr4, vr3 vmul.w vr7, vr4, vr2 vmadd.w vr6, vr5, vr2 // t3 0 1 2 3 vmsub.w vr7, vr5, vr3 // t2 0 1 2 3 vexth.w.h vr4, vr1 // in1 vexth.w.h vr5, vr21 // in3 vmul.w vr4, vr4, vr8 vmul.w vr5, vr5, vr8 vsrari.w vr4, vr4, 12 vsrari.w vr5, vr5, 12 vmul.w vr9, vr4, vr3 vmul.w vr10, vr4, vr2 vmadd.w vr9, vr5, vr2 // t3 4 5 6 7 vmsub.w vr10, vr5, vr3 // t2 4 5 6 7 // in0 in2 vsllwil.w.h vr4, vr0, 0 // in0 vsllwil.w.h vr5, vr20, 0 // in2 vmul.w vr4, vr4, vr8 vmul.w vr5, vr5, vr8 vsrari.w vr4, vr4, 12 vsrari.w vr5, vr5, 12 vmul.w vr11, vr4, vr8 vmul.w vr12, vr4, vr8 vmadd.w vr11, vr5, vr8 // t0 0 1 2 3 vmsub.w vr12, vr5, vr8 // t1 0 1 2 3 vexth.w.h vr4, vr0 // in0 vexth.w.h vr5, vr20 // in2 vmul.w vr4, vr4, vr8 vmul.w vr5, vr5, vr8 vsrari.w vr4, vr4, 12 vsrari.w vr5, vr5, 12 vmul.w vr13, vr4, vr8 vmul.w vr14, vr4, vr8 vmadd.w vr13, vr5, vr8 // t0 4 5 6 7 vmsub.w vr14, vr5, vr8 // t1 4 5 6 7 vssrarni.h.w vr9, vr6, 12 // t3 vssrarni.h.w vr10, vr7, 12 // t2 vssrarni.h.w vr14, vr12, 12 // t1 vssrarni.h.w vr13, vr11, 12 // t0 vsadd.h vr4, vr13, vr9 // c[0] 0 4 8 12 16 20 24 28 vsadd.h vr5, vr14, vr10 // c[1] 1 5 9 13 17 21 25 29 vssub.h vr20, vr14, vr10 // c[2] 2 6 10 14 18 22 26 30 vssub.h vr21, vr13, vr9 // c[3] 3 7 11 15 19 23 27 31 .endm DCT4_4Wx8H_1D_LSX vreplgr2vr.h vr22, zero vst vr22, a2, 0 vst vr22, a2, 16 vst vr22, a2, 32 vst vr22, a2, 48 vilvl.h vr0, vr5, vr4 // 0 1 4 5 8 9 12 13 vilvl.h vr1, vr21, vr20 // 2 3 6 7 10 11 14 15 vilvh.h vr6, vr5, vr4 // 16 17 20 21 24 25 28 29 vilvh.h vr7, vr21, vr20 // 18 19 22 23 26 27 30 31 vilvl.w vr9, vr1, vr0 // 0 1 2 3 4 5 6 7 in0 vilvh.w vr10, vr1, vr0 // 8 9 10 11 12 13 14 15 in1 vilvl.w vr11, vr7, vr6 // 16 17 18 19 20 21 22 23 in2 vilvh.w vr12, vr7, vr6 // 24 25 26 27 28 29 30 31 in3 vilvl.d vr0, vr10, vr9 vilvl.d vr1, vr12, vr11 vilvh.d vr20, vr9, vr11 // in5 in1 vilvh.d vr21, vr12, vr10 // in3 in7 .macro DCT8_4Wx8H_1D_LSX dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14 vldrepl.w vr17, t0, 16 // 799 vldrepl.w vr18, t0, 20 // 4017 vldrepl.w vr11, t0, 24 // 3406 vldrepl.w vr12, t0, 28 // 2276 vexth.w.h vr4, vr20 vexth.w.h vr5, vr21 vmul.w vr6, vr4, vr18 // in1 * 4017 vmul.w vr7, vr4, vr17 // in1 * 799 vmadd.w vr6, vr5, vr17 // in7 * 799 vmsub.w vr7, vr5, vr18 // in7 * 4017 vsllwil.w.h vr4, vr20, 0 vsllwil.w.h vr5, vr21, 0 vmul.w vr9, vr4, vr12 vmul.w vr10, vr4, vr11 vmadd.w vr9, vr5, 
vr11 vmsub.w vr10, vr5, vr12 vssrarni.h.w vr10, vr9, 12 // t6a t5a vssrarni.h.w vr7, vr6, 12 // t7a t4a vsadd.h vr15, vr7, vr10 // t7 t4 vssub.h vr16, vr7, vr10 // t6a t5a vexth.w.h vr4, vr16 // t5a vsllwil.w.h vr5, vr16, 0 // t6a vldi vr2, 0x8b5 // 181 vsub.w vr6, vr5, vr4 vadd.w vr7, vr5, vr4 vmul.w vr6, vr6, vr2 vmul.w vr7, vr7, vr2 vssrarni.h.w vr7, vr6, 8 // t5 t6 vaddi.hu vr18, vr7, 0 vshuf4i.d vr7, vr15, 0x06 // t7 t6 vshuf4i.d vr15, vr18, 0x09 // t4 t5 // vr17 -> vr7 vr18 -> vr15 vsadd.h vr4, vr13, vr7 vsadd.h vr5, vr14, vr15 vssub.h vr6, vr14, vr15 vssub.h vr7, vr13, vr7 .endm DCT8_4Wx8H_1D_LSX vshuf4i.d vr5, vr5, 0x01 vshuf4i.d vr7, vr7, 0x01 vsrari.h vr4, vr4, 4 vsrari.h vr5, vr5, 4 vsrari.h vr6, vr6, 4 vsrari.h vr7, vr7, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 vr4, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 VLD_DST_ADD_W4 vr6, vr7 .DCT_DCT_4x8_END: endfunc .macro rect2_w4_lsx in0, in1, in2, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in1 vmul.w vr22, vr22, \in2 vmul.w vr23, vr23, \in2 vsrari.w \out0, vr22, 12 vsrari.w \out1, vr23, 12 .endm .macro dct_8x4_core_lsx1 out0, out1, out2, out3 // dct4 stride=1<<1 vmul.w vr0, vr6, vr21 vmul.w vr1, vr6, vr20 vmadd.w vr0, vr10, vr20 // t3 vmsub.w vr1, vr10, vr21 // t2 vmul.w vr2, vr18, vr22 vmul.w vr3, vr18, vr22 vmadd.w vr2, vr8, vr22 // t0 vmsub.w vr3, vr8, vr22 // t1 vssrarni.h.w vr1, vr0, 12 // t3 t2 vssrarni.h.w vr3, vr2, 12 // t0 t1 vsadd.h vr8, vr3, vr1 // t0 t1 vssub.h vr10, vr3, vr1 // t3 t2 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vldrepl.w vr22, t0, 24 // 3406 vldrepl.w vr23, t0, 28 // 2276 vmul.w vr0, vr19, vr21 // in1 * 4017 vmul.w vr1, vr19, vr20 // in1 * 799 vmadd.w vr0, vr11, vr20 // in7 * 799 // t7a vmsub.w vr1, vr11, vr21 // in7 * 4017 // t4a vmul.w vr2, vr9, vr23 // in5 * 1138 vmul.w vr3, vr9, vr22 // in5 * 1703 vmadd.w vr2, vr7, vr22 // in3 * 1703 // t6a vmsub.w vr3, vr7, vr23 // in3 * 1138 // t5a vssrarni.h.w vr0, vr1, 12 // t4a t7a vssrarni.h.w vr2, vr3, 12 // t5a t6a vsadd.h vr9, vr0, vr2 // t4 t7 vssub.h vr11, vr0, vr2 // t5a t6a vldrepl.w vr22, t0, 0 // 2896 vexth.w.h vr18, vr11 // t6a vsllwil.w.h vr19, vr11, 0 // t5a vmul.w vr6, vr18, vr22 vmul.w vr7, vr18, vr22 vmadd.w vr6, vr19, vr22 // t6 vmsub.w vr7, vr19, vr22 // t5 vssrarni.h.w vr6, vr7, 12 // t5 t6 vilvh.d vr11, vr6, vr9 // t7 t6 vilvl.d vr9, vr6, vr9 // t4 t5 vsadd.h \out0, vr8, vr11 // c[0] c[1] vsadd.h \out1, vr10, vr9 // c[3] c[2] vssub.h \out2, vr10, vr9 // c[4] c[5] vssub.h \out3, vr8, vr11 // c[7] c[6] .endm .macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 vexth.w.h vr4, \in0 // in1 vexth.w.h vr5, \in1 // in3 vmul.w vr6, vr4, \in4 vmul.w vr7, vr4, \in5 vmadd.w vr6, vr5, \in5 // t3 vmsub.w vr7, vr5, \in4 // t2 vexth.w.h vr4, \in2 // in1 vexth.w.h vr5, \in3 // in3 vmul.w vr8, vr4, \in4 vmul.w vr9, vr4, \in5 vmadd.w vr8, vr5, \in5 // t3 vmsub.w vr9, vr5, \in4 // t2 vssrarni.h.w vr8, vr6, 12 // t3 vssrarni.h.w vr9, vr7, 12 // t2 vsllwil.w.h vr4, \in0, 0 vsllwil.w.h vr5, \in1, 0 vmul.w vr11, vr4, \in6 vmul.w vr12, vr4, \in7 vmadd.w vr11, vr5, \in7 // t0 vmsub.w vr12, vr5, \in6 // t1 vsllwil.w.h vr4, \in2, 0 vsllwil.w.h vr5, \in3, 0 vmul.w vr13, vr4, \in6 vmul.w vr14, vr4, \in7 vmadd.w vr13, vr5, \in7 // t0 vmsub.w vr14, vr5, \in6 // t1 vssrarni.h.w vr13, vr11, 12 // t0 vssrarni.h.w vr14, vr12, 12 // t1 vsadd.h \out0, vr13, vr8 vsadd.h \out1, vr14, vr9 vssub.h \out2, vr14, vr9 vssub.h \out3, vr13, vr8 .endm .macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu vr10, 
\in0, 0 vsllwil.hu.bu vr11, \in1, 0 vsllwil.hu.bu vr12, \in2, 0 vsllwil.hu.bu vr13, \in3, 0 vadd.h vr10, \in4, vr10 vadd.h vr11, \in5, vr11 vadd.h vr12, \in6, vr12 vadd.h vr13, \in7, vr13 vssrani.bu.h vr11, vr10, 0 vssrani.bu.h vr13, vr12, 0 vstelm.d vr11, a0, 0, 0 add.d t8, a0, a1 vstelm.d vr11, t8, 0, 1 vstelm.d vr13, t2, 0, 0 add.d t8, t2, a1 vstelm.d vr13, t8, 0, 1 .endm .macro VLD_DST_ADD_W8 in0, in1, in2, in3 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3 .endm function inv_txfm_add_dct_dct_8x4_8bpc_lsx bnez a3, .NO_HAS_DCONLY_8x4 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 vld vr10, a0, 0 vmul.w vr2, vr2, vr0 vldx vr11, a0, a1 vsrari.w vr2, vr2, 8 alsl.d t2, a1, a0, 1 vmadd.w vr5, vr2, vr0 vld vr12, t2, 0 vssrarni.h.w vr5, vr5, 12 vldx vr13, t2, a1 DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 b .DCT_DCT_8X4_END .NO_HAS_DCONLY_8x4: la.local t0, idct_coeffs vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a2, 32 vld vr3, a2, 48 vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 vshuf4i.d vr1, vr1, 0x01 vshuf4i.d vr3, vr3, 0x01 vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 in0 vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 in1 vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14 vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15 vilvl.h vr2, vr5, vr4 // 16 - 23 in2 vilvh.h vr3, vr5, vr4 // 24 - 31 in3 la.local t0, idct_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ vr22, vr15, vr16, vr17, vr18 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 vsrari.h vr18, vr18, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 .DCT_DCT_8X4_END: endfunc .macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 vssrarni.h.w \in1, \in0, 0 vssrarni.h.w \in3, \in2, 0 vssrarni.h.w \in5, \in4, 0 vssrarni.h.w \in7, \in6, 0 vsadd.h \out0, \in1, \in1 vsadd.h \out1, \in3, \in3 vsadd.h \out2, \in5, \in5 vsadd.h \out3, \in7, \in7 .endm function inv_txfm_add_identity_identity_8x4_8bpc_lsx la.local t0, idct_coeffs vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ vr19, vr7, vr9, vr11 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr19, vr19, vr20, vr19, vr19 identity_4x4_lsx vr7, vr7, vr20, vr7, vr7 identity_4x4_lsx vr9, vr9, vr20, vr9, vr9 identity_4x4_lsx vr11, vr11, vr20, vr11, vr11 vsrari.h vr15, vr19, 4 vsrari.h vr16, vr7, 4 vsrari.h vr17, vr9, 4 vsrari.h vr18, vr11, 4 vilvl.h vr4, vr16, vr15 vilvh.h vr5, vr16, vr15 vilvl.h vr11, 
vr5, vr4 vilvh.h vr12, vr5, vr4 vilvl.h vr4, vr18, vr17 vilvh.h vr5, vr18, vr17 vilvl.h vr13, vr5, vr4 vilvh.h vr14, vr5, vr4 vilvl.d vr15, vr13, vr11 vilvh.d vr16, vr13, vr11 vilvl.d vr17, vr14, vr12 vilvh.d vr18, vr14, vr12 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 endfunc const iadst8_coeffs, align=4 .word 4076, 401, 3612, 1931 .word 2598, 3166, 1189, 3920 // idct_coeffs .word 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst .macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, out0, out1, out2, out3 vmul.w \out0, \in0, \in4 vmul.w \out1, \in0, \in5 vmadd.w \out0, \in1, \in6 // t0a vmsub.w \out1, \in1, \in7 // t1a vmul.w \out2, \in2, \in8 vmul.w \out3, \in2, \in9 vmadd.w \out2, \in3, \in10 // t2a vmsub.w \out3, \in3, \in11 // t3a vssrarni.h.w \out1, \out0, 12 // t0a t1a vssrarni.h.w \out3, \out2, 12 // t2a t3a .endm .macro adst8x4_1d_lsx la.local t0, iadst8_coeffs vldrepl.w vr20, t0, 0 // 4076 vldrepl.w vr21, t0, 4 // 401 vldrepl.w vr22, t0, 8 // 3612 vldrepl.w vr23, t0, 12 // 1931 // vr13 t0a t1a vr15 t2a t3a vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \ vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15 vldrepl.w vr20, t0, 16 // 2598 vldrepl.w vr21, t0, 20 // 3166 vldrepl.w vr22, t0, 24 // 1189 vldrepl.w vr23, t0, 28 // 3920 // vr18 t4a t5a vr6 t6a t7a vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \ vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6 vsadd.h vr12, vr13, vr18 // t0 t1 vsadd.h vr14, vr15, vr6 // t2 t3 vssub.h vr16, vr13, vr18 // t4 t5 vssub.h vr18, vr15, vr6 // t6 t7 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 vsllwil.w.h vr7, vr16, 0 // t4 vexth.w.h vr8, vr16 // t5 vsllwil.w.h vr10, vr18, 0 // t6 vexth.w.h vr11, vr18 // t7 // vr13 out0 out7 vr17 out1 out6 vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \ vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19 vshuf4i.d vr19, vr19, 0x01 vsadd.h vr13, vr12, vr14 // out0 out7 vssub.h vr16, vr12, vr14 // t2 t3 vsadd.h vr17, vr15, vr19 // out1 out6 vssub.h vr18, vr15, vr19 // t6 t7 vexth.w.h vr20, vr13 // out7 vsllwil.w.h vr21, vr17, 0 // out1 vneg.w vr20, vr20 vneg.w vr21, vr21 vssrarni.h.w vr21, vr20, 0 // out7 out1 vilvl.d vr13, vr21, vr13 // out0 out7 vilvh.d vr17, vr17, vr21 // out1 out6 vsllwil.w.h vr7, vr16, 0 // t2 vexth.w.h vr8, vr16 // t3 vsllwil.w.h vr10, vr18, 0 // t6 vexth.w.h vr11, vr18 // t7 // vr15 out[3] out[4] vr18 out[2] out[5] vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \ vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18 vexth.w.h vr20, vr18 // out5 vsllwil.w.h vr21, vr15, 0 // out3 vneg.w vr20, vr20 vneg.w vr21, vr21 vssrarni.h.w vr21, vr20, 0 // out5 out3 vilvl.d vr18, vr21, vr18 // out2 out5 vilvh.d vr15, vr15, vr21 // out3 out4 .endm function inv_txfm_add_adst_dct_8x4_8bpc_lsx vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 adst8x4_1d_lsx vilvl.h vr4, vr17, vr13 vilvl.h vr5, vr15, vr18 vilvl.w vr0, vr5, vr4 vilvh.w vr1, vr5, vr4 vilvh.h vr4, vr18, vr15 vilvh.h vr5, vr13, vr17 vilvl.w vr2, vr5, vr4 vilvh.w vr3, vr5, vr4 vreplgr2vr.h vr23, zero vst vr23, a2, 0 
vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ vr22, vr15, vr16, vr17, vr18 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 vsrari.h vr18, vr18, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 endfunc function inv_txfm_add_dct_adst_8x4_8bpc_lsx vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 vshuf4i.d vr1, vr1, 0x01 vshuf4i.d vr3, vr3, 0x01 vilvl.h vr4, vr1, vr0 vilvh.h vr5, vr1, vr0 vilvl.h vr0, vr5, vr4 vilvh.h vr1, vr5, vr4 vilvl.h vr4, vr3, vr2 vilvh.h vr5, vr3, vr2 vilvl.h vr2, vr5, vr4 vilvh.h vr3, vr5, vr4 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr0, 0 vexth.w.h vr11, vr0 vsllwil.w.h vr12, vr1, 0 vexth.w.h vr13, vr1 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr2, 0 vexth.w.h vr15, vr2 vsllwil.w.h vr16, vr3, 0 vexth.w.h vr17, vr3 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 endfunc function inv_txfm_add_adst_adst_8x4_8bpc_lsx vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 adst8x4_1d_lsx vilvl.h vr4, vr17, vr13 vilvl.h vr5, vr15, vr18 vilvl.w vr0, vr5, vr4 vilvh.w vr1, vr5, vr4 vilvh.h vr4, vr18, vr15 vilvh.h vr5, vr13, vr17 vilvl.w vr2, vr5, vr4 vilvh.w vr3, vr5, vr4 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr0, 0 vexth.w.h vr11, vr0 vsllwil.w.h vr12, vr1, 0 vexth.w.h vr13, vr1 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr2, 0 vexth.w.h vr15, vr2 vsllwil.w.h vr16, vr3, 0 vexth.w.h vr17, vr3 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 endfunc function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 vld vr1, 
a2, 16 // 8 9 10 11 12 13 14 15 in1 vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 adst8x4_1d_lsx vilvl.h vr20, vr15, vr13 vilvl.h vr21, vr18, vr17 vilvl.w vr0, vr21, vr20 vilvh.w vr1, vr21, vr20 vilvh.h vr20, vr15, vr13 vilvh.h vr21, vr18, vr17 vilvl.w vr2, vr21, vr20 vilvh.w vr3, vr21, vr20 vshuf4i.h vr0, vr0, 0x2d vshuf4i.h vr1, vr1, 0x2d vshuf4i.h vr2, vr2, 0x78 vshuf4i.h vr3, vr3, 0x78 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr2, 0 vexth.w.h vr11, vr2 vsllwil.w.h vr12, vr3, 0 vexth.w.h vr13, vr3 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr0, 0 vexth.w.h vr15, vr0 vsllwil.w.h vr16, vr1, 0 vexth.w.h vr17, vr1 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 endfunc function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 adst8x4_1d_lsx vilvl.h vr4, vr17, vr13 vilvl.h vr5, vr15, vr18 vilvl.w vr0, vr5, vr4 vilvh.w vr1, vr5, vr4 vilvh.h vr4, vr18, vr15 vilvh.h vr5, vr13, vr17 vilvl.w vr2, vr5, vr4 vilvh.w vr3, vr5, vr4 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr0, 0 vexth.w.h vr11, vr0 vsllwil.w.h vr12, vr1, 0 vexth.w.h vr13, vr1 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr2, 0 vexth.w.h vr15, vr2 vsllwil.w.h vr16, vr3, 0 vexth.w.h vr17, vr3 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 endfunc function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 adst8x4_1d_lsx vilvl.h vr20, vr15, vr13 vilvl.h vr21, vr18, vr17 vilvl.w vr0, vr21, vr20 vilvh.w vr1, vr21, vr20 
vilvh.h vr20, vr15, vr13 vilvh.h vr21, vr18, vr17 vilvl.w vr2, vr21, vr20 vilvh.w vr3, vr21, vr20 vshuf4i.h vr0, vr0, 0x2d vshuf4i.h vr1, vr1, 0x2d vshuf4i.h vr2, vr2, 0x78 vshuf4i.h vr3, vr3, 0x78 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \ vr22, vr15, vr16, vr17, vr18 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 vsrari.h vr18, vr18, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 endfunc function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx la.local t0, idct_coeffs vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 vshuf4i.d vr1, vr1, 0x01 vshuf4i.d vr3, vr3, 0x01 vilvl.h vr4, vr1, vr0 vilvh.h vr5, vr1, vr0 vilvl.h vr0, vr5, vr4 vilvh.h vr1, vr5, vr4 vilvl.h vr4, vr3, vr2 vilvh.h vr5, vr3, vr2 vilvl.h vr2, vr5, vr4 vilvh.h vr3, vr5, vr4 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr0, 0 // in0 vexth.w.h vr11, vr0 // in1 vsllwil.w.h vr12, vr1, 0 // in2 vexth.w.h vr13, vr1 // in3 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr2, 0 vexth.w.h vr15, vr2 vsllwil.w.h vr16, vr3, 0 vexth.w.h vr17, vr3 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 endfunc function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 adst8x4_1d_lsx vilvl.h vr20, vr15, vr13 vilvl.h vr21, vr18, vr17 vilvl.w vr0, vr21, vr20 vilvh.w vr1, vr21, vr20 vilvh.h vr20, vr15, vr13 vilvh.h vr21, vr18, vr17 vilvl.w vr2, vr21, vr20 vilvh.w vr3, vr21, vr20 vshuf4i.h vr0, vr0, 0x2d vshuf4i.h vr1, vr1, 0x2d vshuf4i.h vr2, vr2, 0x78 vshuf4i.h vr3, vr3, 0x78 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr2, 0 // in0 vexth.w.h vr11, vr2 // in1 vsllwil.w.h vr12, vr3, 0 // in2 vexth.w.h vr13, vr3 // in3 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr0, 0 vexth.w.h vr15, vr0 vsllwil.w.h vr16, vr1, 0 vexth.w.h vr17, 
vr1 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 endfunc function inv_txfm_add_dct_identity_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 vshuf4i.d vr1, vr1, 0x01 vshuf4i.d vr3, vr3, 0x01 vilvl.h vr4, vr1, vr0 vilvh.h vr5, vr1, vr0 vilvl.h vr0, vr5, vr4 vilvh.h vr1, vr5, vr4 vilvl.h vr4, vr3, vr2 vilvh.h vr5, vr3, vr2 vilvl.h vr2, vr5, vr4 vilvh.h vr3, vr5, vr4 vilvl.d vr14, vr2, vr0 vilvh.d vr15, vr2, vr0 vilvl.d vr16, vr3, vr1 vilvh.d vr17, vr3, vr1 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 endfunc function inv_txfm_add_identity_dct_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ vr19, vr7, vr9, vr11 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vilvl.h vr4, vr7, vr19 vilvh.h vr5, vr7, vr19 vilvl.h vr0, vr5, vr4 vilvh.h vr1, vr5, vr4 vilvl.h vr4, vr11, vr9 vilvh.h vr5, vr11, vr9 vilvl.h vr2, vr5, vr4 vilvh.h vr3, vr5, vr4 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ vr22, vr15, vr16, vr17, vr18 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 vsrari.h vr18, vr18, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 endfunc function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 adst8x4_1d_lsx vilvl.h vr20, vr15, vr13 vilvl.h vr21, vr18, vr17 vilvl.w vr0, vr21, vr20 vilvh.w vr1, vr21, vr20 vilvh.h vr20, vr15, vr13 vilvh.h vr21, vr18, vr17 vilvl.w vr2, vr21, vr20 vilvh.w vr3, vr21, vr20 vshuf4i.h vr0, vr0, 0x2d vshuf4i.h vr1, vr1, 0x2d vshuf4i.h vr2, vr2, 0x78 
vshuf4i.h vr3, vr3, 0x78 vilvl.d vr14, vr0, vr2 // in0 vilvh.d vr15, vr0, vr2 // in1 vilvl.d vr16, vr1, vr3 // in2 vilvh.d vr17, vr1, vr3 // in3 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 endfunc function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ vr19, vr7, vr9, vr11 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vilvl.h vr4, vr7, vr19 vilvh.h vr5, vr7, vr19 vilvl.h vr0, vr5, vr4 vilvh.h vr1, vr5, vr4 vilvl.h vr4, vr11, vr9 vilvh.h vr5, vr11, vr9 vilvl.h vr2, vr5, vr4 vilvh.h vr3, vr5, vr4 la.local t0, iadst4_coeffs vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 vldrepl.w vr20, t0, 0 // 1321 vldrepl.w vr21, t0, 4 // 3803 vldrepl.w vr22, t0, 8 // 2482 vldrepl.w vr23, t0, 12 // 3344 vsllwil.w.h vr10, vr0, 0 // in0 vexth.w.h vr11, vr0 // in1 vsllwil.w.h vr12, vr1, 0 // in2 vexth.w.h vr13, vr1 // in3 adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 vsllwil.w.h vr14, vr2, 0 vexth.w.h vr15, vr2 vsllwil.w.h vr16, vr3, 0 vexth.w.h vr17, vr3 adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 vssrarni.h.w vr14, vr10, 12 vssrarni.h.w vr15, vr11, 12 vssrarni.h.w vr16, vr12, 12 vssrarni.h.w vr17, vr13, 12 vsrari.h vr14, vr14, 4 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 endfunc function inv_txfm_add_adst_identity_8x4_8bpc_lsx vld vr0, a2, 0 // in0 vld vr1, a2, 16 // in1 vld vr2, a2, 32 // in2 vld vr3, a2, 48 // in3 la.local t0, idct_coeffs vldrepl.w vr20, t0, 0 // 2896 rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 adst8x4_1d_lsx vilvl.h vr4, vr17, vr13 vilvl.h vr5, vr15, vr18 vilvl.w vr14, vr5, vr4 // in0 in1 vilvh.w vr16, vr5, vr4 // in2 in3 vilvh.h vr4, vr18, vr15 vilvh.h vr5, vr13, vr17 vilvl.w vr17, vr5, vr4 vilvh.w vr18, vr5, vr4 vilvl.d vr10, vr17, vr14 // in0 vilvh.d vr11, vr17, vr14 // in1 vilvl.d vr12, vr18, vr16 // in2 vilvh.d vr13, vr18, vr16 // in3 vreplgr2vr.h vr23, zero vst vr23, a2, 0 vst vr23, a2, 16 vst vr23, a2, 32 vst vr23, a2, 48 li.w t0, 1697 vreplgr2vr.w vr20, t0 identity_4x4_lsx vr10, vr10, vr20, vr10, vr15 identity_4x4_lsx vr11, vr11, vr20, vr11, vr16 identity_4x4_lsx vr12, vr12, vr20, vr12, vr17 identity_4x4_lsx vr13, vr13, vr20, vr13, vr18 vsrari.h vr15, vr15, 4 vsrari.h vr16, vr16, 4 vsrari.h vr17, vr17, 4 vsrari.h vr18, vr18, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 endfunc function 
inv_txfm_add_identity_adst_8x4_8bpc_lsx
    vld vr0, a2, 0   // in0
    vld vr1, a2, 16  // in1
    vld vr2, a2, 32  // in2
    vld vr3, a2, 48  // in3
    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 0 // 2896
    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7   // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9   // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31
    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr0, vr1, vr2, vr3
    vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15
    vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15
    vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14
    vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15
    vilvl.h vr2, vr5, vr4 // 0 1 2 3 4 5 6 7
    vilvh.h vr3, vr5, vr4 // 8 9 10 11 12 13 14 15
    vreplgr2vr.h vr23, zero
    vst vr23, a2, 0
    vst vr23, a2, 16
    vst vr23, a2, 32
    vst vr23, a2, 48
    la.local t0, iadst4_coeffs
    vldrepl.w vr20, t0, 0  // 1321
    vldrepl.w vr21, t0, 4  // 3803
    vldrepl.w vr22, t0, 8  // 2482
    vldrepl.w vr23, t0, 12 // 3344
    vsllwil.w.h vr10, vr0, 0
    vexth.w.h vr11, vr0
    vsllwil.w.h vr12, vr1, 0
    vexth.w.h vr13, vr1
    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
    vsllwil.w.h vr14, vr2, 0
    vexth.w.h vr15, vr2
    vsllwil.w.h vr16, vr3, 0
    vexth.w.h vr17, vr3
    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17
    vssrarni.h.w vr14, vr10, 12
    vssrarni.h.w vr15, vr11, 12
    vssrarni.h.w vr16, vr12, 12
    vssrarni.h.w vr17, vr13, 12
    vsrari.h vr14, vr14, 4
    vsrari.h vr15, vr15, 4
    vsrari.h vr16, vr16, 4
    vsrari.h vr17, vr17, 4
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_identity_identity_8x8_8bpc_lsx
    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15

    // identity8
    vsllwil.w.h vr6, vr0, 1
    vsllwil.w.h vr7, vr1, 1
    vsllwil.w.h vr8, vr2, 1
    vsllwil.w.h vr9, vr3, 1
    vsllwil.w.h vr10, vr4, 1
    vsllwil.w.h vr11, vr5, 1
    vsllwil.w.h vr12, vr14, 1
    vsllwil.w.h vr13, vr15, 1
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
    vexth.w.h \i, \i
.endr
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
    vslli.w \i, \i, 1
.endr
    vssrarni.h.w vr0, vr6, 1   // in0
    vssrarni.h.w vr1, vr7, 1   // in1
    vssrarni.h.w vr2, vr8, 1   // in2
    vssrarni.h.w vr3, vr9, 1   // in3
    vssrarni.h.w vr4, vr10, 1  // in4
    vssrarni.h.w vr5, vr11, 1  // in5
    vssrarni.h.w vr14, vr12, 1 // in6
    vssrarni.h.w vr15, vr13, 1 // in7
    vreplgr2vr.h vr23, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst vr23, a2, \i
.endr
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13
    vsllwil.w.h vr6, vr16, 1
    vsllwil.w.h vr7, vr17, 1
    vsllwil.w.h vr8, vr18, 1
    vsllwil.w.h vr9, vr19, 1
    vsllwil.w.h vr10, vr20, 1
    vsllwil.w.h vr11, vr21, 1
    vsllwil.w.h vr12, vr22, 1
    vsllwil.w.h vr13, vr23, 1
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vexth.w.h \i, \i
.endr
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vslli.w \i, \i, 1
.endr
    vssrarni.h.w vr16, vr6, 4  // in0
    vssrarni.h.w vr17, vr7, 4  // in1
    vssrarni.h.w vr18, vr8, 4  // in2
    vssrarni.h.w vr19, vr9, 4  // in3
    vssrarni.h.w vr20, vr10, 4 // in4
    vssrarni.h.w vr21, vr11, 4 // in5
    vssrarni.h.w vr22, vr12, 4 // in6
    vssrarni.h.w vr23, vr13, 4 // in7
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
endfunc

.macro adst8x8_1d_lsx out0, out1, out2, out3
    la.local t0, iadst8_coeffs
    vldrepl.w vr20, t0, 0 // 4076
    vldrepl.w vr21, t0, 4 // 401
    vldrepl.w vr22, t0, 8 // 3612
vldrepl.w vr23, t0, 12 // 1931 // vr13 t0a t1a vr15 t2a t3a vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \ vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15 vldrepl.w vr20, t0, 16 // 2598 vldrepl.w vr21, t0, 20 // 3166 vldrepl.w vr22, t0, 24 // 1189 vldrepl.w vr23, t0, 28 // 3920 // vr18 t4a t5a vr6 t6a t7a vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \ vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6 vsadd.h vr12, vr13, vr18 // t0 t1 vsadd.h vr14, vr15, vr6 // t2 t3 vssub.h vr9, vr13, vr18 // t4 t5 vssub.h vr18, vr15, vr6 // t6 t7 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 vsllwil.w.h vr7, vr9, 0 // t4 vexth.w.h vr8, vr9 // t5 vsllwil.w.h vr10, vr18, 0 // t6 vexth.w.h vr11, vr18 // t7 // vr13 out0 out7 vr17 out1 out6 vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \ vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19 vshuf4i.d vr19, vr19, 0x01 vsadd.h vr13, vr12, vr14 // out0 out7 vssub.h vr6, vr12, vr14 // t2 t3 vsadd.h vr7, vr15, vr19 // out1 out6 vssub.h vr18, vr15, vr19 // t6 t7 vexth.w.h vr20, vr13 // out7 vsllwil.w.h vr21, vr7, 0 // out1 vneg.w vr20, vr20 vneg.w vr21, vr21 vssrarni.h.w vr21, vr20, 0 // out7 out1 vilvl.d \out0, vr21, vr13 // out0 out7 vilvh.d \out1, vr7, vr21 // out1 out6 vsllwil.w.h vr7, vr6, 0 // t2 vexth.w.h vr8, vr6 // t3 vsllwil.w.h vr10, vr18, 0 // t6 vexth.w.h vr11, vr18 // t7 // vr15 out[3] out[4] vr18 out[2] out[5] vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \ vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18 vexth.w.h vr20, vr18 // out5 vsllwil.w.h vr21, vr15, 0 // out3 vneg.w vr20, vr20 vneg.w vr21, vr21 vssrarni.h.w vr21, vr20, 0 // out5 out3 vilvl.d \out2, vr21, vr18 // out2 out5 vilvh.d \out3, vr15, vr21 // out3 out4 .endm function inv_txfm_add_adst_dct_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr24, vr25, vr26, vr27 vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr16 vexth.w.h vr11, vr17 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 vsrari.h \i, \i, 1 .endr LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 vshuf4i.h vr14, vr14, 0x1b vshuf4i.h vr15, vr15, 0x1b vshuf4i.h vr24, vr24, 0x1b vshuf4i.h vr25, vr25, 0x1b vsllwil.w.h vr18, vr4, 0 vsllwil.w.h vr19, vr5, 0 vsllwil.w.h vr6, vr12, 0 vsllwil.w.h vr7, vr13, 0 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr12 vexth.w.h vr11, vr13 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr4, vr5, vr12, vr13 vshuf4i.d vr5, vr5, 0x01 vshuf4i.d vr13, vr13, 0x01 vsllwil.w.h vr18, vr14, 0 vsllwil.w.h vr19, vr15, 0 vsllwil.w.h vr6, vr24, 0 vsllwil.w.h vr7, vr25, 0 vexth.w.h vr8, vr14 vexth.w.h vr9, vr15 vexth.w.h vr10, vr24 vexth.w.h vr11, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 
3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr14, vr15, vr24, vr25 vshuf4i.d vr15, vr15, 0x01 vshuf4i.d vr25, vr25, 0x01 vilvl.d vr20, vr14, vr4 vilvh.d vr21, vr14, vr4 vilvl.d vr22, vr15, vr5 vilvh.d vr23, vr15, vr5 vilvl.d vr16, vr24, vr12 vilvh.d vr17, vr24, vr12 vilvl.d vr18, vr25, vr13 vilvh.d vr19, vr25, vr13 .irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_dct_adst_8x8_8bpc_lsx addi.d sp, sp, -48 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 vsllwil.w.h vr18, vr4, 0 vsllwil.w.h vr19, vr5, 0 vsllwil.w.h vr6, vr12, 0 vsllwil.w.h vr7, vr13, 0 vsllwil.w.h vr8, vr14, 0 vsllwil.w.h vr9, vr15, 0 vsllwil.w.h vr10, vr24, 0 vsllwil.w.h vr11, vr25, 0 dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 vshuf4i.d vr27, vr27, 0x01 vshuf4i.d vr29, vr29, 0x01 vilvl.h vr8, vr27, vr26 // 0 2 4 6 8 10 12 14 vilvh.h vr9, vr27, vr26 // 1 3 5 7 9 11 13 15 vilvl.h vr26, vr9, vr8 // 0 - 7 in0 vilvh.h vr27, vr9, vr8 // 8 - 15 in1 vilvl.h vr8, vr29, vr28 // 0 2 4 6 8 10 12 14 vilvh.h vr9, vr29, vr28 // 1 3 5 7 9 11 13 15 vilvl.h vr28, vr9, vr8 // 16 - 23 in2 vilvh.h vr29, vr9, vr8 // 24 - 31 in3 vsrari.h vr26, vr26, 1 // in0low in1low vsrari.h vr27, vr27, 1 // in2low in3low vsrari.h vr28, vr28, 1 // in0high in1high vsrari.h vr29, vr29, 1 // in2high in3high vexth.w.h vr18, vr4 vexth.w.h vr19, vr5 vexth.w.h vr6, vr12 vexth.w.h vr7, vr13 vexth.w.h vr8, vr14 vexth.w.h vr9, vr15 vexth.w.h vr10, vr24 vexth.w.h vr11, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 vshuf4i.d vr13, vr13, 0x01 vshuf4i.d vr15, vr15, 0x01 vilvl.h vr8, vr13, vr12 // 0 2 4 6 8 10 12 14 vilvh.h vr9, vr13, vr12 // 1 3 5 7 9 11 13 15 vilvl.h vr12, vr9, vr8 // 0 - 7 in0 vilvh.h vr13, vr9, vr8 // 8 - 15 in1 vilvl.h vr8, vr15, vr14 // 0 2 4 6 8 10 12 14 vilvh.h vr9, vr15, vr14 // 1 3 5 7 9 11 13 15 vilvl.h vr14, vr9, vr8 // 16 - 23 in2 vilvh.h vr15, vr9, vr8 // 24 - 31 in3 vsrari.h vr0, vr12, 1 // in4low in5low vsrari.h vr1, vr13, 1 // in6low in7low vsrari.h vr2, vr14, 1 // in4high in5high vsrari.h vr3, vr15, 1 // in6high in7high vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr vsllwil.w.h vr18, vr26, 0 // in0 vexth.w.h vr19, vr26 // in1 vsllwil.w.h vr6, vr27, 0 // in2 vexth.w.h vr7, vr27 // in3 vsllwil.w.h vr8, vr0, 0 // in3 vexth.w.h vr9, vr0 // in4 vsllwil.w.h vr10, vr1, 0 // in5 vexth.w.h vr11, vr1 // in6 adst8x8_1d_lsx vr26, vr27, vr0, vr1 vsllwil.w.h vr18, vr28, 0 // in0 vexth.w.h vr19, vr28 // in1 vsllwil.w.h vr6, vr29, 0 // in2 vexth.w.h vr7, vr29 // in3 vsllwil.w.h vr8, vr2, 0 // in4 vexth.w.h vr9, vr2 // in5 vsllwil.w.h vr10, vr3, 0 // in6 vexth.w.h vr11, vr3 // in7 adst8x8_1d_lsx vr28, vr29, vr16, vr17 vilvl.d vr4, vr28, vr26 // 0 ... 7 vilvl.d vr5, vr29, vr27 // 8 ... 15 vilvl.d vr6, vr16, vr0 // 16 ... 23 vilvl.d vr7, vr17, vr1 // 24 ... 31 vilvh.d vr14, vr17, vr1 // 32 ... 39 vilvh.d vr15, vr16, vr0 // 40 ... 47 vilvh.d vr16, vr29, vr27 // 48 ... 
55 vilvh.d vr17, vr28, vr26 // 56 ... 63 .irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 addi.d sp, sp, 48 endfunc function inv_txfm_add_adst_adst_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr24, vr25, vr26, vr27 vexth.w.h vr18, vr0 // in0 vexth.w.h vr19, vr1 // in1 vexth.w.h vr6, vr2 // in2 vexth.w.h vr7, vr3 // in3 vexth.w.h vr8, vr4 // in3 vexth.w.h vr9, vr5 // in4 vexth.w.h vr10, vr16 // in5 vexth.w.h vr11, vr17 // in6 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 vsrari.h \i, \i, 1 .endr LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 vshuf4i.h vr4, vr4, 0x1b vshuf4i.h vr5, vr5, 0x1b vshuf4i.h vr24, vr24, 0x1b vshuf4i.h vr25, vr25, 0x1b vsllwil.w.h vr18, vr14, 0 vsllwil.w.h vr19, vr15, 0 vsllwil.w.h vr6, vr12, 0 vsllwil.w.h vr7, vr13, 0 vexth.w.h vr8, vr14 // in3 vexth.w.h vr9, vr15 // in4 vexth.w.h vr10, vr12 // in5 vexth.w.h vr11, vr13 // in6 adst8x8_1d_lsx vr26, vr27, vr0, vr1 vsllwil.w.h vr18, vr4, 0 vsllwil.w.h vr19, vr5, 0 vsllwil.w.h vr6, vr24, 0 vsllwil.w.h vr7, vr25, 0 vexth.w.h vr8, vr4 // in3 vexth.w.h vr9, vr5 // in4 vexth.w.h vr10, vr24 // in5 vexth.w.h vr11, vr25 // in6 adst8x8_1d_lsx vr24, vr25, vr16, vr17 vilvl.d vr4, vr24, vr26 // 0 ... 7 vilvl.d vr5, vr25, vr27 // 8 ... 15 vilvl.d vr6, vr16, vr0 // 16 ... 23 vilvl.d vr7, vr17, vr1 // 24 ... 31 vilvh.d vr14, vr17, vr1 // 32 ... 39 vilvh.d vr15, vr16, vr0 // 40 ... 47 vilvh.d vr16, vr25, vr27 // 48 ... 55 vilvh.d vr17, vr24, vr26 // 56 ... 
63 .irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr24, vr20, vr21 vilvh.w vr25, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr26, vr20, vr21 vilvh.w vr27, vr20, vr21 vshuf4i.h vr26, vr26, 0x1b vshuf4i.h vr27, vr27, 0x1b vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr16 vexth.w.h vr11, vr17 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr0, vr20, vr21 vilvh.w vr1, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr2, vr20, vr21 vilvh.w vr3, vr20, vr21 vshuf4i.h vr2, vr2, 0x1b vshuf4i.h vr3, vr3, 0x1b vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 vsrari.h \i, \i, 1 .endr vsllwil.w.h vr18, vr26, 0 // in0 vexth.w.h vr19, vr26 // in1 vsllwil.w.h vr6, vr27, 0 // in2 vexth.w.h vr7, vr27 // in3 vsllwil.w.h vr8, vr2, 0 // in4 vexth.w.h vr9, vr2 // in5 vsllwil.w.h vr10, vr3, 0 // in6 vexth.w.h vr11, vr3 // in7 adst8x8_1d_lsx vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr24, 0 // in0 vexth.w.h vr19, vr24 // in1 vsllwil.w.h vr6, vr25, 0 // in2 vexth.w.h vr7, vr25 // in3 vsllwil.w.h vr8, vr0, 0 // in4 vexth.w.h vr9, vr0 // in5 vsllwil.w.h vr10, vr1, 0 // in6 vexth.w.h vr11, vr1 // in7 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vilvl.d vr20, vr0, vr4 // 0 ... 7 vilvl.d vr21, vr1, vr5 // 8 ... 15 vilvl.d vr22, vr2, vr16 // 16 ... 23 vilvl.d vr23, vr3, vr17 // 24 ... 31 vilvh.d vr14, vr3, vr17 // 32 ... 39 vilvh.d vr15, vr2, vr16 // 40 ... 47 vilvh.d vr16, vr1, vr5 // 48 ... 55 vilvh.d vr17, vr0, vr4 // 56 ... 
63 .irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr24, vr25, vr26, vr27 vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr16 vexth.w.h vr11, vr17 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 vsrari.h \i, \i, 1 .endr LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 vshuf4i.h vr0, vr0, 0x1b vshuf4i.h vr1, vr1, 0x1b vshuf4i.h vr2, vr2, 0x1b vshuf4i.h vr3, vr3, 0x1b vsllwil.w.h vr18, vr0, 0 // in0 vsllwil.w.h vr19, vr1, 0 // in1 vsllwil.w.h vr6, vr2, 0 // in2 vsllwil.w.h vr7, vr3, 0 // in3 vexth.w.h vr8, vr0 // in4 vexth.w.h vr9, vr1 // in5 vexth.w.h vr10, vr2 // in6 vexth.w.h vr11, vr3 // in7 adst8x8_1d_lsx vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr24, 0 // in0 vsllwil.w.h vr19, vr25, 0 // in1 vsllwil.w.h vr6, vr26, 0 // in2 vsllwil.w.h vr7, vr27, 0 // in3 vexth.w.h vr8, vr24 // in4 vexth.w.h vr9, vr25 // in5 vexth.w.h vr10, vr26 // in6 vexth.w.h vr11, vr27 // in7 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vilvh.d vr20, vr4, vr0 vilvh.d vr21, vr5, vr1 vilvh.d vr22, vr16, vr2 vilvh.d vr23, vr17, vr3 vilvl.d vr14, vr17, vr3 vilvl.d vr15, vr16, vr2 vilvl.d vr18, vr5, vr1 vilvl.d vr19, vr4, vr0 .irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr18, vr19 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr24, vr20, vr21 vilvh.w vr25, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr26, vr20, vr21 vilvh.w vr27, vr20, vr21 vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr16 vexth.w.h vr11, vr17 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr0, vr20, vr21 vilvh.w vr1, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr2, vr20, vr21 vilvh.w vr3, vr20, vr21 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr vsrari.h vr24, vr24, 1 vsrari.h vr25, 
vr25, 1 vsrari.h vr26, vr26, 1 vsrari.h vr27, vr27, 1 vsrari.h vr14, vr0, 1 vsrari.h vr15, vr1, 1 vsrari.h vr16, vr2, 1 vsrari.h vr17, vr3, 1 vsllwil.w.h vr18, vr26, 0 vexth.w.h vr19, vr26 vsllwil.w.h vr6, vr27, 0 vexth.w.h vr7, vr27 vsllwil.w.h vr8, vr16, 0 vexth.w.h vr9, vr16 vsllwil.w.h vr10, vr17, 0 vexth.w.h vr11, vr17 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr26, vr27, vr16, vr17 vshuf4i.h vr26, vr26, 0x1b vshuf4i.h vr27, vr27, 0x1b vshuf4i.h vr16, vr16, 0x1b vshuf4i.h vr17, vr17, 0x1b vsllwil.w.h vr18, vr24, 0 vexth.w.h vr19, vr24 vsllwil.w.h vr6, vr25, 0 vexth.w.h vr7, vr25 vsllwil.w.h vr8, vr14, 0 vexth.w.h vr9, vr14 vsllwil.w.h vr10, vr15, 0 vexth.w.h vr11, vr15 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr24, vr25, vr14, vr15 vilvl.d vr4, vr24, vr26 vilvh.d vr5, vr24, vr26 vilvh.d vr6, vr25, vr27 vilvl.d vr7, vr25, vr27 vilvl.d vr24, vr14, vr16 vilvh.d vr25, vr14, vr16 vilvh.d vr26, vr15, vr17 vilvl.d vr27, vr15, vr17 .irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr24, vr25, vr26, vr27 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx addi.d sp, sp, -48 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 vsllwil.w.h vr18, vr4, 0 vsllwil.w.h vr19, vr5, 0 vsllwil.w.h vr6, vr12, 0 vsllwil.w.h vr7, vr13, 0 vsllwil.w.h vr8, vr14, 0 vsllwil.w.h vr9, vr15, 0 vsllwil.w.h vr10, vr24, 0 vsllwil.w.h vr11, vr25, 0 dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 vshuf4i.d vr27, vr27, 0x01 vshuf4i.d vr29, vr29, 0x01 vilvl.h vr8, vr27, vr26 vilvh.h vr9, vr27, vr26 vilvl.h vr26, vr9, vr8 vilvh.h vr27, vr9, vr8 vilvl.h vr8, vr29, vr28 vilvh.h vr9, vr29, vr28 vilvl.h vr28, vr9, vr8 vilvh.h vr29, vr9, vr8 vsrari.h vr26, vr26, 1 // in0low in1low vsrari.h vr27, vr27, 1 // in2low in3low vsrari.h vr28, vr28, 1 // in0high in1high vsrari.h vr29, vr29, 1 // in2high in3high vexth.w.h vr18, vr4 vexth.w.h vr19, vr5 vexth.w.h vr6, vr12 vexth.w.h vr7, vr13 vexth.w.h vr8, vr14 vexth.w.h vr9, vr15 vexth.w.h vr10, vr24 vexth.w.h vr11, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 vshuf4i.d vr13, vr13, 0x01 vshuf4i.d vr15, vr15, 0x01 vilvl.h vr8, vr13, vr12 vilvh.h vr9, vr13, vr12 vilvl.h vr12, vr9, vr8 vilvh.h vr13, vr9, vr8 vilvl.h vr8, vr15, vr14 vilvh.h vr9, vr15, vr14 vilvl.h vr14, vr9, vr8 vilvh.h vr15, vr9, vr8 vsrari.h vr0, vr12, 1 vsrari.h vr1, vr13, 1 vsrari.h vr2, vr14, 1 vsrari.h vr3, vr15, 1 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr vsllwil.w.h vr18, vr28, 0 // in0 vexth.w.h vr19, vr28 // in1 vsllwil.w.h vr6, vr29, 0 // in2 vexth.w.h vr7, vr29 // in3 vsllwil.w.h vr8, vr2, 0 // in4 vexth.w.h vr9, vr2 // in5 vsllwil.w.h vr10, vr3, 0 // in6 vexth.w.h vr11, vr3 // in7 adst8x8_1d_lsx vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr26, 0 // in0 vexth.w.h vr19, vr26 // in1 vsllwil.w.h vr6, vr27, 0 // in2 vexth.w.h vr7, 
vr27 // in3 vsllwil.w.h vr8, vr0, 0 // in4 vexth.w.h vr9, vr0 // in5 vsllwil.w.h vr10, vr1, 0 // in6 vexth.w.h vr11, vr1 // in7 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vilvh.d vr26, vr4, vr0 vilvh.d vr27, vr5, vr1 vilvh.d vr28, vr16, vr2 vilvh.d vr29, vr17, vr3 vilvl.d vr20, vr17, vr3 vilvl.d vr21, vr16, vr2 vilvl.d vr22, vr5, vr1 vilvl.d vr23, vr4, vr0 .irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr26, vr27, vr28, vr29 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 addi.d sp, sp, 48 endfunc function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr24, vr20, vr21 vilvh.w vr25, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr26, vr20, vr21 vilvh.w vr27, vr20, vr21 vshuf4i.h vr26, vr26, 0x1b vshuf4i.h vr27, vr27, 0x1b vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr16 vexth.w.h vr11, vr17 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr0, vr20, vr21 vilvh.w vr1, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr2, vr20, vr21 vilvh.w vr3, vr20, vr21 vshuf4i.h vr2, vr2, 0x1b vshuf4i.h vr3, vr3, 0x1b .irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 vsrari.h \i, \i, 1 .endr vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr vsllwil.w.h vr18, vr26, 0 // in0 vexth.w.h vr19, vr26 // in1 vsllwil.w.h vr6, vr27, 0 // in2 vexth.w.h vr7, vr27 // in3 vsllwil.w.h vr8, vr2, 0 // in4 vexth.w.h vr9, vr2 // in5 vsllwil.w.h vr10, vr3, 0 // in6 vexth.w.h vr11, vr3 // in7 adst8x8_1d_lsx vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr24, 0 // in0 vexth.w.h vr19, vr24 // in1 vsllwil.w.h vr6, vr25, 0 // in2 vexth.w.h vr7, vr25 // in3 vsllwil.w.h vr8, vr0, 0 // in4 vexth.w.h vr9, vr0 // in5 vsllwil.w.h vr10, vr1, 0 // in6 vexth.w.h vr11, vr1 // in7 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vilvh.d vr24, vr0, vr4 vilvh.d vr25, vr1, vr5 vilvh.d vr26, vr2, vr16 vilvh.d vr27, vr3, vr17 vilvl.d vr20, vr3, vr17 vilvl.d vr21, vr2, vr16 vilvl.d vr22, vr1, vr5 vilvl.d vr23, vr0, vr4 .irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr24, vr25, vr26, vr27 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_dct_identity_8x8_8bpc_lsx addi.d sp, sp, -48 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 vsllwil.w.h vr18, vr4, 0 vsllwil.w.h vr19, vr5, 0 vsllwil.w.h vr6, vr12, 0 vsllwil.w.h vr7, vr13, 0 vsllwil.w.h vr8, vr14, 0 vsllwil.w.h vr9, vr15, 0 vsllwil.w.h 
vr10, vr24, 0 vsllwil.w.h vr11, vr25, 0 dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 vshuf4i.d vr27, vr27, 0x01 vshuf4i.d vr29, vr29, 0x01 vilvl.h vr8, vr27, vr26 vilvh.h vr9, vr27, vr26 vilvl.h vr26, vr9, vr8 vilvh.h vr27, vr9, vr8 vilvl.h vr8, vr29, vr28 vilvh.h vr9, vr29, vr28 vilvl.h vr28, vr9, vr8 vilvh.h vr29, vr9, vr8 vsrari.h vr26, vr26, 1 // in0low in1low vsrari.h vr27, vr27, 1 // in2low in3low vsrari.h vr28, vr28, 1 // in0high in1high vsrari.h vr29, vr29, 1 // in2high in3high vexth.w.h vr18, vr4 vexth.w.h vr19, vr5 vexth.w.h vr6, vr12 vexth.w.h vr7, vr13 vexth.w.h vr8, vr14 vexth.w.h vr9, vr15 vexth.w.h vr10, vr24 vexth.w.h vr11, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 vshuf4i.d vr13, vr13, 0x01 vshuf4i.d vr15, vr15, 0x01 vilvl.h vr8, vr13, vr12 vilvh.h vr9, vr13, vr12 vilvl.h vr12, vr9, vr8 vilvh.h vr13, vr9, vr8 vilvl.h vr8, vr15, vr14 vilvh.h vr9, vr15, vr14 vilvl.h vr14, vr9, vr8 vilvh.h vr15, vr9, vr8 vsrari.h vr20, vr12, 1 vsrari.h vr21, vr13, 1 vsrari.h vr22, vr14, 1 vsrari.h vr23, vr15, 1 vreplgr2vr.h vr19, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr19, a2, \i .endr // identity8 vsllwil.w.h vr10, vr26, 1 vsllwil.w.h vr11, vr27, 1 vsllwil.w.h vr16, vr28, 1 vsllwil.w.h vr17, vr29, 1 vsllwil.w.h vr6, vr20, 1 vsllwil.w.h vr7, vr21, 1 vsllwil.w.h vr18, vr22, 1 vsllwil.w.h vr19, vr23, 1 .irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 vexth.w.h \i, \i .endr .irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 vslli.w \i, \i, 1 .endr vssrarni.h.w vr16, vr10, 4 // in0 vssrarni.h.w vr28, vr26, 4 // in1 vssrarni.h.w vr17, vr11, 4 // in2 vssrarni.h.w vr29, vr27, 4 // in3 vssrarni.h.w vr18, vr6, 4 // in4 vssrarni.h.w vr22, vr20, 4 // in5 vssrarni.h.w vr19, vr7, 4 // in6 vssrarni.h.w vr23, vr21, 4 // in7 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr28, vr17, vr29 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr18, vr22, vr19, vr23 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 addi.d sp, sp, 48 endfunc function inv_txfm_add_identity_dct_8x8_8bpc_lsx addi.d sp, sp, -48 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 // identity8 vsllwil.w.h vr6, vr0, 1 vsllwil.w.h vr7, vr1, 1 vsllwil.w.h vr8, vr2, 1 vsllwil.w.h vr9, vr3, 1 vsllwil.w.h vr10, vr4, 1 vsllwil.w.h vr11, vr5, 1 vsllwil.w.h vr12, vr24, 1 vsllwil.w.h vr13, vr25, 1 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 vexth.w.h \i, \i .endr .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 vslli.w \i, \i, 1 .endr vssrarni.h.w vr0, vr6, 1 // in0 vssrarni.h.w vr1, vr7, 1 // in1 vssrarni.h.w vr2, vr8, 1 // in2 vssrarni.h.w vr3, vr9, 1 // in3 vssrarni.h.w vr4, vr10, 1 // in4 vssrarni.h.w vr5, vr11, 1 // in5 vssrarni.h.w vr24, vr12, 1 // in6 vssrarni.h.w vr25, vr13, 1 // in7 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 // dct4 in0 in2 in4 in6 vsllwil.w.h vr18, vr4, 0 vsllwil.w.h vr19, vr5, 0 vsllwil.w.h vr6, vr12, 0 vsllwil.w.h vr7, vr13, 0 vsllwil.w.h vr8, vr14, 0 vsllwil.w.h vr9, vr15, 0 vsllwil.w.h vr10, vr24, 0 
vsllwil.w.h vr11, vr25, 0 dct_8x4_core_lsx1 vr16, vr17, vr26, vr27 vexth.w.h vr18, vr4 vexth.w.h vr19, vr5 vexth.w.h vr6, vr12 vexth.w.h vr7, vr13 vexth.w.h vr8, vr14 vexth.w.h vr9, vr15 vexth.w.h vr10, vr24 vexth.w.h vr11, vr25 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vldrepl.w vr22, t0, 0 // 2896 dct_8x4_core_lsx1 vr4, vr5, vr24, vr25 vilvl.d vr8, vr4, vr16 vilvh.d vr9, vr4, vr16 vilvh.d vr6, vr5, vr17 vilvl.d vr7, vr5, vr17 vilvl.d vr16, vr24, vr26 vilvh.d vr17, vr24, vr26 vilvh.d vr18, vr25, vr27 vilvl.d vr19, vr25, vr27 .irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr8, vr9, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 addi.d sp, sp, 48 endfunc function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr24, vr20, vr21 vilvh.w vr25, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr26, vr20, vr21 vilvh.w vr27, vr20, vr21 vshuf4i.h vr26, vr26, 0x1b vshuf4i.h vr27, vr27, 0x1b vexth.w.h vr18, vr0 // in0 vexth.w.h vr19, vr1 // in1 vexth.w.h vr6, vr2 // in2 vexth.w.h vr7, vr3 // in3 vexth.w.h vr8, vr4 // in3 vexth.w.h vr9, vr5 // in4 vexth.w.h vr10, vr16 // in5 vexth.w.h vr11, vr17 // in6 adst8x8_1d_lsx vr12, vr13, vr14, vr15 vilvl.h vr20, vr12, vr13 vilvl.h vr21, vr14, vr15 vilvl.w vr16, vr20, vr21 vilvh.w vr17, vr20, vr21 vilvh.h vr20, vr12, vr13 vilvh.h vr21, vr14, vr15 vilvl.w vr18, vr20, vr21 vilvh.w vr19, vr20, vr21 vshuf4i.h vr18, vr18, 0x1b vshuf4i.h vr19, vr19, 0x1b vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 vsrari.h \i, \i, 1 .endr // identity8 vsllwil.w.h vr20, vr24, 1 vsllwil.w.h vr21, vr25, 1 vsllwil.w.h vr12, vr26, 1 vsllwil.w.h vr13, vr27, 1 vsllwil.w.h vr22, vr16, 1 vsllwil.w.h vr23, vr17, 1 vsllwil.w.h vr14, vr18, 1 vsllwil.w.h vr15, vr19, 1 .irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 vexth.w.h \i, \i .endr .irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 vslli.w \i, \i, 1 .endr vssrarni.h.w vr20, vr12, 4 // in0 vssrarni.h.w vr24, vr26, 4 // in1 vssrarni.h.w vr21, vr13, 4 // in2 vssrarni.h.w vr25, vr27, 4 // in3 vssrarni.h.w vr22, vr14, 4 // in4 vssrarni.h.w vr16, vr18, 4 // in5 vssrarni.h.w vr23, vr15, 4 // in6 vssrarni.h.w vr17, vr19, 4 // in7 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr24, vr21, vr25 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr22, vr16, vr23, vr17 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx addi.d sp, sp, -48 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 // identity8 vsllwil.w.h vr6, vr0, 1 vsllwil.w.h vr7, vr1, 1 vsllwil.w.h vr8, vr2, 1 vsllwil.w.h vr9, vr3, 1 vsllwil.w.h vr10, vr4, 1 vsllwil.w.h 
vr11, vr5, 1 vsllwil.w.h vr12, vr24, 1 vsllwil.w.h vr13, vr25, 1 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 vexth.w.h \i, \i .endr .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 vslli.w \i, \i, 1 .endr vssrarni.h.w vr0, vr6, 1 // in0 vssrarni.h.w vr1, vr7, 1 // in1 vssrarni.h.w vr2, vr8, 1 // in2 vssrarni.h.w vr3, vr9, 1 // in3 vssrarni.h.w vr4, vr10, 1 // in4 vssrarni.h.w vr5, vr11, 1 // in5 vssrarni.h.w vr24, vr12, 1 // in6 vssrarni.h.w vr25, vr13, 1 // in7 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr vsllwil.w.h vr18, vr0, 0 // in0 vsllwil.w.h vr19, vr1, 0 // in1 vsllwil.w.h vr6, vr2, 0 // in2 vsllwil.w.h vr7, vr3, 0 // in3 vsllwil.w.h vr8, vr4, 0 // in3 vsllwil.w.h vr9, vr5, 0 // in4 vsllwil.w.h vr10, vr24, 0 // in5 vsllwil.w.h vr11, vr25, 0 // in6 adst8x8_1d_lsx vr26, vr27, vr28, vr29 vexth.w.h vr18, vr0 // in0 vexth.w.h vr19, vr1 // in1 vexth.w.h vr6, vr2 // in2 vexth.w.h vr7, vr3 // in3 vexth.w.h vr8, vr4 // in3 vexth.w.h vr9, vr5 // in4 vexth.w.h vr10, vr24 // in5 vexth.w.h vr11, vr25 // in6 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vilvh.d vr4, vr0, vr26 vilvh.d vr5, vr1, vr27 vilvh.d vr6, vr2, vr28 vilvh.d vr7, vr3, vr29 vilvl.d vr14, vr3, vr29 vilvl.d vr15, vr2, vr28 vilvl.d vr16, vr1, vr27 vilvl.d vr17, vr0, vr26 .irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 addi.d sp, sp, 48 endfunc function inv_txfm_add_adst_identity_8x8_8bpc_lsx addi.d sp, sp, -32 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr16, 0 vsllwil.w.h vr11, vr17, 0 adst8x8_1d_lsx vr24, vr25, vr26, vr27 vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr16 vexth.w.h vr11, vr17 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 vsrari.h \i, \i, 1 .endr LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \ vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 vshuf4i.h vr26, vr26, 0x1b vshuf4i.h vr27, vr27, 0x1b vshuf4i.h vr22, vr22, 0x1b vshuf4i.h vr23, vr23, 0x1b // identity8 vsllwil.w.h vr16, vr24, 1 vsllwil.w.h vr17, vr25, 1 vsllwil.w.h vr10, vr20, 1 vsllwil.w.h vr11, vr21, 1 vsllwil.w.h vr18, vr26, 1 vsllwil.w.h vr19, vr27, 1 vsllwil.w.h vr14, vr22, 1 vsllwil.w.h vr15, vr23, 1 .irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23 vexth.w.h \i, \i .endr .irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23 vslli.w \i, \i, 1 .endr vssrarni.h.w vr18, vr16, 4 // in0 vssrarni.h.w vr19, vr17, 4 // in1 vssrarni.h.w vr14, vr10, 4 // in2 vssrarni.h.w vr15, vr11, 4 // in3 vssrarni.h.w vr26, vr24, 4 // in4 vssrarni.h.w vr27, vr25, 4 // in5 vssrarni.h.w vr22, vr20, 4 // in6 vssrarni.h.w vr23, vr21, 4 // in7 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr18, vr19, vr14, vr15 alsl.d a0, 
a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr26, vr27, vr22, vr23 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 32 endfunc function inv_txfm_add_identity_adst_8x8_8bpc_lsx addi.d sp, sp, -48 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 // identity8 vsllwil.w.h vr6, vr0, 1 vsllwil.w.h vr7, vr1, 1 vsllwil.w.h vr8, vr2, 1 vsllwil.w.h vr9, vr3, 1 vsllwil.w.h vr10, vr4, 1 vsllwil.w.h vr11, vr5, 1 vsllwil.w.h vr12, vr24, 1 vsllwil.w.h vr13, vr25, 1 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 vexth.w.h \i, \i .endr .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 vslli.w \i, \i, 1 .endr vssrarni.h.w vr0, vr6, 1 // in0 vssrarni.h.w vr1, vr7, 1 // in1 vssrarni.h.w vr2, vr8, 1 // in2 vssrarni.h.w vr3, vr9, 1 // in3 vssrarni.h.w vr4, vr10, 1 // in4 vssrarni.h.w vr5, vr11, 1 // in5 vssrarni.h.w vr24, vr12, 1 // in6 vssrarni.h.w vr25, vr13, 1 // in7 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr vsllwil.w.h vr18, vr0, 0 vsllwil.w.h vr19, vr1, 0 vsllwil.w.h vr6, vr2, 0 vsllwil.w.h vr7, vr3, 0 vsllwil.w.h vr8, vr4, 0 vsllwil.w.h vr9, vr5, 0 vsllwil.w.h vr10, vr24, 0 vsllwil.w.h vr11, vr25, 0 adst8x8_1d_lsx vr26, vr27, vr28, vr29 vexth.w.h vr18, vr0 vexth.w.h vr19, vr1 vexth.w.h vr6, vr2 vexth.w.h vr7, vr3 vexth.w.h vr8, vr4 vexth.w.h vr9, vr5 vexth.w.h vr10, vr24 vexth.w.h vr11, vr25 adst8x8_1d_lsx vr0, vr1, vr2, vr3 vilvl.d vr4, vr0, vr26 // 0 ... 7 vilvl.d vr5, vr1, vr27 // 8 ... 15 vilvl.d vr6, vr2, vr28 // 16 ... 23 vilvl.d vr7, vr3, vr29 // 24 ... 31 vilvh.d vr14, vr3, vr29 // 32 ... 39 vilvh.d vr15, vr2, vr28 // 40 ... 47 vilvh.d vr16, vr1, vr27 // 48 ... 55 vilvh.d vr17, vr0, vr26 // 56 ... 
63 .irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 addi.d sp, sp, 48 endfunc .macro vmul_vmadd_w in0, in1, in2, in3, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \out0, vr22, \in2 vmul.w \out1, vr23, \in2 vsllwil.w.h vr22, \in1, 0 vexth.w.h vr23, \in1 vmadd.w \out0, vr22, \in3 vmadd.w \out1, vr23, \in3 .endm .macro vmul_vmsub_w in0, in1, in2, in3, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \out0, vr22, \in2 vmul.w \out1, vr23, \in2 vsllwil.w.h vr22, \in1, 0 vexth.w.h vr23, \in1 vmsub.w \out0, vr22, \in3 vmsub.w \out1, vr23, \in3 .endm .macro rect2_lsx in0, in1, out0 vsllwil.w.h vr22, \in0, 0 // in1 vexth.w.h \in0, \in0 // in1 vmul.w vr22, vr22, \in1 vmul.w \out0, \in0, \in1 vssrarni.h.w \out0, vr22, 12 .endm .macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \ out1, out2, out3, out4, out5, out6, out7, rect2 la.local t0, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 0 // 2896 .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 rect2_lsx \i, vr23, \i .endr .endif vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w \in2, \in6, vr21, vr20, vr8, vr9 vssrarni.h.w vr9, vr8, 12 // t3 vmul_vmsub_w \in2, \in6, vr20, vr21, vr8, vr10 vssrarni.h.w vr10, vr8, 12 // t2 vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w \in0, \in4, vr20, vr20, vr8, \in2 vssrarni.h.w \in2, vr8, 12 // t0 vmul_vmsub_w \in0, \in4, vr20, vr20, vr8, \in6 vssrarni.h.w \in6, vr8, 12 // t1 vsadd.h vr8, \in2, vr9 // c[0] vssub.h vr9, \in2, vr9 // c[3] vsadd.h \in0, \in6, vr10 // c[1] vssub.h vr10, \in6, vr10 // c[2] vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w \in1, \in7, vr21, vr20, \in2, \in4 vssrarni.h.w \in4, \in2, 12 // t7a vmul_vmsub_w \in1, \in7, vr20, vr21, \in2, \in6 vssrarni.h.w \in6, \in2, 12 // t4a vldrepl.w vr20, t0, 24 // 3406 vldrepl.w vr21, t0, 28 // 2276 vmul_vmadd_w \in5, \in3, vr21, vr20, \in2, \in1 vssrarni.h.w \in1, \in2, 12 // t6a vmul_vmsub_w \in5, \in3, vr20, vr21, \in2, \in7 vssrarni.h.w \in7, \in2, 12 // t5a vsadd.h \in3, \in6, \in7 // t4 vssub.h \in6, \in6, \in7 // t5a vsadd.h \in5, \in4, \in1 // t7 vssub.h \in4, \in4, \in1 // t6a vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w \in4, \in6, vr20, vr20, \in2, \in1 vssrarni.h.w \in1, \in2, 12 // t6 vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 vssrarni.h.w \in7, \in2, 12 // t5 vsadd.h \out0, vr8, \in5 // c[0] vssub.h \out7, vr8, \in5 // c[7] vsadd.h \out1, \in0, \in1 // c[1] vssub.h \out6, \in0, \in1 // c[6] vsadd.h \out2, vr10, \in7 // c[2] vssub.h \out5, vr10, \in7 // c[5] vsadd.h \out3, vr9, \in3 // c[3] vssub.h \out4, vr9, \in3 // c[4] .endm function inv_txfm_add_dct_dct_8x8_8bpc_lsx bnez a3, .NO_HAS_DCONLY_8x8 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 alsl.d t2, a1, a0, 1 vmadd.w vr5, vr2, vr0 vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 vssrarni.h.w vr5, vr5, 12 vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr5, 
vr5, vr5, vr5 b .DCT_DCT_8X8_END .NO_HAS_DCONLY_8x8: vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 la.local t0, idct_coeffs dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsrari.h \i, \i, 1 .endr vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2 .irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 .DCT_DCT_8X8_END: endfunc .macro dct_8x16_core_lsx dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 la.local t0, idct_coeffs vldrepl.w vr20, t0, 32 // 401 vldrepl.w vr21, t0, 36 // 4076 vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10 vssrarni.h.w vr10, vr0, 12 // t15a vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29 vssrarni.h.w vr29, vr0, 12 // t8a vldrepl.w vr20, t0, 40 // 3166 -> 1583 vldrepl.w vr21, t0, 44 // 2598 -> 1299 vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30 vssrarni.h.w vr30, vr0, 12 // t14a vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31 vssrarni.h.w vr31, vr0, 12 // t9a vldrepl.w vr20, t0, 48 // 1931 vldrepl.w vr21, t0, 52 // 3612 vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24 vssrarni.h.w vr24, vr0, 12 // t13a vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25 vssrarni.h.w vr25, vr0, 12 // t10a vldrepl.w vr20, t0, 56 // 3920 vldrepl.w vr21, t0, 60 // 1189 vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26 vssrarni.h.w vr26, vr0, 12 // t12a vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27 vssrarni.h.w vr27, vr0, 12 // t11a // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 vsadd.h vr28, vr29, vr31 // t8 vssub.h vr19, vr29, vr31 // t9 vssub.h vr29, vr27, vr25 // t10 vsadd.h vr9, vr27, vr25 // t11 vsadd.h vr31, vr26, vr24 // t12 vssub.h vr25, vr26, vr24 // t13 vssub.h vr27, vr10, vr30 // t14 vsadd.h vr24, vr10, vr30 // t15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 vssrarni.h.w vr26, vr0, 12 // t14a vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30 vssrarni.h.w vr30, vr0, 12 // t9a vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 vneg.w vr0, vr0 vneg.w vr19, vr19 vssrarni.h.w vr19, vr0, 12 // t10a vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27 vssrarni.h.w vr27, vr0, 12 // t13a vsadd.h vr25, vr28, vr9 // t8a vssub.h vr29, vr28, vr9 // t11a vssub.h vr28, vr24, vr31 // t12a vsadd.h vr10, vr24, vr31 // t15a vsadd.h vr9, vr30, vr19 // t9 vssub.h vr31, vr30, vr19 // t10 vssub.h vr30, vr26, vr27 // t13 vsadd.h vr24, vr26, vr27 // t14 vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 vssrarni.h.w vr26, vr0, 12 // t13a vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27 vssrarni.h.w vr27, vr0, 12 // t10a vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 vssrarni.h.w vr31, vr0, 12 // t12 vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30 vssrarni.h.w vr30, vr0, 12 // t11 // vr11 vr12 ... 
vr18 vsadd.h vr28, vr14, vr31 // c[3] vssub.h vr29, vr14, vr31 // c[12] vsadd.h vr20, vr15, vr30 // c[4] vssub.h vr21, vr15, vr30 // c[11] vsadd.h vr14, vr16, vr27 // c[5] vssub.h vr23, vr16, vr27 // c[10] vsadd.h vr15, vr17, vr9 // c[6] vssub.h vr30, vr17, vr9 // c[9] vsadd.h vr16, vr18, vr25 // c[7] vssub.h vr27, vr18, vr25 // c[8] vsadd.h vr17, vr13, vr26 // c[2] vssub.h vr26, vr13, vr26 // c[13] vsadd.h vr18, vr12, vr24 // c[1] vssub.h vr25, vr12, vr24 // c[14] vsadd.h vr22, vr11, vr10 // c[0] vssub.h vr24, vr11, vr10 // c[15] .endm function inv_txfm_add_dct_dct_8x16_8bpc_lsx bnez a3, .NO_HAS_DCONLY_8x16 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 vmul.w vr2, vr0, vr2 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 alsl.d t2, a1, a0, 1 vmadd.w vr5, vr2, vr0 vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 vssrarni.h.w vr5, vr5, 12 vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 b .DCT_DCT_8X16_END .NO_HAS_DCONLY_8x16: addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 la.local t0, idct_coeffs dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vsrari.h \i, \i, 1 .endr vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 vst vr23, a2, \i .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 dct_8x16_core_lsx .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr22, vr18, vr17, vr28 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr14, vr15, vr16 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr27, vr30, vr23, vr21 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr29, vr26, vr25, vr24 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 .DCT_DCT_8X16_END: endfunc .macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2 la.local t0, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 0 // 2896 .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 rect2_lsx \i, vr23, \i .endr .endif vsllwil.w.h vr8, \in0, 1 vsllwil.w.h vr9, \in1, 1 vsllwil.w.h vr10, \in2, 1 vsllwil.w.h vr11, 
\in3, 1 vsllwil.w.h vr12, \in4, 1 vsllwil.w.h vr13, \in5, 1 vsllwil.w.h vr14, \in6, 1 vsllwil.w.h vr15, \in7, 1 .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 vexth.w.h \i, \i .endr .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 vslli.w \i, \i, 1 .endr vssrarni.h.w \in0, vr8, 1 vssrarni.h.w \in1, vr9, 1 vssrarni.h.w \in2, vr10, 1 vssrarni.h.w \in3, vr11, 1 vssrarni.h.w \in4, vr12, 1 vssrarni.h.w \in5, vr13, 1 vssrarni.h.w \in6, vr14, 1 vssrarni.h.w \in7, vr15, 1 .endm .macro identity_8x16_core_lsx in0, out0 vsadd.h vr10, \in0, \in0 vsllwil.w.h vr8, \in0, 0 vexth.w.h \out0, \in0 vmul.w vr8, vr8, vr20 vmul.w \out0, \out0, vr20 vssrarni.h.w \out0, vr8, 11 vsadd.h \out0, \out0, vr10 .endm function inv_txfm_add_identity_identity_8x16_8bpc_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27 identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 vst vr23, a2, \i .endr LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \ vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \ vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21 LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \ vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \ vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21 li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \ vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27 identity_8x16_core_lsx \i, \i vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr14, vr15, vr22, vr23 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr18, vr24, vr26 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr28, vr29, vr30, vr31 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr17, vr19, vr25, vr27 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc .macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7, rect2 la.local t0, iadst8_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 32 // 2896 .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 rect2_lsx \i, vr23, \i .endr .endif vldrepl.w vr20, t0, 0 // 4076 vldrepl.w vr21, t0, 4 // 401 vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9 vssrarni.h.w vr9, vr8, 12 // t0a low vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10 vssrarni.h.w vr10, vr8, 12 // t1a low vldrepl.w vr20, t0, 8 // 3612 vldrepl.w vr21, t0, 12 // 1931 vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0 vssrarni.h.w vr0, vr8, 12 // t2a low vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7 vssrarni.h.w vr7, vr8, 12 // t3a low vldrepl.w vr20, t0, 16 // 2598 -> 1299 vldrepl.w vr21, t0, 20 // 3166 -> 1583 vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2 vssrarni.h.w vr2, vr8, 12 // t4a low vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5 vssrarni.h.w vr5, vr8, 12 // t5a low vldrepl.w vr20, t0, 24 // 1189 vldrepl.w vr21, t0, 28 // 3920 vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3 vssrarni.h.w vr3, vr8, 12 // t6a low vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4 vssrarni.h.w vr4, vr8, 12 // t7a low vsadd.h vr1, vr9, vr2 // t0 vssub.h vr6, 
vr9, vr2 // t4 vsadd.h vr8, vr10, vr5 // t1 vssub.h vr2, vr10, vr5 // t5 vsadd.h vr9, vr0, vr3 // t2 vssub.h vr5, vr0, vr3 // t6 vsadd.h vr10, vr7, vr4 // t3 vssub.h vr0, vr7, vr4 // t7 vldrepl.w vr20, t0, 40 // 1567 vldrepl.w vr21, t0, 44 // 3784 vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4 vssrarni.h.w vr4, vr3, 12 // t4a low vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7 vssrarni.h.w vr7, vr3, 12 // t5a low vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2 vssrarni.h.w vr2, vr3, 12 // t7a low vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6 vssrarni.h.w vr6, vr3, 12 // t6a low vsadd.h \out0, vr1, vr9 // out[0] vssub.h vr5, vr1, vr9 // t2 vsadd.h vr3, vr8, vr10 // out[7] vssub.h vr1, vr8, vr10 // t3 vexth.w.h vr9, vr3 vsllwil.w.h vr21, vr3, 0 vneg.w \out7, vr9 vneg.w vr21, vr21 vssrarni.h.w \out7, vr21, 0 // out[7] vsadd.h vr8, vr4, vr6 // out[1] vssub.h vr10, vr4, vr6 // t6 vexth.w.h vr20, vr8 vsllwil.w.h vr21, vr8, 0 vneg.w \out1, vr20 vneg.w vr21, vr21 vssrarni.h.w \out1, vr21, 0 // out[1] vsadd.h \out6, vr7, vr2 // out[6] vssub.h vr4, vr7, vr2 // t7 vldrepl.w vr20, t0, 32 // 2896 vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6 vssrarni.h.w vr6, vr9, 12 // out[3] vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4 vssrarni.h.w \out4, vr9, 12 // out[4] vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2 vssrarni.h.w \out2, vr9, 12 // out[2] vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5 vssrarni.h.w vr5, vr9, 12 // out[5] vexth.w.h vr20, vr6 vsllwil.w.h vr21, vr6, 0 vneg.w \out3, vr20 vneg.w vr21, vr21 vssrarni.h.w \out3, vr21, 0 // out[3] vexth.w.h vr20, vr5 vsllwil.w.h vr21, vr5, 0 vneg.w \out5, vr20 vneg.w vr21, vr21 vssrarni.h.w \out5, vr21, 0 // out[5] .endm function inv_txfm_add_adst_dct_8x16_8bpc_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vsrari.h \i, \i, 1 .endr vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 vst vr23, a2, \i .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 la.local t0, idct_coeffs vldrepl.w vr20, t0, 32 // 401 vldrepl.w vr21, t0, 36 // 4076 vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10 vssrarni.h.w vr10, vr0, 12 // t15a vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29 vssrarni.h.w vr29, vr0, 12 // t8a vldrepl.w vr20, t0, 40 // 3166 -> 1583 vldrepl.w vr21, t0, 44 // 2598 -> 1299 vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30 vssrarni.h.w vr30, vr0, 12 // t14a vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31 vssrarni.h.w vr31, vr0, 12 // t9a vldrepl.w vr20, t0, 48 // 1931 vldrepl.w vr21, t0, 52 // 3612 vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24 vssrarni.h.w vr24, vr0, 12 // t13a 
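    // NOTE: this inlined idct16 pass (from the preceding dct_8x8_core_lsx call
    // and the 401/4076 twiddles above down to the c[0]..c[15] sums below)
    // appears to match the body of the dct_8x16_core_lsx macro defined earlier;
    // only the first-pass row transform (adst here, dct there) differs.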
vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25 vssrarni.h.w vr25, vr0, 12 // t10a vldrepl.w vr20, t0, 56 // 3920 vldrepl.w vr21, t0, 60 // 1189 vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26 vssrarni.h.w vr26, vr0, 12 // t12a vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27 vssrarni.h.w vr27, vr0, 12 // t11a // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 vsadd.h vr28, vr29, vr31 // t8 vssub.h vr19, vr29, vr31 // t9 vssub.h vr29, vr27, vr25 // t10 vsadd.h vr9, vr27, vr25 // t11 vsadd.h vr31, vr26, vr24 // t12 vssub.h vr25, vr26, vr24 // t13 vssub.h vr27, vr10, vr30 // t14 vsadd.h vr24, vr10, vr30 // t15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 vssrarni.h.w vr26, vr0, 12 // t14a vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30 vssrarni.h.w vr30, vr0, 12 // t9a vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 vneg.w vr0, vr0 vneg.w vr19, vr19 vssrarni.h.w vr19, vr0, 12 // t10a vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27 vssrarni.h.w vr27, vr0, 12 // t13a vsadd.h vr25, vr28, vr9 // t8a vssub.h vr29, vr28, vr9 // t11a vssub.h vr28, vr24, vr31 // t12a vsadd.h vr10, vr24, vr31 // t15a vsadd.h vr9, vr30, vr19 // t9 vssub.h vr31, vr30, vr19 // t10 vssub.h vr30, vr26, vr27 // t13 vsadd.h vr24, vr26, vr27 // t14 vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 vssrarni.h.w vr26, vr0, 12 // t13a vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27 vssrarni.h.w vr27, vr0, 12 // t10a vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 vssrarni.h.w vr31, vr0, 12 // t12 vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30 vssrarni.h.w vr30, vr0, 12 // t11 // vr11 vr12 ... vr18 vsadd.h vr28, vr14, vr31 // c[3] vssub.h vr29, vr14, vr31 // c[12] vsadd.h vr20, vr15, vr30 // c[4] vssub.h vr21, vr15, vr30 // c[11] vsadd.h vr14, vr16, vr27 // c[5] vssub.h vr23, vr16, vr27 // c[10] vsadd.h vr15, vr17, vr9 // c[6] vssub.h vr30, vr17, vr9 // c[9] vsadd.h vr16, vr18, vr25 // c[7] vssub.h vr27, vr18, vr25 // c[8] vsadd.h vr17, vr13, vr26 // c[2] vssub.h vr26, vr13, vr26 // c[13] vsadd.h vr18, vr12, vr24 // c[1] vssub.h vr25, vr12, vr24 // c[14] vsadd.h vr22, vr11, vr10 // c[0] vssub.h vr24, vr11, vr10 // c[15] .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 4 .endr alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr22, vr18, vr17, vr28 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr14, vr15, vr16 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr27, vr30, vr23, vr21 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr29, vr26, vr25, vr24 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc const iadst16_coeffs, align=4 .word 4091, 201, 3973, 995 .word 3703, 1751, 3290, 2440 .word 2751, 3035, 2106, 3513 .word 1380, 3857, 601, 4052 endconst .macro adst16_core_lsx transpose8x8, shift, vst la.local t0, iadst16_coeffs vldrepl.w vr20, t0, 0 // 4091 vldrepl.w vr21, t0, 4 // 201 vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18 vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19 vssrarni.h.w vr18, vr16, 12 // t0 vssrarni.h.w vr19, vr17, 12 // t1 vldrepl.w vr20, t0, 8 // 3973 vldrepl.w vr21, t0, 12 // 995 vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0 vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15 vssrarni.h.w vr0, vr16, 12 // t2 vssrarni.h.w vr15, vr17, 12 // t3 vldrepl.w vr20, t0, 16 // 3703 vldrepl.w vr21, t0, 20 // 1751 vmul_vmadd_w vr11, vr4, vr20, vr21, 
vr16, vr2 vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13 vssrarni.h.w vr2, vr16, 12 // t4 vssrarni.h.w vr13, vr17, 12 // t5 vldrepl.w vr20, t0, 24 // 3290 -> 1645 vldrepl.w vr21, t0, 28 // 2440 -> 1220 vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4 vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11 vssrarni.h.w vr4, vr16, 12 // t6 vssrarni.h.w vr11, vr17, 12 // t7 vldrepl.w vr20, t0, 32 // 2751 vldrepl.w vr21, t0, 36 // 3035 vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6 vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9 vssrarni.h.w vr6, vr16, 12 // t8 vssrarni.h.w vr9, vr17, 12 // t9 vldrepl.w vr20, t0, 40 // 2106 vldrepl.w vr21, t0, 44 // 3513 vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7 vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8 vssrarni.h.w vr7, vr16, 12 // t10 vssrarni.h.w vr8, vr17, 12 // t11 vldrepl.w vr20, t0, 48 // 1380 vldrepl.w vr21, t0, 52 // 3857 vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5 vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10 vssrarni.h.w vr5, vr16, 12 // t12 vssrarni.h.w vr10, vr17, 12 // t13 vldrepl.w vr20, t0, 56 // 601 vldrepl.w vr21, t0, 60 // 4052 vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3 vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12 vssrarni.h.w vr3, vr16, 12 // t14 vssrarni.h.w vr12, vr17, 12 // t15 vsadd.h vr1, vr18, vr6 // t0a vssub.h vr14, vr18, vr6 // t8a vsadd.h vr16, vr19, vr9 // t1a vssub.h vr17, vr19, vr9 // t9a vsadd.h vr6, vr0, vr7 // t2a vssub.h vr18, vr0, vr7 // t10a vsadd.h vr9, vr15, vr8 // t3a vssub.h vr19, vr15, vr8 // t11a vsadd.h vr0, vr2, vr5 // t4a vssub.h vr7, vr2, vr5 // t12a vsadd.h vr8, vr13, vr10 // t5a vssub.h vr15, vr13, vr10 // t13a vsadd.h vr2, vr4, vr3 // t6a vssub.h vr5, vr4, vr3 // t14a vsadd.h vr10, vr11, vr12 // t7a vssub.h vr13, vr11, vr12 // t15a la.local t0, idct_coeffs vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11 vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12 vssrarni.h.w vr11, vr3, 12 // t8 vssrarni.h.w vr12, vr4, 12 // t9 vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14 vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17 vssrarni.h.w vr14, vr3, 12 // t13 vssrarni.h.w vr17, vr4, 12 // t12 vldrepl.w vr20, t0, 24 // 3406 vldrepl.w vr21, t0, 28 // 2276 vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7 vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15 vssrarni.h.w vr7, vr3, 12 // t10 vssrarni.h.w vr15, vr4, 12 // t11 vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18 vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19 vssrarni.h.w vr18, vr3, 12 // t15 vssrarni.h.w vr19, vr4, 12 // t14 vsadd.h vr5, vr1, vr0 // t0 vssub.h vr13, vr1, vr0 // t4 vsadd.h vr3, vr16, vr8 // t1 vssub.h vr4, vr16, vr8 // t5 vsadd.h vr0, vr6, vr2 // t2 vssub.h vr1, vr6, vr2 // t6 vsadd.h vr8, vr9, vr10 // t3 vssub.h vr16, vr9, vr10 // t7 vsadd.h vr2, vr11, vr17 // t8a vssub.h vr6, vr11, vr17 // t12a vsadd.h vr9, vr12, vr14 // t9a vssub.h vr10, vr12, vr14 // t13a vsadd.h vr11, vr7, vr19 // t10a vssub.h vr17, vr7, vr19 // t14a vsadd.h vr12, vr15, vr18 // t11a vssub.h vr14, vr15, vr18 // t15a la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18 vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19 vssrarni.h.w vr18, vr7, 12 // t4a vssrarni.h.w vr19, vr15, 12 // t5a vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4 vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13 vssrarni.h.w vr4, vr7, 12 // t7a vssrarni.h.w vr13, vr15, 12 // t6a vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1 vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16 vssrarni.h.w vr1, vr7, 12 // t12 
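    /* Rough sketch of the fixed-point butterfly used throughout this macro
     * (derived from the vmul_vmadd_w / vmul_vmsub_w definitions above): both
     * helpers widen their first two int16 vector operands to 32 bit
     * (vsllwil.w.h / vexth.w.h) and compute, per lane,
     *
     *   vmul_vmadd_w a, b, x, y   ->   a*x + b*y
     *   vmul_vmsub_w a, b, x, y   ->   a*x - b*y
     *
     * and the following `vssrarni.h.w dst, src, 12` rounds the 32-bit results
     * as (v + 2048) >> 12, saturates them to int16 and packs the two halves
     * back into one halfword vector.
     */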
vssrarni.h.w vr16, vr15, 12 // t13 vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6 vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10 vssrarni.h.w vr6, vr7, 12 // t15 vssrarni.h.w vr10, vr15, 12 // t14 vsadd.h vr14, vr5, vr0 // out[0] vssub.h vr17, vr5, vr0 // t2a vssub.h vr7, vr3, vr8 // t3a vsadd.h vr15, vr3, vr8 // out[15] vsllwil.w.h vr22, vr15, 0 vexth.w.h vr15, vr15 vneg.w vr22, vr22 vneg.w vr15, vr15 vssrarni.h.w vr15, vr22, 0 // out[15] vsadd.h vr14, vr5, vr0 // out[0] vssub.h vr17, vr5, vr0 // t2a vssub.h vr7, vr3, vr8 // t3a vsadd.h vr3, vr19, vr4 // out[12] vssub.h vr8, vr19, vr4 // t7 vssub.h vr0, vr18, vr13 // t6 vsadd.h vr5, vr18, vr13 // out[3] vsllwil.w.h vr22, vr5, 0 vexth.w.h vr5, vr5 vneg.w vr22, vr22 vneg.w vr5, vr5 vssrarni.h.w vr5, vr22, 0 // out[3] vsadd.h vr13, vr9, vr12 // out[14] vssub.h vr19, vr9, vr12 // t11 vssub.h vr4, vr2, vr11 // t10 vsadd.h vr18, vr2, vr11 // out[1] vsllwil.w.h vr22, vr18, 0 vexth.w.h vr18, vr18 vneg.w vr22, vr22 vneg.w vr18, vr18 vssrarni.h.w vr18, vr22, 0 // out[1] vsadd.h vr2, vr1, vr10 // out[2] vssub.h vr11, vr1, vr10 // t14a vssub.h vr12, vr16, vr6 // t15a vsadd.h vr9, vr16, vr6 // out[13] vsllwil.w.h vr22, vr9, 0 vexth.w.h vr9, vr9 vneg.w vr22, vr22 vneg.w vr9, vr9 vssrarni.h.w vr9, vr22, 0 // out[13] vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10 vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1 vssrarni.h.w vr10, vr6, 12 // out[7] vsllwil.w.h vr7, vr10, 0 vexth.w.h vr10, vr10 vneg.w vr7, vr7 vneg.w vr10, vr10 vssrarni.h.w vr10, vr7, 0 vssrarni.h.w vr1, vr16, 12 // out[8] vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17 vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7 vssrarni.h.w vr17, vr16, 12 // out[11] vsllwil.w.h vr0, vr17, 0 vexth.w.h vr17, vr17 vneg.w vr0, vr0 vneg.w vr17, vr17 vssrarni.h.w vr17, vr0, 0 vssrarni.h.w vr7, vr6, 12 // out[4] vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0 vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8 vssrarni.h.w vr0, vr16, 12 // out[9] vsllwil.w.h vr4, vr0, 0 vexth.w.h vr0, vr0 vneg.w vr4, vr4 vneg.w vr0, vr0 vssrarni.h.w vr0, vr4, 0 vssrarni.h.w vr8, vr6, 12 // out[6] vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4 vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19 vssrarni.h.w vr4, vr6, 12 // out[5] vsllwil.w.h vr24, vr4, 0 vexth.w.h vr4, vr4 vneg.w vr24, vr24 vneg.w vr4, vr4 vssrarni.h.w vr4, vr24, 0 vssrarni.h.w vr19, vr16, 12 // out[10] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 vsrari.h \i, \i, \shift .endr .endif .ifnb \vst vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 .endif // out0 out1 out2 out3 out4 out5 out6 out7 // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 // out8 out9 out10 out11 out12 out13 out14 out15 // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 .endm // adst16_core_lsx .macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7 fld.d f20, t2, 0 fldx.d f21, t2, a1 fld.d f22, t3, 0 fldx.d f23, t3, a1 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 fld.d f24, t2, 0 fldx.d f25, t2, a1 fld.d f26, t3, 0 fldx.d f27, t3, a1 .irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27 vsllwil.hu.bu \i, \i, 0 .endr .irp i, \in0, \in1, \in2, 
\in3, \in4, \in5, \in6, \in7 vsrari.h \i, \i, 4 .endr vadd.h vr20, vr20, \in0 vadd.h vr21, vr21, \in1 vadd.h vr22, vr22, \in2 vadd.h vr23, vr23, \in3 vadd.h vr24, vr24, \in4 vadd.h vr25, vr25, \in5 vadd.h vr26, vr26, \in6 vadd.h vr27, vr27, \in7 vssrani.bu.h vr21, vr20, 0 vssrani.bu.h vr23, vr22, 0 vssrani.bu.h vr25, vr24, 0 vssrani.bu.h vr27, vr26, 0 vstelm.d vr21, t4, 0, 0 vstelm.d vr21, t5, 0, 1 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 vstelm.d vr23, t4, 0, 0 vstelm.d vr23, t5, 0, 1 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 vstelm.d vr25, t4, 0, 0 vstelm.d vr25, t5, 0, 1 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 vstelm.d vr27, t4, 0, 0 vstelm.d vr27, t5, 0, 1 .endm // adst16_core_finish_lsx function inv_txfm_add_dct_adst_8x16_8bpc_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 la.local t0, idct_coeffs dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vsrari.h \i, \i, 1 .endr vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 vst vr23, a2, \i .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31 adst16_core_lsx , , addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc .macro malloc_space number li.w t0, \number sub.d sp, sp, t0 addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 .endm .macro free_space number fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 li.w t0, \number add.d sp, sp, t0 addi.d sp, sp, 64 .endm .macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11 vsllwil.hu.bu vr10, \in0, 0 vexth.hu.bu vr0, \in0 vsllwil.hu.bu vr11, \in1, 0 vexth.hu.bu vr1, \in1 vsllwil.hu.bu vr12, \in2, 0 vexth.hu.bu vr2, \in2 vsllwil.hu.bu vr13, \in3, 0 vexth.hu.bu vr3, \in3 vadd.h vr10, vr10, \in4 vadd.h vr0, vr0, \in5 vadd.h vr11, vr11, \in6 vadd.h vr1, vr1, \in7 vadd.h vr12, vr12, \in8 vadd.h vr2, vr2, \in9 vadd.h vr13, vr13, \in10 vadd.h vr3, vr3, \in11 vssrani.bu.h vr0, vr10, 0 vssrani.bu.h vr1, vr11, 0 vssrani.bu.h vr2, vr12, 0 vssrani.bu.h vr3, vr13, 0 vst vr0, a0, 0 vstx vr1, a0, a1 vst vr2, t2, 0 vstx vr3, t2, a1 .endm .macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift .ifnb \shift .irp i, \in0, \in1, \in2, \in3, \in4, \in5, 
\in6, \in7 vsrari.h \i, \i, \shift .endr .endif vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \ \in4, \in5, \in6, \in7 .endm function inv_txfm_add_dct_dct_16x8_8bpc_lsx bnez a3, .NO_HAS_DCONLY_16x8 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 alsl.d t2, a1, a0, 1 vmul.w vr2, vr2, vr0 vldx vr1, a0, a1 vsrari.w vr2, vr2, 8 vldx vr3, t2, a1 vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift vmadd.w vr5, vr2, vr0 vld vr0, a0, 0 vssrarni.h.w vr5, vr5, 12 vld vr2, t2, 0 DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, b .DCT_DCT_16x8_END .NO_HAS_DCONLY_16x8: malloc_space 512 vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 la.local t0, idct_coeffs vldrepl.w vr23, t0, 0 //2896 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 rect2_lsx \i, vr23, \i .endr dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \ vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 1 .endr vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 vst vr23, a2, \i .endr dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2 dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \ vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4 free_space 512 .DCT_DCT_16x8_END: endfunc function inv_txfm_add_adst_dct_16x8_8bpc_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 addi.d t1, sp, 64 addi.d t2, a2, 0 vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 la.local t0, idct_coeffs vldrepl.w vr23, t0, 0 //2896 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 rect2_lsx \i, vr23, \i .endr adst16_core_lsx , 1, // out0 out1 out2 out3 out4 out5 out6 out7 // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 // out8 out9 out10 out11 out12 out13 out14 out15 // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \ vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 vst vr23, a2, \i .endr dct_8x8_core_lsx vr14, vr18, 
vr2, vr5, vr7, vr4, vr24, vr25, \ vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2 dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \ vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, vr24, 4 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc function inv_txfm_add_dct_dct_16x16_8bpc_lsx bnez a3, .NO_HAS_DCONLY_16x16 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 alsl.d t2, a1, a0, 1 vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift vldx vr1, a0, a1 vmadd.w vr5, vr2, vr0 vldx vr3, t2, a1 vssrarni.h.w vr5, vr5, 12 vld vr0, a0, 0 vld vr2, t2, 0 DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, b .DCT_DCT_16x16_END .NO_HAS_DCONLY_16x16: malloc_space 512 vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 2 .endr vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 2 .endr vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vreplgr2vr.h vr31, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr31, a2, \i .endr vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx alsl.d t2, a1, a0, 1 vld vr4, sp, 64 vld vr5, sp, 80 vld vr6, sp, 96 vld vr7, sp, 112 VLD_DST_ADD_W16 vr4, vr22, 
vr5, vr18, vr6, vr17, vr7, vr28, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 128 vld vr5, sp, 144 vld vr6, sp, 160 vld vr7, sp, 176 VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 320 vld vr5, sp, 336 vld vr6, sp, 352 vld vr7, sp, 368 VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 384 vld vr5, sp, 400 vld vr6, sp, 416 vld vr7, sp, 432 VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4 free_space 512 .DCT_DCT_16x16_END: endfunc function inv_txfm_add_adst_adst_16x16_8bpc_lsx malloc_space 256+256 addi.d t1, sp, 64 addi.d t2, a2, 0 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx transpose8x8, 2, vst_x16 addi.d t2, a2, 16 addi.d t1, t1, 256 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx transpose8x8, 2, vst_x16 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr23, a2, \i .endr addi.d t2, sp, 64 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx , , // out0 out1 out2 out3 out4 out5 out6 out7 // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 // out8 out9 out10 out11 out12 out13 out14 out15 // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 addi.d t2, sp, 64+128 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx , , addi.d a0, a0, 8 addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 free_space 256+256 endfunc function inv_txfm_add_adst_dct_16x16_8bpc_lsx malloc_space 256+256 addi.d t1, sp, 64 addi.d t2, a2, 0 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx transpose8x8, 2, vst_x16 addi.d t2, a2, 16 addi.d t1, t1, 256 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx transpose8x8, 2, vst_x16 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr23, a2, \i .endr addi.d t2, sp, 64 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 addi.d t2, sp, 64+128 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx alsl.d t2, a1, a0, 1 vld vr4, sp, 64 vld vr5, sp, 80 vld vr6, sp, 96 vld vr7, sp, 112 VLD_DST_ADD_W16 vr4, 
vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 128 vld vr5, sp, 144 vld vr6, sp, 160 vld vr7, sp, 176 VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 320 vld vr5, sp, 336 vld vr6, sp, 352 vld vr7, sp, 368 VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 384 vld vr5, sp, 400 vld vr6, sp, 416 vld vr7, sp, 432 VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4 free_space 256+256 endfunc function inv_txfm_add_dct_adst_16x16_8bpc_lsx malloc_space 256+256 vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 2 .endr vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 2 .endr vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vreplgr2vr.h vr31, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr31, a2, \i .endr addi.d t2, sp, 64 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx , , // out0 out1 out2 out3 out4 out5 out6 out7 // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 // out8 out9 out10 out11 out12 out13 out14 out15 // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 addi.d t2, sp, 64+128 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx , , addi.d a0, a0, 8 addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 free_space 256+256 endfunc const shufb .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 endconst function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx malloc_space 256+256 addi.d t1, sp, 64 addi.d t2, a2, 0 vld_x16 t2, 0, 
32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx transpose8x8, 2, vst_x16 addi.d t2, a2, 16 addi.d t1, t1, 256 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx transpose8x8, 2, vst_x16 vreplgr2vr.h vr23, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr23, a2, \i .endr addi.d t2, sp, 64 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx la.local t0, shufb vld vr0, t0, 0 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vshuf.b \i, \i, \i, vr0 .endr vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 addi.d t2, sp, 64+128 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx la.local t0, shufb vld vr0, t0, 0 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vshuf.b \i, \i, \i, vr0 .endr alsl.d t2, a1, a0, 1 vld vr4, sp, 64 vld vr5, sp, 80 vld vr6, sp, 96 vld vr7, sp, 112 VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 128 vld vr5, sp, 144 vld vr6, sp, 160 vld vr7, sp, 176 VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 320 vld vr5, sp, 336 vld vr6, sp, 352 vld vr7, sp, 368 VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 vld vr4, sp, 384 vld vr5, sp, 400 vld vr6, sp, 416 vld vr7, sp, 432 VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4 free_space 256+256 endfunc function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx malloc_space 256+256 vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 2 .endr vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vsrari.h \i, \i, 2 .endr vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vreplgr2vr.h vr31, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 
208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr31, a2, \i .endr addi.d t2, sp, 64 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx , , // out0 out1 out2 out3 out4 out5 out6 out7 // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 // out8 out9 out10 out11 out12 out13 out14 out15 // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 la.local t0, shufb vld vr31, t0, 0 addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14 addi.d t2, sp, 64+128 vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 adst16_core_lsx , , addi.d a0, a0, 8 la.local t0, shufb vld vr31, t0, 0 addi.d t2, a0, 0 alsl.d t3, a1, a0, 1 addi.d t4, a0, 0 add.d t5, a1, a0 adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1 alsl.d t2, a1, t2, 2 alsl.d t3, a1, t3, 2 alsl.d t4, a1, t4, 1 alsl.d t5, a1, t5, 1 adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14 free_space 256+256 endfunc function inv_txfm_add_dct_dct_8x32_8bpc_lsx bnez a3, .NO_HAS_DCONLY_8x32 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr5, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 alsl.d t2, a1, a0, 1 vmadd.w vr5, vr2, vr0 vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 vssrarni.h.w vr5, vr5, 12 vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 .rept 7 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 .endr b .DCT_DCT_8X32_END .NO_HAS_DCONLY_8x32: malloc_space 512 vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 la.local t0, idct_coeffs dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsrari.h \i, \i, 2 .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsrari.h \i, \i, 2 .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 la.local t0, idct_coeffs dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsrari.h \i, \i, 2 .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 
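// Fourth and last 8x8 block of the first pass follows. Like the three blocks
// above, it is loaded from the coefficient buffer with a 64-byte stride (here
// starting at byte offset 48, i.e. every fourth 16-byte line beginning with the
// fourth), run through dct_8x8_core_lsx, rounded with vsrari.h ..., 2
// (i.e. (x + 2) >> 2), transposed and parked on the stack for the 32-point pass
// further down. Block n (n = 0..3) is read from a2 + 16*n and stored to
// sp + 64 + 128*n.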
vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 .irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsrari.h \i, \i, 2 .endr LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vreplgr2vr.h vr31, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ 464, 480, 496 vst vr31, a2, \i .endr addi.d t2, sp, 64 addi.d t3, sp, 64 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 // in1 in3 in5 in7 in9 in11 in13 in15 // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 // in17 in19 in21 in23 in25 in27 in29 in31 la.local t0, idct_coeffs vldrepl.w vr20, t0, 64 // 201 vldrepl.w vr21, t0, 68 // 4091 vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 vssrarni.h.w vr9, vr8, 12 // t31a vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 vssrarni.h.w vr10, vr11, 12 // t16a vldrepl.w vr20, t0, 72 // 3035 vldrepl.w vr21, t0, 76 // 2751 vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0 vssrarni.h.w vr0, vr11, 12 // t30a vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 vssrarni.h.w vr30, vr11, 12 // t17a vldrepl.w vr20, t0, 80 // 1751 vldrepl.w vr21, t0, 84 // 3703 vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 vssrarni.h.w vr7, vr8, 12 // t29a vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19 vssrarni.h.w vr19, vr8, 12 // t18a vldrepl.w vr20, t0, 88 // 3857 vldrepl.w vr21, t0, 92 // 1380 vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 vssrarni.h.w vr4, vr8, 12 // t28a vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26 vssrarni.h.w vr26, vr8, 12 // t19a vldrepl.w vr20, t0, 96 // 995 vldrepl.w vr21, t0, 100 // 3973 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 vssrarni.h.w vr3, vr8, 12 // t27a vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27 vssrarni.h.w vr27, vr8, 12 // t20a vldrepl.w vr20, t0, 104 // 3513 vldrepl.w vr21, t0, 108 // 2106 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 vssrarni.h.w vr2, vr8, 12 // t26a vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28 vssrarni.h.w vr28, vr8, 12 // t21a vldrepl.w vr20, t0, 112 // 2440 -> 1220 vldrepl.w vr21, t0, 116 // 3290 -> 1645 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 vssrarni.h.w vr5, vr8, 12 // t25a vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25 vssrarni.h.w vr25, vr8, 12 // t22a vldrepl.w vr20, t0, 120 // 4052 vldrepl.w vr21, t0, 124 // 601 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 vssrarni.h.w vr6, vr8, 12 // t24a vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24 vssrarni.h.w vr24, vr8, 12 // t23a vsadd.h vr1, vr10, vr30 // t16 vssub.h vr29, vr10, vr30 // t17 vssub.h vr8, vr26, vr19 // t18 vsadd.h vr31, vr26, vr19 // t19 vsadd.h vr10, vr27, vr28 // t20 vssub.h vr30, vr27, vr28 // t21 vssub.h vr19, vr24, vr25 // t22 vsadd.h vr26, vr24, vr25 // t23 vsadd.h vr27, vr6, vr5 // t24 vssub.h vr28, vr6, vr5 // t25 vssub.h vr24, vr3, vr2 // t26 vsadd.h vr25, vr3, vr2 // t27 vsadd.h vr5, vr4, vr7 // t28 vssub.h vr6, vr4, vr7 // t29 vssub.h vr2, vr9, vr0 // t30 vsadd.h 
vr3, vr9, vr0 // t31 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 vssrarni.h.w vr7, vr4, 12 // t30a vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0 vssrarni.h.w vr0, vr4, 12 // t17a vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 vneg.w vr4, vr4 vneg.w vr9, vr9 vssrarni.h.w vr9, vr4, 12 // t18a vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2 vssrarni.h.w vr2, vr4, 12 // t29a vldrepl.w vr20, t0, 24 // 3406 -> 1703 vldrepl.w vr21, t0, 28 // 2276 -> 1138 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 vssrarni.h.w vr29, vr4, 12 // t26a vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6 vssrarni.h.w vr6, vr4, 12 // t21a vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 vneg.w vr4, vr4 vneg.w vr8, vr8 vssrarni.h.w vr8, vr4, 12 // t22a vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24 vssrarni.h.w vr24, vr4, 12 // t25a vsadd.h vr4, vr1, vr31 // t16a vssub.h vr30, vr1, vr31 // t19a vsadd.h vr19, vr0, vr9 // t17 vssub.h vr28, vr0, vr9 // t18 vssub.h vr1, vr26, vr10 // t20a vsadd.h vr31, vr26, vr10 // t23a vssub.h vr0, vr8, vr6 // t21 vsadd.h vr9, vr8, vr6 // t22 vsadd.h vr10, vr27, vr25 // t24a vssub.h vr26, vr27, vr25 // t27a vsadd.h vr6, vr24, vr29 // t25 vssub.h vr8, vr24, vr29 // t26 vssub.h vr25, vr3, vr5 // t28a vsadd.h vr27, vr3, vr5 // t31a vssub.h vr24, vr7, vr2 // t29 vsadd.h vr29, vr7, vr2 // t30 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 vssrarni.h.w vr5, vr3, 12 // t29a vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2 vssrarni.h.w vr2, vr3, 12 // 18a vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 vssrarni.h.w vr7, vr3, 12 // t28 vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24 vssrarni.h.w vr24, vr3, 12 // t19 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 vneg.w vr3, vr3 vneg.w vr28, vr28 vssrarni.h.w vr28, vr3, 12 // t20 vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25 vssrarni.h.w vr25, vr3, 12 // t27 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 vneg.w vr3, vr3 vneg.w vr30, vr30 vssrarni.h.w vr30, vr3, 12 // t21a vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1 vssrarni.h.w vr1, vr3, 12 // t26a vsadd.h vr3, vr4, vr31 // t16 vssub.h vr26, vr4, vr31 // t23 vsadd.h vr0, vr19, vr9 // t17a vssub.h vr8, vr19, vr9 // t22a vsadd.h vr4, vr2, vr30 // t18 vssub.h vr31, vr2, vr30 // t21 vsadd.h vr9, vr24, vr28 // t19a vssub.h vr19, vr24, vr28 // t20a vssub.h vr2, vr27, vr10 // t24 vsadd.h vr30, vr27, vr10 // t31 vssub.h vr24, vr29, vr6 // t25a vsadd.h vr28, vr29, vr6 // t30a vssub.h vr10, vr5, vr1 // t26 vsadd.h vr27, vr5, vr1 // t29 vssub.h vr6, vr7, vr25 // t27a vsadd.h vr29, vr7, vr25 // t28a vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 vssrarni.h.w vr5, vr1, 12 // t20 vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7 vssrarni.h.w vr7, vr1, 12 // t27 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 vssrarni.h.w vr25, vr1, 12 // t21a vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6 vssrarni.h.w vr6, vr1, 12 // t26a vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 vssrarni.h.w vr19, vr1, 12 // t22 vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10 vssrarni.h.w vr10, vr1, 12 // t25 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 vssrarni.h.w vr31, vr1, 12 // t23a vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8 vssrarni.h.w vr8, vr1, 12 // t24a // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr30 // c[0] vssub.h vr2, 
vr11, vr30 // c[31] vsadd.h vr24, vr12, vr28 // c[1] vssub.h vr26, vr12, vr28 // c[30] vsadd.h vr11, vr13, vr27 // c[2] vssub.h vr30, vr13, vr27 // c[29] vsadd.h vr12, vr14, vr29 // c[3] vssub.h vr28, vr14, vr29 // c[28] vsadd.h vr13, vr15, vr7 // c[4] vssub.h vr27, vr15, vr7 // c[27] vsadd.h vr14, vr16, vr6 // c[5] vssub.h vr29, vr16, vr6 // c[26] vsadd.h vr7, vr17, vr10 // c[6] vssub.h vr15, vr17, vr10 // c[25] vsadd.h vr6, vr18, vr8 // c[7] vssub.h vr16, vr18, vr8 // c[24] .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, 4 .endr vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vld_x8 t3, 256, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr31 // c[8] vssub.h vr2, vr11, vr31 // c[23] vsadd.h vr24, vr12, vr19 // c[9] vssub.h vr26, vr12, vr19 // c[22] vsadd.h vr11, vr13, vr25 // c[10] vssub.h vr30, vr13, vr25 // c[21] vsadd.h vr12, vr14, vr5 // c[11] vssub.h vr28, vr14, vr5 // c[20] vsadd.h vr13, vr15, vr9 // c[12] vssub.h vr27, vr15, vr9 // c[19] vsadd.h vr14, vr16, vr4 // c[13] vssub.h vr29, vr16, vr4 // c[18] vsadd.h vr7, vr17, vr0 // c[14] vssub.h vr15, vr17, vr0 // c[17] vsadd.h vr6, vr18, vr3 // c[15] vssub.h vr16, vr18, vr3 // c[16] .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, 4 .endr vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 alsl.d t2, a1, a0, 1 addi.d t3, sp, 64 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, sp, 64+64 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, sp, 64+256 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, t3, 64 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, sp, 64+384 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, t3, 64 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, sp, 64+128 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, t3, 64 alsl.d a0, a1, a0, 2 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 free_space 512 .DCT_DCT_8X32_END: endfunc .macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \ vst_start3, transpose8x8, shift // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 // in1 in3 in5 in7 in9 in11 in13 in15 // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 // in17 in19 in21 in23 in25 in27 in29 in31 la.local t0, idct_coeffs vldrepl.w vr20, t0, 64 // 201 vldrepl.w vr21, t0, 68 // 4091 vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 vssrarni.h.w vr9, vr8, 12 // t31a vssrarni.h.w vr10, vr11, 12 // t16a vldrepl.w vr20, t0, 72 // 3035 vldrepl.w vr21, t0, 76 // 2751 vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0 vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, 
vr30 vssrarni.h.w vr0, vr8, 12 // t30a vssrarni.h.w vr30, vr11, 12 // t17a vldrepl.w vr20, t0, 80 // 1751 vldrepl.w vr21, t0, 84 // 3703 vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19 vssrarni.h.w vr7, vr8, 12 // t29a vssrarni.h.w vr19, vr11, 12 // t18a vldrepl.w vr20, t0, 88 // 3857 vldrepl.w vr21, t0, 92 // 1380 vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26 vssrarni.h.w vr4, vr8, 12 // t28a vssrarni.h.w vr26, vr11, 12 // t19a vldrepl.w vr20, t0, 96 // 995 vldrepl.w vr21, t0, 100 // 3973 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 vssrarni.h.w vr3, vr8, 12 // t27a vssrarni.h.w vr27, vr11, 12 // t20a vldrepl.w vr20, t0, 104 // 3513 vldrepl.w vr21, t0, 108 // 2106 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 vssrarni.h.w vr2, vr8, 12 // t26a vssrarni.h.w vr28, vr11, 12 // t21a vldrepl.w vr20, t0, 112 // 2440 -> 1220 vldrepl.w vr21, t0, 116 // 3290 -> 1645 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 vssrarni.h.w vr5, vr8, 12 // t25a vssrarni.h.w vr25, vr11, 12 // t22a vldrepl.w vr20, t0, 120 // 4052 vldrepl.w vr21, t0, 124 // 601 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 vssrarni.h.w vr6, vr8, 12 // t24a vssrarni.h.w vr24, vr11, 12 // t23a vsadd.h vr1, vr10, vr30 // t16 vssub.h vr29, vr10, vr30 // t17 vssub.h vr8, vr26, vr19 // t18 vsadd.h vr31, vr26, vr19 // t19 vsadd.h vr10, vr27, vr28 // t20 vssub.h vr30, vr27, vr28 // t21 vssub.h vr19, vr24, vr25 // t22 vsadd.h vr26, vr24, vr25 // t23 vsadd.h vr27, vr6, vr5 // t24 vssub.h vr28, vr6, vr5 // t25 vssub.h vr24, vr3, vr2 // t26 vsadd.h vr25, vr3, vr2 // t27 vsadd.h vr5, vr4, vr7 // t28 vssub.h vr6, vr4, vr7 // t29 vssub.h vr2, vr9, vr0 // t30 vsadd.h vr3, vr9, vr0 // t31 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 vssrarni.h.w vr7, vr4, 12 // t30a vssrarni.h.w vr0, vr11, 12 // t17a vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 vneg.w vr4, vr4 vneg.w vr9, vr9 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 vssrarni.h.w vr9, vr4, 12 // t18a vssrarni.h.w vr2, vr11, 12 // t29a vldrepl.w vr20, t0, 24 // 3406 -> 1703 vldrepl.w vr21, t0, 28 // 2276 -> 1138 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 vssrarni.h.w vr29, vr4, 12 // t26a vssrarni.h.w vr6, vr11, 12 // t21a vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 vneg.w vr4, vr4 vneg.w vr8, vr8 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 vssrarni.h.w vr8, vr4, 12 // t22a vssrarni.h.w vr24, vr11, 12 // t25a vsadd.h vr4, vr1, vr31 // t16a vssub.h vr30, vr1, vr31 // t19a vsadd.h vr19, vr0, vr9 // t17 vssub.h vr28, vr0, vr9 // t18 vssub.h vr1, vr26, vr10 // t20a vsadd.h vr31, vr26, vr10 // t23a vssub.h vr0, vr8, vr6 // t21 vsadd.h vr9, vr8, vr6 // t22 vsadd.h vr10, vr27, vr25 // t24a vssub.h vr26, vr27, vr25 // t27a vsadd.h vr6, vr24, vr29 // t25 vssub.h vr8, vr24, vr29 // t26 vssub.h vr25, vr3, vr5 // t28a vsadd.h vr27, vr3, vr5 // t31a vssub.h vr24, vr7, vr2 // t29 vsadd.h vr29, vr7, vr2 // t30 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 vssrarni.h.w vr5, vr3, 12 // t29a vssrarni.h.w vr2, vr11, 12 // 18a vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 
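// (The vmul_vmadd_w just above and the vmul_vmsub_w just below form one
// 1567/3784 rotation pair, producing t28/t19.) For the t20/t27 and t21a/t26a
// pairs that follow, the rotation result is needed with a flipped sign; instead
// of loading negated constants, the code negates the 32-bit intermediates with
// vneg.w before the rounding vssrarni.h.w narrow.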
vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 vssrarni.h.w vr7, vr3, 12 // t28 vssrarni.h.w vr24, vr11, 12 // t19 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 vneg.w vr3, vr3 vneg.w vr28, vr28 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 vssrarni.h.w vr28, vr3, 12 // t20 vssrarni.h.w vr25, vr11, 12 // t27 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 vneg.w vr3, vr3 vneg.w vr30, vr30 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 vssrarni.h.w vr30, vr3, 12 // t21a vssrarni.h.w vr1, vr11, 12 // t26a vsadd.h vr3, vr4, vr31 // t16 vssub.h vr26, vr4, vr31 // t23 vsadd.h vr0, vr19, vr9 // t17a vssub.h vr8, vr19, vr9 // t22a vsadd.h vr4, vr2, vr30 // t18 vssub.h vr31, vr2, vr30 // t21 vsadd.h vr9, vr24, vr28 // t19a vssub.h vr19, vr24, vr28 // t20a vssub.h vr2, vr27, vr10 // t24 vsadd.h vr30, vr27, vr10 // t31 vssub.h vr24, vr29, vr6 // t25a vsadd.h vr28, vr29, vr6 // t30a vssub.h vr10, vr5, vr1 // t26 vsadd.h vr27, vr5, vr1 // t29 vssub.h vr6, vr7, vr25 // t27a vsadd.h vr29, vr7, vr25 // t28a vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 vssrarni.h.w vr5, vr1, 12 // t20 vssrarni.h.w vr7, vr11, 12 // t27 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 vssrarni.h.w vr25, vr1, 12 // t21a vssrarni.h.w vr6, vr11, 12 // t26a vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 vssrarni.h.w vr19, vr1, 12 // t22 vssrarni.h.w vr10, vr11, 12 // t25 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 vssrarni.h.w vr31, vr1, 12 // t23a vssrarni.h.w vr8, vr11, 12 // t24a // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr30 // c[0] vssub.h vr2, vr11, vr30 // c[31] vsadd.h vr24, vr12, vr28 // c[1] vssub.h vr26, vr12, vr28 // c[30] vsadd.h vr11, vr13, vr27 // c[2] vssub.h vr30, vr13, vr27 // c[29] vsadd.h vr12, vr14, vr29 // c[3] vssub.h vr28, vr14, vr29 // c[28] vsadd.h vr13, vr15, vr7 // c[4] vssub.h vr27, vr15, vr7 // c[27] vsadd.h vr14, vr16, vr6 // c[5] vssub.h vr29, vr16, vr6 // c[26] vsadd.h vr7, vr17, vr10 // c[6] vssub.h vr15, vr17, vr10 // c[25] vsadd.h vr6, vr18, vr8 // c[7] vssub.h vr16, vr18, vr8 // c[24] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr31 // c[8] vssub.h vr2, vr11, vr31 // c[23] vsadd.h vr24, vr12, vr19 // c[9] vssub.h vr26, vr12, vr19 // c[22] vsadd.h vr11, vr13, vr25 // c[10] vssub.h vr30, vr13, vr25 // c[21] vsadd.h vr12, vr14, vr5 // c[11] vssub.h vr28, vr14, vr5 // c[20] vsadd.h vr13, vr15, vr9 // c[12] vssub.h vr27, vr15, vr9 // 
c[19] vsadd.h vr14, vr16, vr4 // c[13] vssub.h vr29, vr16, vr4 // c[18] vsadd.h vr7, vr17, vr0 // c[14] vssub.h vr15, vr17, vr0 // c[17] vsadd.h vr6, vr18, vr3 // c[15] vssub.h vr16, vr18, vr3 // c[16] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 .endm function inv_txfm_add_dct_dct_32x32_8bpc_lsx bnez a3, .NO_HAS_DCONLY_32x32 ld.h t2, a2, 0 // dc vldi vr0, 0x8b5 // 181 vreplgr2vr.w vr1, t2 vldi vr20, 0x880 // 128 vmul.w vr2, vr0, vr1 // dc * 181 st.h zero, a2, 0 add.d t0, a0, a1 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 vld vr3, t0, 16 vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift vld vr1, a0, 16 vmadd.w vr20, vr2, vr0 vld vr2, t0, 0 vssrarni.h.w vr20, vr20, 12 vld vr0, a0, 0 vsllwil.hu.bu vr4, vr0, 0 vsllwil.hu.bu vr5, vr1, 0 vsllwil.hu.bu vr6, vr2, 0 vsllwil.hu.bu vr7, vr3, 0 vexth.hu.bu vr0, vr0 vexth.hu.bu vr1, vr1 vexth.hu.bu vr2, vr2 vexth.hu.bu vr3, vr3 vadd.h vr8, vr4, vr20 vadd.h vr9, vr0, vr20 vadd.h vr10, vr5, vr20 vadd.h vr11, vr1, vr20 vadd.h vr12, vr6, vr20 vadd.h vr13, vr2, vr20 vadd.h vr14, vr7, vr20 vadd.h vr15, vr3, vr20 vssrani.bu.h vr9, vr8, 0 vssrani.bu.h vr11, vr10, 0 vssrani.bu.h vr13, vr12, 0 vssrani.bu.h vr15, vr14, 0 vst vr9, a0, 0 vst vr11, a0, 16 vst vr13, t0, 0 vst vr15, t0, 16 .rept 15 alsl.d a0, a1, a0, 1 add.d t0, a0, a1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, t0, 0 vld vr3, t0, 16 vsllwil.hu.bu vr4, vr0, 0 vsllwil.hu.bu vr5, vr1, 0 vsllwil.hu.bu vr6, vr2, 0 vsllwil.hu.bu vr7, vr3, 0 vexth.hu.bu vr0, vr0 vexth.hu.bu vr1, vr1 vexth.hu.bu vr2, vr2 vexth.hu.bu vr3, vr3 vadd.h vr8, vr4, vr20 vadd.h vr9, vr0, vr20 vadd.h vr10, vr5, vr20 vadd.h vr11, vr1, vr20 vadd.h vr12, vr6, vr20 vadd.h vr13, vr2, vr20 vadd.h vr14, vr7, vr20 vadd.h vr15, vr3, vr20 vssrani.bu.h vr9, vr8, 0 vssrani.bu.h vr11, vr10, 0 vssrani.bu.h vr13, vr12, 0 vssrani.bu.h vr15, vr14, 0 vst vr9, a0, 0 vst vr11, a0, 16 vst vr13, t0, 0 vst vr15, t0, 16 .endr b .DCT_DCT_32X32_END .NO_HAS_DCONLY_32x32: malloc_space 2560 // 32*32*2+512 addi.d t1, sp, 64 addi.d t2, a2, 0 addi.d t3, sp, 1024 addi.d t3, t3, 1024 addi.d t3, t3, 64 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 .rept 3 addi.d t2, t2, 16 addi.d t1, t1, 512 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 
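// One 8-lane strip of the 32x32 first pass: the even-indexed inputs
// (in0, in2, ..., in30, loaded with a 128-byte stride from offset 0) were just
// run through dct_8x16_core_lsx into the scratch at t3, and the odd-indexed
// inputs (in1, in3, ..., in31, loaded above from offset 64) feed
// dct_8x32_core_lsx, which finishes the 32-point butterflies, transposes,
// applies the (x + 2) >> 2 rounding and stores the strip to the sp buffer at
// t1. The .rept 3 below repeats this for the remaining three strips; the second
// pass further down redoes the same even/odd split over the sp buffer with an
// (x + 8) >> 4 rounding and no transpose, before the final add-to-destination
// loop clamps the results to 8 bits.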
dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 .endr vreplgr2vr.h vr31, zero .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 vst vr31, a2, \i .endr addi.d t2, sp, 64 addi.d t1, sp, 64 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 .rept 3 addi.d t2, t2, 16 addi.d t1, t1, 16 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x16_core_lsx vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 .endr addi.d t2, sp, 64 .rept 16 add.d t0, a0, a1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, t0, 0 vld vr3, t0, 16 vsllwil.hu.bu vr4, vr0, 0 vsllwil.hu.bu vr5, vr1, 0 vsllwil.hu.bu vr6, vr2, 0 vsllwil.hu.bu vr7, vr3, 0 vexth.hu.bu vr0, vr0 vexth.hu.bu vr1, vr1 vexth.hu.bu vr2, vr2 vexth.hu.bu vr3, vr3 vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vadd.h vr8, vr4, vr8 vadd.h vr9, vr0, vr9 vadd.h vr10, vr5, vr10 vadd.h vr11, vr1, vr11 vadd.h vr12, vr6, vr12 vadd.h vr13, vr2, vr13 vadd.h vr14, vr7, vr14 vadd.h vr15, vr3, vr15 vssrani.bu.h vr9, vr8, 0 vssrani.bu.h vr11, vr10, 0 vssrani.bu.h vr13, vr12, 0 vssrani.bu.h vr15, vr14, 0 vst vr9, a0, 0 vst vr11, a0, 16 vst vr13, t0, 0 vst vr15, t0, 16 alsl.d a0, a1, a0, 1 addi.d t2, t2, 128 .endr free_space 2560 // 32*32*2+512 .DCT_DCT_32X32_END: endfunc .macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7 // in0 in1 in2 in3 // dct4 in0 in2 la.local t0, idct_coeffs vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vsllwil.w.h vr22, \in2, 0 vexth.w.h vr23, \in2 vmul.w vr8, vr22, vr20 vmul.w vr10, vr23, vr20 vmul.w \in2, vr22, vr21 vmul.w vr9, vr23, vr21 vssrarni.h.w vr10, vr8, 12 // t2 vssrarni.h.w vr9, \in2, 12 // t3 vldrepl.w vr20, t0, 0 // 2896 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w vr8, vr22, vr20 vmul.w \in2, vr23, vr20 vssrarni.h.w \in2, vr8, 12 vsadd.h vr8, \in2, vr9 // c[0] vssub.h vr9, \in2, vr9 // c[3] vsadd.h \in0, \in2, vr10 // c[1] vssub.h vr10, \in2, vr10 // c[2] // inv_dct8_1d_internal_c tx64 // in1 in3 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vsllwil.w.h vr22, \in1, 0 vexth.w.h vr23, \in1 vmul.w \in2, vr22, vr21 vmul.w \in4, vr23, vr21 vmul.w \in1, vr22, vr20 vmul.w \in6, vr23, vr20 vssrarni.h.w \in4, \in2, 12 // t7a vssrarni.h.w \in6, \in1, 12 
// t4a vldrepl.w vr20, t0, 24 // 3406 vldrepl.w vr21, t0, 28 // 2276 vsllwil.w.h vr22, \in3, 0 vexth.w.h vr23, \in3 vneg.w vr21, vr21 vmul.w \in2, vr22, vr20 vmul.w \in1, vr23, vr20 vmul.w \in3, vr22, vr21 vmul.w \in7, vr23, vr21 vssrarni.h.w \in1, \in2, 12 // t6a vssrarni.h.w \in7, \in3, 12 // t5a vsadd.h \in3, \in6, \in7 // t4 vssub.h \in6, \in6, \in7 // t5a vsadd.h \in5, \in4, \in1 // t7 vssub.h \in4, \in4, \in1 // t6a vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1 vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 vssrarni.h.w \in1, vr21, 12 // t6 vssrarni.h.w \in7, \in2, 12 // t5 vsadd.h \out0, vr8, \in5 // c[0] vssub.h \out7, vr8, \in5 // c[7] vsadd.h \out1, \in0, \in1 // c[1] vssub.h \out6, \in0, \in1 // c[6] vsadd.h \out2, vr10, \in7 // c[2] vssub.h \out5, vr10, \in7 // c[5] vsadd.h \out3, vr9, \in3 // c[3] vssub.h \out4, vr9, \in3 // c[4] .endm .macro dct_8x16_tx64_core_lsx dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \ vr12, vr13, vr14, vr15, vr16, vr17, vr18 // in1 in3 in5 in7 in9 in11 in13 in15 // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30 la.local t0, idct_coeffs vldrepl.w vr20, t0, 32 // 401 vldrepl.w vr21, t0, 36 // 4076 vsllwil.w.h vr22, vr1, 0 vexth.w.h vr23, vr1 vmul.w vr0, vr22, vr21 vmul.w vr10, vr23, vr21 vmul.w vr1, vr22, vr20 vmul.w vr29, vr23, vr20 vssrarni.h.w vr10, vr0, 12 // t15a vssrarni.h.w vr29, vr1, 12 // t8a vldrepl.w vr20, t0, 40 // 3166 -> 1583 vldrepl.w vr21, t0, 44 // 2598 -> 1299 vsllwil.w.h vr22, vr7, 0 vexth.w.h vr23, vr7 vneg.w vr21, vr21 vmul.w vr0, vr22, vr20 vmul.w vr30, vr23, vr20 vmul.w vr7, vr22, vr21 vmul.w vr31, vr23, vr21 vssrarni.h.w vr30, vr0, 12 // t14a vssrarni.h.w vr31, vr7, 12 // t9a vldrepl.w vr20, t0, 48 // 1931 vldrepl.w vr21, t0, 52 // 3612 vsllwil.w.h vr22, vr5, 0 vexth.w.h vr23, vr5 vmul.w vr0, vr22, vr21 vmul.w vr24, vr23, vr21 vmul.w vr5, vr22, vr20 vmul.w vr25, vr23, vr20 vssrarni.h.w vr24, vr0, 12 // t13a vssrarni.h.w vr25, vr5, 12 // t10a vldrepl.w vr20, t0, 56 // 3920 vldrepl.w vr21, t0, 60 // 1189 vsllwil.w.h vr22, vr3, 0 vexth.w.h vr23, vr3 vneg.w vr21, vr21 vmul.w vr0, vr22, vr20 vmul.w vr26, vr23, vr20 vmul.w vr3, vr22, vr21 vmul.w vr27, vr23, vr21 vssrarni.h.w vr26, vr0, 12 // t12a vssrarni.h.w vr27, vr3, 12 // t11a // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 vsadd.h vr28, vr29, vr31 // t8 vssub.h vr19, vr29, vr31 // t9 vssub.h vr29, vr27, vr25 // t10 vsadd.h vr9, vr27, vr25 // t11 vsadd.h vr31, vr26, vr24 // t12 vssub.h vr25, vr26, vr24 // t13 vssub.h vr27, vr10, vr30 // t14 vsadd.h vr24, vr10, vr30 // t15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 vssrarni.h.w vr26, vr0, 12 // t14a vssrarni.h.w vr30, vr1, 12 // t9a vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 vneg.w vr0, vr0 vneg.w vr19, vr19 vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 vssrarni.h.w vr19, vr0, 12 // t10a vssrarni.h.w vr27, vr1, 12 // t13a vsadd.h vr25, vr28, vr9 // t8a vssub.h vr29, vr28, vr9 // t11a vssub.h vr28, vr24, vr31 // t12a vsadd.h vr10, vr24, vr31 // t15a vsadd.h vr9, vr30, vr19 // t9 vssub.h vr31, vr30, vr19 // t10 vssub.h vr30, vr26, vr27 // t13 vsadd.h vr24, vr26, vr27 // t14 vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 vssrarni.h.w vr26, vr0, 12 // t13a vssrarni.h.w vr27, vr1, 12 // t10a vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 vssrarni.h.w vr31, 
vr0, 12 // t12 vssrarni.h.w vr30, vr1, 12 // t11 // vr11 vr12 ... vr18 vsadd.h vr28, vr14, vr31 // c[3] vssub.h vr29, vr14, vr31 // c[12] vsadd.h vr20, vr15, vr30 // c[4] vssub.h vr21, vr15, vr30 // c[11] vsadd.h vr14, vr16, vr27 // c[5] vssub.h vr23, vr16, vr27 // c[10] vsadd.h vr15, vr17, vr9 // c[6] vssub.h vr30, vr17, vr9 // c[9] vsadd.h vr16, vr18, vr25 // c[7] vssub.h vr27, vr18, vr25 // c[8] vsadd.h vr17, vr13, vr26 // c[2] vssub.h vr26, vr13, vr26 // c[13] vsadd.h vr18, vr12, vr24 // c[1] vssub.h vr25, vr12, vr24 // c[14] vsadd.h vr22, vr11, vr10 // c[0] vssub.h vr24, vr11, vr10 // c[15] .endm // dct_8x16_tx64_core_lsx .macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \tmp0, vr22, \in1 vmul.w \out0, vr23, \in1 vmul.w \tmp1, vr22, \in2 vmul.w \out1, vr23, \in2 vssrarni.h.w \out0, \tmp0, 12 vssrarni.h.w \out1, \tmp1, 12 .endm const idct64_coeffs, align=4 .word 101, 4095, 2967, -2824 .word 1660, 3745, 3822, -1474 .word 4076, 401, 4017, 799 .word 4036, -700, 2359, 3349 .word 3461, -2191, 897, 3996 .word -3166, -2598, -799, -4017 .word 501, 4065, 3229, -2520 .word 2019, 3564, 3948, -1092 .word 3612, 1931, 2276, 3406 .word 4085, -301, 2675, 3102 .word 3659, -1842, 1285, 3889 .word -3920, -1189, -3406, -2276 endconst // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a .macro dct64_step1_lsx vldrepl.w vr20, t0, 0 // 101 vldrepl.w vr21, t0, 4 // 4095 vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a vldrepl.w vr20, t0, 8 // 2967 vldrepl.w vr21, t0, 12 // -2824 vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a vldrepl.w vr20, t0, 16 // 1660 vldrepl.w vr21, t0, 20 // 3745 vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a vldrepl.w vr20, t0, 24 // 3822 vldrepl.w vr21, t0, 28 // -1474 vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a vsadd.h vr0, vr8, vr11 // t32 vssub.h vr1, vr8, vr11 // t33 vssub.h vr2, vr15, vr12 // t34 vsadd.h vr3, vr15, vr12 // t35 vsadd.h vr4, vr14, vr13 // t60 vssub.h vr5, vr14, vr13 // t61 vssub.h vr6, vr9, vr10 // t62 vsadd.h vr7, vr9, vr10 // t63 vldrepl.w vr20, t0, 32 // 4076 vldrepl.w vr21, t0, 36 // 401 vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 vssrarni.h.w vr10, vr9, 12 // t62a vssrarni.h.w vr11, vr13, 12 // t33a vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 vneg.w vr9, vr9 vneg.w vr1, vr1 vssrarni.h.w vr6, vr13, 12 // t61a vssrarni.h.w vr1, vr9, 12 // t34a vsadd.h vr2, vr0, vr3 // t32a vssub.h vr5, vr0, vr3 // t35a vsadd.h vr9, vr11, vr1 // t33 vssub.h vr13, vr11, vr1 // t34 vssub.h vr0, vr7, vr4 // t60a vsadd.h vr3, vr7, vr4 // t63a vssub.h vr1, vr10, vr6 // t61 vsadd.h vr11, vr10, vr6 // t62 vldrepl.w vr20, t0, 40 // 4017 vldrepl.w vr21, t0, 44 // 799 vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7 vssrarni.h.w vr4, vr8, 12 // t61a vssrarni.h.w vr7, vr12, 12 // t34a vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 vssrarni.h.w vr6, vr8, 12 // t60 vssrarni.h.w vr10, vr12, 12 // t35 vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3 .endm // dct64_step1 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> 
t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a .macro dct64_step2_lsx vld vr0, t5, 0 // t32a vld vr2, t4, 0 // t63a vld vr3, t5, 16*8 // t56a vld vr1, t4, 16*8 // t39a vld vr4, t5, 16*16 // t40a vld vr6, t4, 16*16 // t55a vld vr7, t5, 16*24 // t48a vld vr5, t4, 16*24 // t47a vsadd.h vr8, vr0, vr1 // t32 vssub.h vr9, vr0, vr1 // t39 vsadd.h vr10, vr2, vr3 // t63 vssub.h vr11, vr2, vr3 // t56 vssub.h vr12, vr5, vr4 // t40 vsadd.h vr13, vr5, vr4 // t47 vsadd.h vr14, vr7, vr6 // t48 vssub.h vr15, vr7, vr6 // t55 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 vssrarni.h.w vr2, vr0, 12 // t56a vssrarni.h.w vr3, vr1, 12 // t39a vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 vneg.w vr0, vr0 vneg.w vr4, vr4 vssrarni.h.w vr5, vr1, 12 // t55a vssrarni.h.w vr4, vr0, 12 // t40a vsadd.h vr9, vr8, vr13 // t32a vssub.h vr11, vr8, vr13 // t47a vsadd.h vr6, vr3, vr4 // t39 vssub.h vr7, vr3, vr4 // t40 vssub.h vr12, vr10, vr14 // t48a vsadd.h vr15, vr10, vr14 // t63a vssub.h vr0, vr2, vr5 // t55 vsadd.h vr1, vr2, vr5 // t56 vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 vssrarni.h.w vr13, vr8, 12 // t40a vssrarni.h.w vr4, vr3, 12 // t55a vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 vssrarni.h.w vr10, vr8, 12 // t47 vssrarni.h.w vr14, vr3, 12 // t48 // t32a t39 t40a t47 t48 t55a t56 t63a // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 vst vr9, t5, 0 // t32a vst vr6, t4, 0 // t39 vst vr13, t5, 16*8 // t40a vst vr10, t4, 16*8 // t47 vst vr14, t5, 16*16 // t48 vst vr4, t4, 16*16 // t55a vst vr1, t5, 16*24 // t56 vst vr15, t4, 16*24 // t63a .endm // dct64_step2_lsx .macro dct64_step3_lsx // t0 t1 t2 t3 t4 t5 t6 t7 vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 vld vr9, t5, 16*24 // t56 vld vr6, t5, 16*24+16 // t57a vld vr13, t5, 16*24+32 // t58 vld vr10, t5, 16*24+48 // t59a vld vr14, t4, 16*24-48 // t60 vld vr4, t4, 16*24-32 // t61a vld vr1, t4, 16*24-16 // t62 vld vr15, t4, 16*24 // t63a vsadd.h vr20, vr2, vr15 // c[0] vssub.h vr21, vr2, vr15 // c[63] vsadd.h vr22, vr3, vr1 // c[1] vssub.h vr23, vr3, vr1 // c[62] vsadd.h vr24, vr7, vr4 // c[2] vssub.h vr25, vr7, vr4 // c[61] vsadd.h vr26, vr8, vr14 // c[3] vssub.h vr27, vr8, vr14 // c[60] vsadd.h vr28, vr11, vr10 // c[4] vssub.h vr29, vr11, vr10 // c[59] vsadd.h vr30, vr12, vr13 // c[5] vssub.h vr31, vr12, vr13 // c[58] vsadd.h vr2, vr16, vr6 // c[6] vssub.h vr15, vr16, vr6 // c[57] vsadd.h vr1, vr17, vr9 // c[7] vssub.h vr3, vr17, vr9 // c[56] .endm // dct64_step3_lsx .macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 dct64_step3_lsx .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 .endif .ifnb \shift .irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 vsrari.h \i, \i, \shift .endr .endif vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 .endm // dct64_step4_lsx .macro dct64_step5_lsx in0, in1, in2, 
.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7
    fld.d f4, t0, 0
    fldx.d f5, t0, a1
    fld.d f6, t6, 0
    fldx.d f7, t6, a1
    alsl.d t0, a1, t0, 2
    alsl.d t6, a1, t6, 2
    fld.d f8, t0, 0
    fldx.d f9, t0, a1
    fld.d f10, t6, 0
    fldx.d f11, t6, a1

.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
    vsllwil.hu.bu \i, \i, 0
.endr

    vsrari.h vr20, \in0, 4
    vsrari.h vr22, \in1, 4
    vsrari.h vr24, \in2, 4
    vsrari.h vr26, \in3, 4
    vsrari.h vr28, \in4, 4
    vsrari.h vr30, \in5, 4
    vsrari.h vr2, \in6, 4
    vsrari.h vr1, \in7, 4

    vadd.h vr4, vr4, vr20
    vadd.h vr5, vr5, vr22
    vadd.h vr6, vr6, vr24
    vadd.h vr7, vr7, vr26
    vadd.h vr8, vr8, vr28
    vadd.h vr9, vr9, vr30
    vadd.h vr10, vr10, vr2
    vadd.h vr11, vr11, vr1

    vssrani.bu.h vr5, vr4, 0
    vssrani.bu.h vr7, vr6, 0
    vssrani.bu.h vr9, vr8, 0
    vssrani.bu.h vr11, vr10, 0

    vstelm.d vr5, t1, 0, 0
    vstelm.d vr5, t2, 0, 1
    alsl.d t1, a1, t1, 1
    alsl.d t2, a1, t2, 1
    vstelm.d vr7, t1, 0, 0
    vstelm.d vr7, t2, 0, 1
    alsl.d t1, a1, t1, 1
    alsl.d t2, a1, t2, 1
    vstelm.d vr9, t1, 0, 0
    vstelm.d vr9, t2, 0, 1
    alsl.d t1, a1, t1, 1
    alsl.d t2, a1, t2, 1
    vstelm.d vr11, t1, 0, 0
    vstelm.d vr11, t2, 0, 1
.endm // dct64_step5_lsx
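// dct_8x32_tx64_new_lsx: 32-point inverse DCT core for the tx64 path below.
// The first eight loaded vectors go through dct_8x16_tx64_core_lsx and are
// spilled to t3; the next eight are scaled by the coefficients at
// idct_coeffs+64 onwards and combined through the t16..t31 butterflies into
// c[0]..c[31].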
.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1
    vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x16_tx64_core_lsx

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local t0, idct_coeffs

    vldrepl.w vr20, t0, 64 // 201
    vldrepl.w vr21, t0, 68 // 4091
    vsllwil.w.h vr22, vr0, 0
    vexth.w.h vr23, vr0
    vmul.w vr8, vr22, vr21
    vmul.w vr9, vr23, vr21
    vmul.w vr0, vr22, vr20
    vmul.w vr10, vr23, vr20
    vssrarni.h.w vr9, vr8, 12  // t31a
    vssrarni.h.w vr10, vr0, 12 // t16a

    vldrepl.w vr20, t0, 72 // 3035
    vldrepl.w vr21, t0, 76 // 2751
    vsllwil.w.h vr22, vr7, 0
    vexth.w.h vr23, vr7
    vneg.w vr21, vr21
    vmul.w vr8, vr22, vr20
    vmul.w vr0, vr23, vr20
    vmul.w vr7, vr22, vr21
    vmul.w vr30, vr23, vr21
    vssrarni.h.w vr0, vr8, 12  // t30a
    vssrarni.h.w vr30, vr7, 12 // t17a

    vldrepl.w vr20, t0, 80 // 1751
    vldrepl.w vr21, t0, 84 // 3703
    vsllwil.w.h vr22, vr4, 0
    vexth.w.h vr23, vr4
    vmul.w vr8, vr22, vr21
    vmul.w vr7, vr23, vr21
    vmul.w vr4, vr22, vr20
    vmul.w vr19, vr23, vr20
    vssrarni.h.w vr7, vr8, 12  // t29a
    vssrarni.h.w vr19, vr4, 12 // t18a

    vldrepl.w vr20, t0, 88 // 3857
    vldrepl.w vr21, t0, 92 // 1380
    vsllwil.w.h vr22, vr3, 0
    vexth.w.h vr23, vr3
    vneg.w vr21, vr21
    vmul.w vr8, vr22, vr20
    vmul.w vr4, vr23, vr20
    vmul.w vr3, vr22, vr21
    vmul.w vr26, vr23, vr21
    vssrarni.h.w vr4, vr8, 12  // t28a
    vssrarni.h.w vr26, vr3, 12 // t19a

    vldrepl.w vr20, t0, 96  // 995
    vldrepl.w vr21, t0, 100 // 3973
    vsllwil.w.h vr22, vr2, 0
    vexth.w.h vr23, vr2
    vmul.w vr8, vr22, vr21
    vmul.w vr3, vr23, vr21
    vmul.w vr2, vr22, vr20
    vmul.w vr27, vr23, vr20
    vssrarni.h.w vr3, vr8, 12  // t27a
    vssrarni.h.w vr27, vr2, 12 // t20a

    vldrepl.w vr20, t0, 104 // 3513
    vldrepl.w vr21, t0, 108 // 2106
    vsllwil.w.h vr22, vr5, 0
    vexth.w.h vr23, vr5
    vneg.w vr21, vr21
    vmul.w vr8, vr22, vr20
    vmul.w vr2, vr23, vr20
    vmul.w vr5, vr22, vr21
    vmul.w vr28, vr23, vr21
    vssrarni.h.w vr2, vr8, 12  // t26a
    vssrarni.h.w vr28, vr5, 12 // t21a

    vldrepl.w vr20, t0, 112 // 2440 -> 1220
    vldrepl.w vr21, t0, 116 // 3290 -> 1645
    vsllwil.w.h vr22, vr6, 0
    vexth.w.h vr23, vr6
    vmul.w vr8, vr22, vr21
    vmul.w vr5, vr23, vr21
    vmul.w vr6, vr22, vr20
    vmul.w vr25, vr23, vr20
    vssrarni.h.w vr5, vr8, 12  // t25a
    vssrarni.h.w vr25, vr6, 12 // t22a

    vldrepl.w vr20, t0, 120 // 4052
    vldrepl.w vr21, t0, 124 // 601
    vsllwil.w.h vr22, vr1, 0
    vexth.w.h vr23, vr1
    vneg.w vr21, vr21
    vmul.w vr8, vr22, vr20
    vmul.w vr6, vr23, vr20
    vmul.w vr1, vr22, vr21
    vmul.w vr24, vr23, vr21
    vssrarni.h.w vr6, vr8, 12  // t24a
    vssrarni.h.w vr24, vr1, 12 // t23a

    vsadd.h vr1, vr10, vr30  // t16
    vssub.h vr29, vr10, vr30 // t17
    vssub.h vr8, vr26, vr19  // t18
    vsadd.h vr31, vr26, vr19 // t19
    vsadd.h vr10, vr27, vr28 // t20
    vssub.h vr30, vr27, vr28 // t21
    vssub.h vr19, vr24, vr25 // t22
    vsadd.h vr26, vr24, vr25 // t23
    vsadd.h vr27, vr6, vr5   // t24
    vssub.h vr28, vr6, vr5   // t25
    vssub.h vr24, vr3, vr2   // t26
    vsadd.h vr25, vr3, vr2   // t27
    vsadd.h vr5, vr4, vr7    // t28
    vssub.h vr6, vr4, vr7    // t29
    vssub.h vr2, vr9, vr0    // t30
    vsadd.h vr3, vr9, vr0    // t31

    vldrepl.w vr20, t0, 16 // 799
    vldrepl.w vr21, t0, 20 // 4017
    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
    vssrarni.h.w vr7, vr4, 12  // t30a
    vssrarni.h.w vr0, vr11, 12 // t17a
    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
    vneg.w vr4, vr4
    vneg.w vr9, vr9
    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
    vssrarni.h.w vr9, vr4, 12  // t18a
    vssrarni.h.w vr2, vr11, 12 // t29a

    vldrepl.w vr20, t0, 24 // 3406 -> 1703
    vldrepl.w vr21, t0, 28 // 2276 -> 1138
    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
    vssrarni.h.w vr29, vr4, 12 // t26a
    vssrarni.h.w vr6, vr11, 12 // t21a
    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
    vneg.w vr4, vr4
    vneg.w vr8, vr8
    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
    vssrarni.h.w vr8, vr4, 12   // t22a
    vssrarni.h.w vr24, vr11, 12 // t25a

    vsadd.h vr4, vr1, vr31   // t16a
    vssub.h vr30, vr1, vr31  // t19a
    vsadd.h vr19, vr0, vr9   // t17
    vssub.h vr28, vr0, vr9   // t18
    vssub.h vr1, vr26, vr10  // t20a
    vsadd.h vr31, vr26, vr10 // t23a
    vssub.h vr0, vr8, vr6    // t21
    vsadd.h vr9, vr8, vr6    // t22
    vsadd.h vr10, vr27, vr25 // t24a
    vssub.h vr26, vr27, vr25 // t27a
    vsadd.h vr6, vr24, vr29  // t25
    vssub.h vr8, vr24, vr29  // t26
    vssub.h vr25, vr3, vr5   // t28a
    vsadd.h vr27, vr3, vr5   // t31a
    vssub.h vr24, vr7, vr2   // t29
    vsadd.h vr29, vr7, vr2   // t30

    vldrepl.w vr20, t0, 8  // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w vr5, vr3, 12  // t29a
    vssrarni.h.w vr2, vr11, 12 // t18a
    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w vr7, vr3, 12   // t28
    vssrarni.h.w vr24, vr11, 12 // t19
    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w vr3, vr3
    vneg.w vr28, vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w vr28, vr3, 12  // t20
    vssrarni.h.w vr25, vr11, 12 // t27
    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w vr3, vr3
    vneg.w vr30, vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w vr30, vr3, 12 // t21a
    vssrarni.h.w vr1, vr11, 12 // t26a

    vsadd.h vr3, vr4, vr31   // t16
    vssub.h vr26, vr4, vr31  // t23
    vsadd.h vr0, vr19, vr9   // t17a
    vssub.h vr8, vr19, vr9   // t22a
    vsadd.h vr4, vr2, vr30   // t18
    vssub.h vr31, vr2, vr30  // t21
    vsadd.h vr9, vr24, vr28  // t19a
    vssub.h vr19, vr24, vr28 // t20a
    vssub.h vr2, vr27, vr10  // t24
    vsadd.h vr30, vr27, vr10 // t31
    vssub.h vr24, vr29, vr6  // t25a
    vsadd.h vr28, vr29, vr6  // t30a
    vssub.h vr10, vr5, vr1   // t26
    vsadd.h vr27, vr5, vr1   // t29
    vssub.h vr6, vr7, vr25   // t27a
    vsadd.h vr29, vr7, vr25  // t28a
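    // Final rotations by 2896/4096 (1/sqrt(2)) applied to the t20..t27 terms
    // before the closing c[0]..c[31] butterflies.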
    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w vr5, vr1, 12  // t20
    vssrarni.h.w vr7, vr11, 12 // t27
    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w vr25, vr1, 12 // t21a
    vssrarni.h.w vr6, vr11, 12 // t26a
    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w vr19, vr1, 12  // t22
    vssrarni.h.w vr10, vr11, 12 // t25
    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w vr31, vr1, 12 // t23a
    vssrarni.h.w vr8, vr11, 12 // t24a

    // t31  t30a t29  t28a t27 t26a t25  t24a t23a t22  t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6  vr10 vr8  vr31 vr19 vr25 vr5 vr9  vr4 vr0  vr3

    vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h vr1, vr11, vr30  // c[0]
    vssub.h vr2, vr11, vr30  // c[31]
    vsadd.h vr24, vr12, vr28 // c[1]
    vssub.h vr26, vr12, vr28 // c[30]
    vsadd.h vr11, vr13, vr27 // c[2]
    vssub.h vr30, vr13, vr27 // c[29]
    vsadd.h vr12, vr14, vr29 // c[3]
    vssub.h vr28, vr14, vr29 // c[28]
    vsadd.h vr13, vr15, vr7  // c[4]
    vssub.h vr27, vr15, vr7  // c[27]
    vsadd.h vr14, vr16, vr6  // c[5]
    vssub.h vr29, vr16, vr6  // c[26]
    vsadd.h vr7, vr17, vr10  // c[6]
    vssub.h vr15, vr17, vr10 // c[25]
    vsadd.h vr6, vr18, vr8   // c[7]
    vssub.h vr16, vr18, vr8  // c[24]

    vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h vr1, vr11, vr31  // c[8]
    vssub.h vr2, vr11, vr31  // c[23]
    vsadd.h vr24, vr12, vr19 // c[9]
    vssub.h vr26, vr12, vr19 // c[22]
    vsadd.h vr11, vr13, vr25 // c[10]
    vssub.h vr30, vr13, vr25 // c[21]
    vsadd.h vr12, vr14, vr5  // c[11]
    vssub.h vr28, vr14, vr5  // c[20]
    vsadd.h vr13, vr15, vr9  // c[12]
    vssub.h vr27, vr15, vr9  // c[19]
    vsadd.h vr14, vr16, vr4  // c[13]
    vssub.h vr29, vr16, vr4  // c[18]
    vsadd.h vr7, vr17, vr0   // c[14]
    vssub.h vr15, vr17, vr0  // c[17]
    vsadd.h vr6, vr18, vr3   // c[15]
    vssub.h vr16, vr18, vr3  // c[16]

    vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
.endm // dct_8x32_tx64_new_lsx

function inv_txfm_add_dct_dct_64x64_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_64x64

    ld.h t2, a2, 0
    vldi vr0, 0x8b5
    vreplgr2vr.w vr1, t2
    vldi vr20, 0x880
    vmul.w vr2, vr0, vr1
    st.h zero, a2, 0
    vsrari.w vr2, vr2, 8
    vld vr3, a0, 48
    vsrari.w vr2, vr2, 2
    vld vr1, a0, 16
    vmadd.w vr20, vr2, vr0
    vld vr2, a0, 32
    vssrarni.h.w vr20, vr20, 12
    vld vr0, a0, 0

    vsllwil.hu.bu vr4, vr0, 0
    vsllwil.hu.bu vr5, vr1, 0
    vsllwil.hu.bu vr6, vr2, 0
    vsllwil.hu.bu vr7, vr3, 0
    vexth.hu.bu vr0, vr0
    vexth.hu.bu vr1, vr1
    vexth.hu.bu vr2, vr2
    vexth.hu.bu vr3, vr3
    vadd.h vr8, vr4, vr20
    vadd.h vr9, vr0, vr20
    vadd.h vr10, vr5, vr20
    vadd.h vr11, vr1, vr20
    vadd.h vr12, vr6, vr20
    vadd.h vr13, vr2, vr20
    vadd.h vr14, vr7, vr20
    vadd.h vr15, vr3, vr20
    vssrani.bu.h vr9, vr8, 0
    vssrani.bu.h vr11, vr10, 0
    vssrani.bu.h vr13, vr12, 0
    vssrani.bu.h vr15, vr14, 0
    vst vr9, a0, 0
    vst vr11, a0, 16
    vst vr13, a0, 32
    vst vr15, a0, 48

.rept 63
    add.d a0, a0, a1
    vld vr0, a0, 0
    vld vr1, a0, 16
    vld vr2, a0, 32
    vld vr3, a0, 48
    vsllwil.hu.bu vr4, vr0, 0
    vsllwil.hu.bu vr5, vr1, 0
    vsllwil.hu.bu vr6, vr2, 0
    vsllwil.hu.bu vr7, vr3, 0
    vexth.hu.bu vr0, vr0
    vexth.hu.bu vr1, vr1
    vexth.hu.bu vr2, vr2
    vexth.hu.bu vr3, vr3
    vadd.h vr8, vr4, vr20
    vadd.h vr9, vr0, vr20
    vadd.h vr10, vr5, vr20
    vadd.h vr11, vr1, vr20
    vadd.h vr12, vr6, vr20
    vadd.h vr13, vr2, vr20
    vadd.h vr14, vr7, vr20
    vadd.h vr15, vr3, vr20
    vssrani.bu.h vr9, vr8, 0
    vssrani.bu.h vr11, vr10, 0
    vssrani.bu.h vr13, vr12, 0
    vssrani.bu.h vr15, vr14, 0
    vst vr9, a0, 0
    vst vr11, a0, 16
    vst vr13, a0, 32
    vst vr15, a0, 48
.endr

    b .DCT_DCT_64X64_END
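// Full 64x64 inverse transform: reserve a 64*32*2-byte intermediate buffer
// plus two 512-byte scratch areas on the stack, run the first pass over the
// top-left 32x32 coefficient block (the only part that can be non-zero) with
// dct64x64_core1_lsx, zero those coefficients, then run the second pass with
// dct64x64_core2_lsx, which adds an 8-pixel-wide strip of output to the
// destination per invocation.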
.NO_HAS_DCONLY_64x64:
    malloc_space 64*32*2+512+512

    addi.d t7, sp, 64

.macro dct64x64_core1_lsx in0, in1, in2
    addi.d t2, a2, \in0
    addi.d t7, t7, \in1
    li.w t4, 64*32*2+64
    add.d t3, sp, t4
    addi.d t6, t3, 512
    add.d t5, t6, zero

    dct_8x32_tx64_new_lsx 0, 256, 128, 256

    la.local t0, idct64_coeffs
    addi.d t2, a2, \in2 // 32 ...

    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    vld vr0, t2, 128*0  // in1
    vld vr1, t2, 128*15 // in31
    vld vr2, t2, 128*8  // in17
    vld vr3, t2, 128*7  // in15
    dct64_step1_lsx

    addi.d t0, t0, 48
    addi.d t6, t6, 128
    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    vld vr0, t2, 128*3  // in7
    vld vr1, t2, 128*12 // in25
    vld vr2, t2, 128*11 // in23
    vld vr3, t2, 128*4  // in9
    dct64_step1_lsx

    addi.d t0, t0, 48
    addi.d t6, t6, 128
    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    vld vr0, t2, 128*2  // in5
    vld vr1, t2, 128*13 // in27
    vld vr2, t2, 128*10 // in21
    vld vr3, t2, 128*5  // in11
    dct64_step1_lsx

    addi.d t0, t0, 48
    addi.d t6, t6, 128
    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    vld vr0, t2, 128*1  // in3
    vld vr1, t2, 128*14 // in29
    vld vr2, t2, 128*9  // in19
    vld vr3, t2, 128*6  // in13
    dct64_step1_lsx

    la.local t0, idct_coeffs
    addi.d t4, t5, 16*7

    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
    dct64_step2_lsx
    addi.d t5, t5, 16
    addi.d t4, t4, -16
    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
    dct64_step2_lsx
    addi.d t5, t5, 16
    addi.d t4, t4, -16
    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
    dct64_step2_lsx
    addi.d t5, t5, 16
    addi.d t4, t4, -16
    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
    dct64_step2_lsx

    li.w t4, 64*32*2+64+512
    add.d t5, t4, sp
    addi.d t4, t5, 16*7
    dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128

    addi.d t3, t3, 128
    addi.d t4, t4, -16*8
    addi.d t5, t5, -16*8
    dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128

    addi.d t5, t5, -16*8
    addi.d t4, t4, -16*8
    addi.d t3, t3, 128
    dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128

    addi.d t5, t5, -16*8
    addi.d t4, t4, -16*8
    addi.d t3, t3, 128
    dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128
.endm

    dct64x64_core1_lsx 0, 0, 64
    dct64x64_core1_lsx 16, 128*8, 64+16
    dct64x64_core1_lsx 32, 128*8, 64+16*2
    dct64x64_core1_lsx 48, 128*8, 64+16*3

    vreplgr2vr.h vr31, zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, \
        448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, \
        656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, \
        864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, \
        1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, \
        1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, \
        1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, \
        1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, \
        1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, \
        1936, 1952, 1968, 1984, 2000, 2016, 2032
    vst vr31, a2, \i
.endr

.macro dct64x64_core2_lsx in0, in1
    addi.d t2, sp, 64+\in0
    addi.d t7, sp, 64+\in0
    li.w t4, 64*32*2+64
    add.d t3, sp, t4
    addi.d t6, t3, 512
    add.d t5, t6, zero
    addi.d t2, t2, 1024
    addi.d t2, t2, 1024

    dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512

    la.local t0, idct64_coeffs
    addi.d t2, sp, 64+64*2+\in0
    addi.d t4, t2, 256*7
    addi.d t4, t4, 256

    vld vr0, t2, 256*0 // in1
    vld vr1, t4, 256*7 // in31
    vld vr2, t4, 256*0 // in17
    vld vr3, t2, 256*7 // in15
    dct64_step1_lsx

    addi.d t0, t0, 48
    addi.d t6, t6, 128
    vld vr0, t2, 256*3 // in7
    vld vr1, t4, 256*4 // in25
    vld vr2, t4, 256*3 // in23
    vld vr3, t2, 256*4 // in9
    dct64_step1_lsx

    addi.d t0, t0, 48
    addi.d t6, t6, 128
    vld vr0, t2, 256*2 // in5
    vld vr1, t4, 256*5 // in27
    vld vr2, t4, 256*2 // in21
    vld vr3, t2, 256*5 // in11
    dct64_step1_lsx

    addi.d t0, t0, 48
    addi.d t6, t6, 128
    vld vr0, t2, 256*1 // in3
    vld vr1, t4, 256*6 // in29
    vld vr2, t4, 256*1 // in19
    vld vr3, t2, 256*6 // in13
    dct64_step1_lsx

    la.local t0, idct_coeffs
    addi.d t4, t5, 16*7

    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
    dct64_step2_lsx
    addi.d t5, t5, 16
    addi.d t4, t4, -16
    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
    dct64_step2_lsx
    addi.d t5, t5, 16
    addi.d t4, t4, -16
    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
    dct64_step2_lsx
    addi.d t5, t5, 16
    addi.d t4, t4, -16
    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
    dct64_step2_lsx

    li.w t4, 64*32*2+64+512
    add.d t5, t4, sp
    addi.d t4, t5, 16*7
    addi.d a0, a0, \in1

    // 0 - 7, 56 - 63
    dct64_step3_lsx
    li.w t8, 0
    mul.w t0, t8, a1
    add.d t0, a0, t0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w t8, 56
    mul.w t0, t8, a1
    add.d t0, a0, t0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 8 - 15, 48 - 55
    addi.d t3, t3, 128
    addi.d t4, t4, -16*8
    addi.d t5, t5, -16*8
    dct64_step3_lsx
    li.w t8, 8
    mul.w t0, t8, a1
    add.d t0, t0, a0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w t8, 48
    mul.w t0, t8, a1
    add.d t0, t0, a0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 16 - 23, 40 - 47
    addi.d t3, t3, 128
    addi.d t4, t4, -16*8
    addi.d t5, t5, -16*8
    dct64_step3_lsx
    li.w t8, 16
    mul.w t0, t8, a1
    add.d t0, t0, a0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w t8, 40
    mul.w t0, t8, a1
    add.d t0, t0, a0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 24 - 31, 32 - 39
    addi.d t3, t3, 128
    addi.d t4, t4, -16*8
    addi.d t5, t5, -16*8
    dct64_step3_lsx
    li.w t8, 24
    mul.w t0, t8, a1
    add.d t0, t0, a0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w t8, 32
    mul.w t0, t8, a1
    add.d t0, t0, a0
    alsl.d t6, a1, t0, 1
    addi.d t1, t0, 0
    add.d t2, t0, a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
.endm

    dct64x64_core2_lsx 16*0, 0
    dct64x64_core2_lsx 16*1, 8
    dct64x64_core2_lsx 16*2, 8
    dct64x64_core2_lsx 16*3, 8
    dct64x64_core2_lsx 16*4, 8
    dct64x64_core2_lsx 16*5, 8
    dct64x64_core2_lsx 16*6, 8
    dct64x64_core2_lsx 16*7, 8

    free_space 64*32*2+512+512
.DCT_DCT_64X64_END:
endfunc