From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 19 Apr 2024 02:47:55 +0200
Subject: Adding upstream version 124.0.1.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/loongarch/cpu.c             |   47 +
 third_party/dav1d/src/loongarch/cpu.h             |   37 +
 third_party/dav1d/src/loongarch/itx.S             | 8104 ++++++++++++++++++++
 third_party/dav1d/src/loongarch/itx.h             |  195 +
 third_party/dav1d/src/loongarch/loongson_asm.S    |  776 ++
 third_party/dav1d/src/loongarch/loopfilter.S      | 1108 +++
 third_party/dav1d/src/loongarch/loopfilter.h      |   52 +
 third_party/dav1d/src/loongarch/looprestoration.S | 1407 ++++
 third_party/dav1d/src/loongarch/looprestoration.h |   78 +
 .../dav1d/src/loongarch/looprestoration_tmpl.c    |  274 +
 third_party/dav1d/src/loongarch/mc.S              | 4758 ++++++++++++
 third_party/dav1d/src/loongarch/mc.h              |  118 +
 third_party/dav1d/src/loongarch/msac.S            |  368 +
 third_party/dav1d/src/loongarch/msac.h            |   46 +
 third_party/dav1d/src/loongarch/refmvs.S          |  152 +
 third_party/dav1d/src/loongarch/refmvs.h          |   44 +
 16 files changed, 17564 insertions(+)
 create mode 100644 third_party/dav1d/src/loongarch/cpu.c
 create mode 100644 third_party/dav1d/src/loongarch/cpu.h
 create mode 100644 third_party/dav1d/src/loongarch/itx.S
 create mode 100644 third_party/dav1d/src/loongarch/itx.h
 create mode 100644 third_party/dav1d/src/loongarch/loongson_asm.S
 create mode 100644 third_party/dav1d/src/loongarch/loopfilter.S
 create mode 100644 third_party/dav1d/src/loongarch/loopfilter.h
 create mode 100644 third_party/dav1d/src/loongarch/looprestoration.S
 create mode 100644 third_party/dav1d/src/loongarch/looprestoration.h
 create mode 100644 third_party/dav1d/src/loongarch/looprestoration_tmpl.c
 create mode 100644 third_party/dav1d/src/loongarch/mc.S
 create mode 100644 third_party/dav1d/src/loongarch/mc.h
 create mode 100644 third_party/dav1d/src/loongarch/msac.S
 create mode 100644 third_party/dav1d/src/loongarch/msac.h
 create mode 100644 third_party/dav1d/src/loongarch/refmvs.S
 create mode 100644 third_party/dav1d/src/loongarch/refmvs.h

diff --git a/third_party/dav1d/src/loongarch/cpu.c b/third_party/dav1d/src/loongarch/cpu.c
new file mode 100644
index 0000000000..a79ade5472
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/cpu.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "common/attributes.h"
+#include "src/loongarch/cpu.h"
+
+#if defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX    ( 1 << 4 )
+#define LA_HWCAP_LASX   ( 1 << 5 )
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
+    unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL)
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+    flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
+    flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
+#endif
+
+    return flags;
+}
diff --git a/third_party/dav1d/src/loongarch/cpu.h b/third_party/dav1d/src/loongarch/cpu.h
new file mode 100644
index 0000000000..d00ff67dac
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOONGARCH_CPU_H
+#define DAV1D_SRC_LOONGARCH_CPU_H
+
+enum CpuFlags {
+    DAV1D_LOONGARCH_CPU_FLAG_LSX  = 1 << 0,
+    DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
+};
+
+unsigned dav1d_get_cpu_flags_loongarch(void);
+
+#endif /* DAV1D_SRC_LOONGARCH_CPU_H */
diff --git a/third_party/dav1d/src/loongarch/itx.S b/third_party/dav1d/src/loongarch/itx.S
new file mode 100644
index 0000000000..fc0c79ea01
--- /dev/null
+++ b/third_party/dav1d/src/loongarch/itx.S
@@ -0,0 +1,8104 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/loongson_asm.S" + +/* +void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrlowff_t stride, + coef *const coeff, const int eob + HIGHBD_DECL_SUFFIX) +*/ +function inv_txfm_add_wht_wht_4x4_8bpc_lsx + vld vr0, a2, 0 + vld vr2, a2, 16 + + vreplgr2vr.h vr20, zero + + vsrai.h vr0, vr0, 2 + vsrai.h vr2, vr2, 2 + + vst vr20, a2, 0 + + vpickod.d vr1, vr0, vr0 + vpickod.d vr3, vr2, vr2 + + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr2, vr3 + vsub.h vr6, vr4, vr5 + vsrai.h vr6, vr6, 1 + vsub.h vr0, vr6, vr3 + vsub.h vr2, vr6, vr1 + vsub.h vr1, vr4, vr0 + vadd.h vr3, vr5, vr2 + + vst vr20, a2, 16 + + vilvl.h vr4, vr0, vr1 + vilvl.h vr5, vr3, vr2 + vilvl.w vr0, vr5, vr4 + vilvh.w vr2, vr5, vr4 + vilvh.d vr1, vr0, vr0 + vilvh.d vr3, vr2, vr2 + + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr2, vr3 + vsub.h vr6, vr4, vr5 + vsrai.h vr6, vr6, 1 + vsub.h vr0, vr6, vr3 + vsub.h vr2, vr6, vr1 + vsub.h vr1, vr4, vr0 + vadd.h vr3, vr5, vr2 + + vld vr4, a0, 0 + vldx vr5, a0, a1 + alsl.d t0, a1, a0, 1 + vld vr6, t0, 0 + vldx vr7, t0, a1 + + vsllwil.hu.bu vr4, vr4, 0 + vsllwil.hu.bu vr5, vr5, 0 + vsllwil.hu.bu vr6, vr6, 0 + vsllwil.hu.bu vr7, vr7, 0 + vilvl.d vr1, vr0, vr1 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vadd.h vr1, vr1, vr4 + vadd.h vr2, vr2, vr6 + vssrani.bu.h vr2, vr1, 0 + + vstelm.w vr2, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr2, a0, 0, 1 + add.d a0, a0, a1 + vstelm.w vr2, a0, 0, 2 + add.d a0, a0, a1 + vstelm.w vr2, a0, 0, 3 +endfunc + +const idct_coeffs, align=4 + // idct4 + .word 2896, 2896*8, 1567, 3784 + // idct8 + .word 799, 4017, 3406, 2276 + // idct16 + .word 401, 4076, 3166, 2598 + .word 1931, 3612, 3920, 1189 + // idct32 + .word 201, 4091, 3035, 2751 + .word 1751, 3703, 3857, 1380 + .word 995, 3973, 3513, 2106 + .word 2440, 3290, 4052, 601 +endconst + +.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7 + vld \in0, \src, \start + vld \in1, \src, \start+(\stride*1) + vld \in2, \src, \start+(\stride*2) + vld \in3, \src, \start+(\stride*3) + vld \in4, \src, \start+(\stride*4) + vld \in5, \src, \start+(\stride*5) + vld \in6, \src, \start+(\stride*6) + vld \in7, \src, \start+(\stride*7) +.endm + +.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7 + vst \in0, \src, \start + vst \in1, \src, \start+(\stride*1) + vst \in2, \src, \start+(\stride*2) + vst \in3, \src, 
\start+(\stride*3) + vst \in4, \src, \start+(\stride*4) + vst \in5, \src, \start+(\stride*5) + vst \in6, \src, \start+(\stride*6) + vst \in7, \src, \start+(\stride*7) +.endm + +.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15 + + vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + + vld \in8, \src, \start+(\stride*8) + vld \in9, \src, \start+(\stride*9) + vld \in10, \src, \start+(\stride*10) + vld \in11, \src, \start+(\stride*11) + vld \in12, \src, \start+(\stride*12) + vld \in13, \src, \start+(\stride*13) + vld \in14, \src, \start+(\stride*14) + vld \in15, \src, \start+(\stride*15) +.endm + +.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15 + + vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + + vst \in8, \src, \start+(\stride*8) + vst \in9, \src, \start+(\stride*9) + vst \in10, \src, \start+(\stride*10) + vst \in11, \src, \start+(\stride*11) + vst \in12, \src, \start+(\stride*12) + vst \in13, \src, \start+(\stride*13) + vst \in14, \src, \start+(\stride*14) + vst \in15, \src, \start+(\stride*15) +.endm + +.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5 + vilvl.w vr10, \in1, \in0 // 0 1 2 3 4 5 6 7 x ... + vilvl.w vr12, \in3, \in2 // 8 9 10 11 12 13 14 15 x ... + vsllwil.hu.bu vr10, vr10, 0 + vsllwil.hu.bu vr12, vr12, 0 + vadd.h vr10, \in4, vr10 + vadd.h vr12, \in5, vr12 + vssrani.bu.h vr12, vr10, 0 + vstelm.w vr12, a0, 0, 0 + add.d t8, a0, a1 + vstelm.w vr12, t8, 0, 1 + vstelm.w vr12, t2, 0, 2 + add.d t8, t2, a1 + vstelm.w vr12, t8, 0, 3 +.endm + +.macro VLD_DST_ADD_W4 in0, in1 + vld vr0, a0, 0 + vldx vr1, a0, a1 + vld vr2, t2, 0 + vldx vr3, t2, a1 + + DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1 +.endm + +.macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1 + vexth.w.h vr4, \in0 // in1 + vexth.w.h vr5, \in1 // in3 + vmul.w vr6, vr4, \in4 + vmul.w vr7, vr4, \in5 + vmadd.w vr6, vr5, \in5 // t3 + vmsub.w vr7, vr5, \in4 // t2 + vsllwil.w.h vr4, \in2, 0 // in0 + vsllwil.w.h vr5, \in3, 0 // in2 + vmul.w vr9, vr4, \in6 + vmul.w vr10, vr4, \in7 + vmadd.w vr9, vr5, \in7 // t0 + vmsub.w vr10, vr5, \in6 // t1 + vssrarni.h.w vr10, vr9, 12 // t0 t1 + vssrarni.h.w vr7, vr6, 12 // t3 t2 + vsadd.h \out0, vr10, vr7 // 0 4 8 12 1 5 9 13 c[0] c[1] + vssub.h \out1, vr10, vr7 // 3 7 11 15 2 6 10 14 c[3] c[2] +.endm + +.macro inv_dct_dct_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + vldrepl.w vr2, t0, 8 // 1567 + vldrepl.w vr3, t0, 12 // 3784 + vldrepl.w vr8, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12 + + vreplgr2vr.h vr15, zero + vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15 + vst vr15, a2, 0 + vst vr15, a2, 16 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + vshuf4i.d vr14, vr14, 0x01 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro identity_4x4_lsx in0, in1, in2, in3, out0 + vsllwil.w.h vr2, \in0, 0 + vexth.w.h vr3, \in1 + vmul.w vr4, vr2, \in2 + vmul.w vr5, vr3, \in2 + vssrarni.h.w vr5, vr4, 12 + vsadd.h \out0, vr5, \in3 +.endm + +.macro inv_identity_identity_4x4_lsx + vld vr0, a2, 0 
// 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + identity_4x4_lsx vr0, vr0, vr20, vr0, vr6 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr7 + + vsrari.h vr6, vr6, 4 + vsrari.h vr7, vr7, 4 + vilvh.d vr8, vr6, vr6 + vilvh.d vr9, vr7, vr7 + vilvl.h vr4, vr8, vr6 + vilvl.h vr5, vr9, vr7 + vilvl.w vr6, vr5, vr4 + vilvh.w vr7, vr5, vr4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr6, vr7 +.endm + +const iadst4_coeffs, align=4 + .word 1321, 3803, 2482, 3344 +endconst + +.macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3 + vsub.w vr6, \in0, \in2 // in0-in2 + vmul.w vr7, \in0, vr20 // in0*1321 + vmadd.w vr7, \in2, vr21 // in0*1321+in2*3803 + vmadd.w vr7, \in3, vr22 // in0*1321+in2*3803+in3*2482 + vmul.w vr8, \in1, vr23 // in1*3344 + vadd.w vr6, vr6, \in3 // in0-in2+in3 + vmul.w vr9, \in0, vr22 // in0*2482 + vmsub.w vr9, \in2, vr20 // in2*1321 + vmsub.w vr9, \in3, vr21 // in0*2482-in2*1321-in3*3803 + vadd.w vr5, vr7, vr9 + vmul.w \out2, vr6, vr23 // out[2] 8 9 10 11 + vadd.w \out0, vr7, vr8 // out[0] 0 1 2 3 + vadd.w \out1, vr9, vr8 // out[1] 4 5 6 7 + vsub.w \out3, vr5, vr8 // out[3] 12 13 14 15 +.endm + +.macro inv_adst_dct_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + + vreplgr2vr.h vr15, zero + la.local t0, idct_coeffs + vst vr15, a2, 0 + vst vr15, a2, 16 + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14 + + vshuf4i.d vr14, vr14, 0x01 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_adst_adst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + + vsrari.w vr11, vr11, 12 + vsrari.w vr13, vr13, 12 + vsrari.w vr12, vr12, 12 + vsrari.w vr14, vr14, 12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_dct_adst_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + 
vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr12, 0 // in2 + vexth.w.h vr5, vr12 // in3 + + la.local t0, iadst4_coeffs + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_dct_flipadst_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + vshuf4i.d vr12, vr12, 0x01 // 3 7 11 15 2 6 10 14 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr12, 0 // in2 + vexth.w.h vr5, vr12 // in3 + + la.local t0, iadst4_coeffs + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14 + + vssrarni.h.w vr11, vr12, 12 // 0 1 2 3 4 5 6 7 + vssrarni.h.w vr13, vr14, 12 // 8 9 10 11 12 13 14 15 + vsrari.h vr11, vr11, 4 + vsrari.h vr13, vr13, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_flipadst_adst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vsrari.w vr0, vr0, 12 + vsrari.w vr1, vr1, 12 + vsrari.w vr2, vr2, 12 + vsrari.w vr3, vr3, 12 + + vilvl.w vr4, vr0, vr1 + vilvh.w vr5, vr0, vr1 + vilvl.w vr6, vr2, vr3 + vilvh.w vr7, vr2, vr3 + vilvl.d vr11, vr4, vr6 + vilvh.d vr12, vr4, vr6 + vilvl.d vr13, vr5, vr7 + vilvh.d vr14, vr5, vr7 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_adst_flipadst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + vsrari.w vr11, vr11, 12 + vsrari.w vr12, vr12, 12 + vsrari.w vr13, vr13, 12 + vsrari.w vr14, 
vr14, 12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14 + + vssrarni.h.w vr11, vr12, 12 + vssrarni.h.w vr13, vr14, 12 + vsrari.h vr11, vr11, 4 + vsrari.h vr13, vr13, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_flipadst_dct_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vilvl.w vr4, vr0, vr1 + vilvh.w vr5, vr0, vr1 + vilvl.w vr6, vr2, vr3 + vilvh.w vr7, vr2, vr3 + + vilvl.d vr11, vr4, vr6 + vilvh.d vr12, vr4, vr6 + vilvl.d vr13, vr5, vr7 + vilvh.d vr14, vr5, vr7 + + vssrarni.h.w vr12, vr11, 12 + vssrarni.h.w vr14, vr13, 12 + + vreplgr2vr.h vr15, zero + la.local t0, idct_coeffs + vst vr15, a2, 0 + vst vr15, a2, 16 + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14 + + vshuf4i.d vr14, vr14, 0x01 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_flipadst_flipadst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vilvl.w vr4, vr0, vr1 + vilvh.w vr5, vr0, vr1 + vilvl.w vr6, vr2, vr3 + vilvh.w vr7, vr2, vr3 + vilvl.d vr11, vr4, vr6 + vilvh.d vr12, vr4, vr6 + vilvl.d vr13, vr5, vr7 + vilvh.d vr14, vr5, vr7 + + vsrari.w vr11, vr11, 12 + vsrari.w vr12, vr12, 12 + vsrari.w vr13, vr13, 12 + vsrari.w vr14, vr14, 12 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14 + + vssrarni.h.w vr11, vr12, 12 + vssrarni.h.w vr13, vr14, 12 + vsrari.h vr11, vr11, 4 + vsrari.h vr13, vr13, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_dct_identity_4x4_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 + vld vr1, a2, 16 + + vldrepl.w vr2, t0, 8 // 1567 + vldrepl.w vr3, t0, 12 // 3784 + vldrepl.w vr8, t0, 0 // 2896 + + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12 + vshuf4i.d vr12, vr12, 0x01 // 2 6 10 14 3 7 11 15 + + vreplgr2vr.h vr15, zero + li.w t0, 1697 + + vilvl.h vr4, vr12, vr11 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr12, vr11 // 1 3 5 7 9 11 13 15 + vilvl.h vr10, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr12, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vst vr15, a2, 0 + vst vr15, a2, 16 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr10, vr10, vr20, vr10, vr6 + identity_4x4_lsx vr12, vr12, vr20, vr12, vr7 + vsrari.h vr11, vr6, 4 + vsrari.h vr13, vr7, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro inv_identity_dct_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + + vreplgr2vr.h vr15, zero + + vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 
11 13 15 + vilvl.h vr13, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr14, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vst vr15, a2, 0 + vst vr15, a2, 16 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14 + + vshuf4i.d vr14, vr14, 0x01 + vsrari.h vr13, vr13, 4 + vsrari.h vr14, vr14, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr14 +.endm + +.macro inv_flipadst_identity_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13 + + vssrarni.h.w vr12, vr13, 12 + vssrarni.h.w vr10, vr11, 12 + + vilvl.h vr4, vr10, vr12 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr10, vr12 // 1 3 5 7 9 11 13 15 + vilvl.h vr11, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr13, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vreplgr2vr.h vr15, zero + li.w t0, 1697 + + vst vr15, a2, 0 + vst vr15, a2, 16 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr11, vr11, vr20, vr11, vr6 + identity_4x4_lsx vr13, vr13, vr20, vr13, vr7 + vsrari.h vr11, vr6, 4 + vsrari.h vr13, vr7, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro inv_identity_flipadst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr11, vr5, vr4 + vilvh.h vr13, vr5, vr4 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr13, 0 // in2 + vexth.w.h vr5, vr13 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vssrarni.h.w vr0, vr1, 12 // 8 9 10 11 12 13 14 15 + vssrarni.h.w vr2, vr3, 12 // 0 1 2 3 4 5 6 7 + vsrari.h vr11, vr0, 4 + vsrari.h vr13, vr2, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr13, vr11 +.endm + +.macro inv_identity_adst_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr0, vr0, vr20, vr0, vr0 + identity_4x4_lsx vr1, vr1, vr20, vr1, vr1 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr11, vr5, vr4 + vilvh.h vr13, vr5, vr4 + + vreplgr2vr.h vr15, zero + vst vr15, a2, 0 + vst vr15, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr11, 0 // in0 + vexth.w.h vr3, vr11 // in1 + vsllwil.w.h vr4, vr13, 0 // in2 + vexth.w.h vr5, vr13 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + vssrarni.h.w vr1, vr0, 12 + vssrarni.h.w vr3, vr2, 12 + vsrari.h vr11, vr1, 4 + vsrari.h vr13, vr3, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro inv_adst_identity_4x4_lsx + vld vr0, a2, 0 + vld vr1, a2, 16 + + la.local t0, iadst4_coeffs + vsllwil.w.h vr2, vr0, 0 // in0 + vexth.w.h vr3, vr0 // in1 + vsllwil.w.h vr4, vr1, 0 // in2 + vexth.w.h vr5, vr1 // in3 + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, 
t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3 + + LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7 + + vssrarni.h.w vr13, vr11, 12 + vssrarni.h.w vr14, vr12, 12 + + vreplgr2vr.h vr15, zero + li.w t0, 1697 + + vst vr15, a2, 0 + vst vr15, a2, 16 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr13, vr13, vr20, vr13, vr6 + identity_4x4_lsx vr14, vr14, vr20, vr14, vr7 + vsrari.h vr11, vr6, 4 + vsrari.h vr13, vr7, 4 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W4 vr11, vr13 +.endm + +.macro fun4x4 type1, type2 +function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx +.ifc \type1\()_\type2, dct_dct + bnez a3, .LLL + + vldi vr0, 0x8b5 // 181 + ld.h t2, a2, 0 // dc + st.h zero, a2, 0 + vreplgr2vr.w vr1, t2 + vldi vr3, 0x880 // 128 + vmul.w vr2, vr0, vr1 + vld vr10, a0, 0 + vsrari.w vr2, vr2, 8 + vldx vr11, a0, a1 + vmadd.w vr3, vr2, vr0 + alsl.d t2, a1, a0, 1 + vssrarni.h.w vr3, vr3, 12 + vld vr12, t2, 0 + vldx vr13, t2, a1 + + DST_ADD_W4 vr10, vr11, vr12, vr13, vr3, vr3 + + b .IDST_\type1\()_\type2\()_4X4_END +.LLL: +.endif + + inv_\type1\()_\type2\()_4x4_lsx +.IDST_\type1\()_\type2\()_4X4_END: +endfunc +.endm + +fun4x4 dct, dct +fun4x4 identity, identity +fun4x4 adst, dct +fun4x4 dct, adst +fun4x4 adst, adst +fun4x4 dct, flipadst +fun4x4 flipadst, adst +fun4x4 adst, flipadst +fun4x4 flipadst, dct +fun4x4 flipadst, flipadst +fun4x4 dct, identity +fun4x4 identity, dct +fun4x4 flipadst, identity +fun4x4 identity, flipadst +fun4x4 identity, adst +fun4x4 adst, identity + +function inv_txfm_add_dct_dct_4x8_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_4x8 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 + vld vr10, a0, 0 + vmul.w vr2, vr2, vr0 + vldx vr11, a0, a1 + vsrari.w vr2, vr2, 8 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 + + DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + + VLD_DST_ADD_W4 vr5, vr5 + b .DCT_DCT_4x8_END + +.NO_HAS_DCONLY_4x8: + // sh=8 sw=4 + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr20, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr21, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + vldrepl.w vr2, t0, 8 // 1567 + vldrepl.w vr3, t0, 12 // 3784 + vldrepl.w vr8, t0, 0 // 2896 + +.macro DCT4_4Wx8H_1D_LSX + // in1 in3 + vsllwil.w.h vr4, vr1, 0 // in1 + vsllwil.w.h vr5, vr21, 0 // in3 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr6, vr4, vr3 + vmul.w vr7, vr4, vr2 + vmadd.w vr6, vr5, vr2 // t3 0 1 2 3 + vmsub.w vr7, vr5, vr3 // t2 0 1 2 3 + vexth.w.h vr4, vr1 // in1 + vexth.w.h vr5, vr21 // in3 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr9, vr4, vr3 + vmul.w vr10, vr4, vr2 + vmadd.w vr9, vr5, vr2 // t3 4 5 6 7 + vmsub.w vr10, vr5, vr3 // t2 4 5 6 7 + + // in0 in2 + vsllwil.w.h vr4, vr0, 0 // in0 + vsllwil.w.h vr5, vr20, 0 // in2 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + vsrari.w vr5, vr5, 12 + vmul.w vr11, vr4, vr8 + vmul.w vr12, vr4, vr8 + vmadd.w vr11, vr5, vr8 // t0 0 1 2 3 + vmsub.w vr12, vr5, vr8 // t1 0 1 2 3 + vexth.w.h vr4, vr0 // in0 + vexth.w.h vr5, vr20 // in2 + vmul.w vr4, vr4, vr8 + vmul.w vr5, vr5, vr8 + vsrari.w vr4, vr4, 12 + 
vsrari.w vr5, vr5, 12 + vmul.w vr13, vr4, vr8 + vmul.w vr14, vr4, vr8 + vmadd.w vr13, vr5, vr8 // t0 4 5 6 7 + vmsub.w vr14, vr5, vr8 // t1 4 5 6 7 + vssrarni.h.w vr9, vr6, 12 // t3 + vssrarni.h.w vr10, vr7, 12 // t2 + vssrarni.h.w vr14, vr12, 12 // t1 + vssrarni.h.w vr13, vr11, 12 // t0 + vsadd.h vr4, vr13, vr9 // c[0] 0 4 8 12 16 20 24 28 + vsadd.h vr5, vr14, vr10 // c[1] 1 5 9 13 17 21 25 29 + vssub.h vr20, vr14, vr10 // c[2] 2 6 10 14 18 22 26 30 + vssub.h vr21, vr13, vr9 // c[3] 3 7 11 15 19 23 27 31 +.endm + + DCT4_4Wx8H_1D_LSX + + vreplgr2vr.h vr22, zero + vst vr22, a2, 0 + vst vr22, a2, 16 + vst vr22, a2, 32 + vst vr22, a2, 48 + + vilvl.h vr0, vr5, vr4 // 0 1 4 5 8 9 12 13 + vilvl.h vr1, vr21, vr20 // 2 3 6 7 10 11 14 15 + vilvh.h vr6, vr5, vr4 // 16 17 20 21 24 25 28 29 + vilvh.h vr7, vr21, vr20 // 18 19 22 23 26 27 30 31 + vilvl.w vr9, vr1, vr0 // 0 1 2 3 4 5 6 7 in0 + vilvh.w vr10, vr1, vr0 // 8 9 10 11 12 13 14 15 in1 + vilvl.w vr11, vr7, vr6 // 16 17 18 19 20 21 22 23 in2 + vilvh.w vr12, vr7, vr6 // 24 25 26 27 28 29 30 31 in3 + + vilvl.d vr0, vr10, vr9 + vilvl.d vr1, vr12, vr11 + vilvh.d vr20, vr9, vr11 // in5 in1 + vilvh.d vr21, vr12, vr10 // in3 in7 + +.macro DCT8_4Wx8H_1D_LSX + dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14 + + vldrepl.w vr17, t0, 16 // 799 + vldrepl.w vr18, t0, 20 // 4017 + vldrepl.w vr11, t0, 24 // 3406 + vldrepl.w vr12, t0, 28 // 2276 + + vexth.w.h vr4, vr20 + vexth.w.h vr5, vr21 + vmul.w vr6, vr4, vr18 // in1 * 4017 + vmul.w vr7, vr4, vr17 // in1 * 799 + vmadd.w vr6, vr5, vr17 // in7 * 799 + vmsub.w vr7, vr5, vr18 // in7 * 4017 + vsllwil.w.h vr4, vr20, 0 + vsllwil.w.h vr5, vr21, 0 + vmul.w vr9, vr4, vr12 + vmul.w vr10, vr4, vr11 + vmadd.w vr9, vr5, vr11 + vmsub.w vr10, vr5, vr12 + vssrarni.h.w vr10, vr9, 12 // t6a t5a + vssrarni.h.w vr7, vr6, 12 // t7a t4a + vsadd.h vr15, vr7, vr10 // t7 t4 + vssub.h vr16, vr7, vr10 // t6a t5a + + vexth.w.h vr4, vr16 // t5a + vsllwil.w.h vr5, vr16, 0 // t6a + vldi vr2, 0x8b5 // 181 + vsub.w vr6, vr5, vr4 + vadd.w vr7, vr5, vr4 + vmul.w vr6, vr6, vr2 + vmul.w vr7, vr7, vr2 + vssrarni.h.w vr7, vr6, 8 // t5 t6 + vaddi.hu vr18, vr7, 0 + vshuf4i.d vr7, vr15, 0x06 // t7 t6 + vshuf4i.d vr15, vr18, 0x09 // t4 t5 + + // vr17 -> vr7 vr18 -> vr15 + vsadd.h vr4, vr13, vr7 + vsadd.h vr5, vr14, vr15 + vssub.h vr6, vr14, vr15 + vssub.h vr7, vr13, vr7 +.endm + + DCT8_4Wx8H_1D_LSX + + vshuf4i.d vr5, vr5, 0x01 + vshuf4i.d vr7, vr7, 0x01 + + vsrari.h vr4, vr4, 4 + vsrari.h vr5, vr5, 4 + vsrari.h vr6, vr6, 4 + vsrari.h vr7, vr7, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W4 vr4, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + + VLD_DST_ADD_W4 vr6, vr7 +.DCT_DCT_4x8_END: +endfunc + +.macro rect2_w4_lsx in0, in1, in2, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in1 + vmul.w vr22, vr22, \in2 + vmul.w vr23, vr23, \in2 + vsrari.w \out0, vr22, 12 + vsrari.w \out1, vr23, 12 +.endm + +.macro dct_8x4_core_lsx1 out0, out1, out2, out3 + // dct4 stride=1<<1 + vmul.w vr0, vr6, vr21 + vmul.w vr1, vr6, vr20 + vmadd.w vr0, vr10, vr20 // t3 + vmsub.w vr1, vr10, vr21 // t2 + vmul.w vr2, vr18, vr22 + vmul.w vr3, vr18, vr22 + vmadd.w vr2, vr8, vr22 // t0 + vmsub.w vr3, vr8, vr22 // t1 + vssrarni.h.w vr1, vr0, 12 // t3 t2 + vssrarni.h.w vr3, vr2, 12 // t0 t1 + vsadd.h vr8, vr3, vr1 // t0 t1 + vssub.h vr10, vr3, vr1 // t3 t2 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vldrepl.w vr22, t0, 24 // 3406 + vldrepl.w vr23, t0, 28 // 2276 + + vmul.w vr0, vr19, vr21 // in1 * 4017 + vmul.w vr1, vr19, vr20 
// in1 * 799 + vmadd.w vr0, vr11, vr20 // in7 * 799 // t7a + vmsub.w vr1, vr11, vr21 // in7 * 4017 // t4a + vmul.w vr2, vr9, vr23 // in5 * 1138 + vmul.w vr3, vr9, vr22 // in5 * 1703 + vmadd.w vr2, vr7, vr22 // in3 * 1703 // t6a + vmsub.w vr3, vr7, vr23 // in3 * 1138 // t5a + vssrarni.h.w vr0, vr1, 12 // t4a t7a + vssrarni.h.w vr2, vr3, 12 // t5a t6a + vsadd.h vr9, vr0, vr2 // t4 t7 + vssub.h vr11, vr0, vr2 // t5a t6a + + vldrepl.w vr22, t0, 0 // 2896 + vexth.w.h vr18, vr11 // t6a + vsllwil.w.h vr19, vr11, 0 // t5a + vmul.w vr6, vr18, vr22 + vmul.w vr7, vr18, vr22 + vmadd.w vr6, vr19, vr22 // t6 + vmsub.w vr7, vr19, vr22 // t5 + vssrarni.h.w vr6, vr7, 12 // t5 t6 + + vilvh.d vr11, vr6, vr9 // t7 t6 + vilvl.d vr9, vr6, vr9 // t4 t5 + + vsadd.h \out0, vr8, vr11 // c[0] c[1] + vsadd.h \out1, vr10, vr9 // c[3] c[2] + vssub.h \out2, vr10, vr9 // c[4] c[5] + vssub.h \out3, vr8, vr11 // c[7] c[6] +.endm + +.macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 + vexth.w.h vr4, \in0 // in1 + vexth.w.h vr5, \in1 // in3 + vmul.w vr6, vr4, \in4 + vmul.w vr7, vr4, \in5 + vmadd.w vr6, vr5, \in5 // t3 + vmsub.w vr7, vr5, \in4 // t2 + vexth.w.h vr4, \in2 // in1 + vexth.w.h vr5, \in3 // in3 + vmul.w vr8, vr4, \in4 + vmul.w vr9, vr4, \in5 + vmadd.w vr8, vr5, \in5 // t3 + vmsub.w vr9, vr5, \in4 // t2 + vssrarni.h.w vr8, vr6, 12 // t3 + vssrarni.h.w vr9, vr7, 12 // t2 + + vsllwil.w.h vr4, \in0, 0 + vsllwil.w.h vr5, \in1, 0 + vmul.w vr11, vr4, \in6 + vmul.w vr12, vr4, \in7 + vmadd.w vr11, vr5, \in7 // t0 + vmsub.w vr12, vr5, \in6 // t1 + vsllwil.w.h vr4, \in2, 0 + vsllwil.w.h vr5, \in3, 0 + vmul.w vr13, vr4, \in6 + vmul.w vr14, vr4, \in7 + vmadd.w vr13, vr5, \in7 // t0 + vmsub.w vr14, vr5, \in6 // t1 + vssrarni.h.w vr13, vr11, 12 // t0 + vssrarni.h.w vr14, vr12, 12 // t1 + + vsadd.h \out0, vr13, vr8 + vsadd.h \out1, vr14, vr9 + vssub.h \out2, vr14, vr9 + vssub.h \out3, vr13, vr8 +.endm + +.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7 + vsllwil.hu.bu vr10, \in0, 0 + vsllwil.hu.bu vr11, \in1, 0 + vsllwil.hu.bu vr12, \in2, 0 + vsllwil.hu.bu vr13, \in3, 0 + vadd.h vr10, \in4, vr10 + vadd.h vr11, \in5, vr11 + vadd.h vr12, \in6, vr12 + vadd.h vr13, \in7, vr13 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vstelm.d vr11, a0, 0, 0 + add.d t8, a0, a1 + vstelm.d vr11, t8, 0, 1 + vstelm.d vr13, t2, 0, 0 + add.d t8, t2, a1 + vstelm.d vr13, t8, 0, 1 +.endm + +.macro VLD_DST_ADD_W8 in0, in1, in2, in3 + vld vr0, a0, 0 + vldx vr1, a0, a1 + vld vr2, t2, 0 + vldx vr3, t2, a1 + + DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3 +.endm + +function inv_txfm_add_dct_dct_8x4_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x4 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 + vld vr10, a0, 0 + vmul.w vr2, vr2, vr0 + vldx vr11, a0, a1 + vsrari.w vr2, vr2, 8 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + + b .DCT_DCT_8X4_END + +.NO_HAS_DCONLY_8x4: + la.local t0, idct_coeffs + + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + 
dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 + vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 in0 + vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 in1 + vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15 + vilvl.h vr2, vr5, vr4 // 16 - 23 in2 + vilvh.h vr3, vr5, vr4 // 24 - 31 in3 + + la.local t0, idct_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + + dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 + +.DCT_DCT_8X4_END: +endfunc + +.macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 + vssrarni.h.w \in1, \in0, 0 + vssrarni.h.w \in3, \in2, 0 + vssrarni.h.w \in5, \in4, 0 + vssrarni.h.w \in7, \in6, 0 + vsadd.h \out0, \in1, \in1 + vsadd.h \out1, \in3, \in3 + vsadd.h \out2, \in5, \in5 + vsadd.h \out3, \in7, \in7 +.endm + +function inv_txfm_add_identity_identity_8x4_8bpc_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr19, vr7, vr9, vr11 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + identity_4x4_lsx vr19, vr19, vr20, vr19, vr19 + identity_4x4_lsx vr7, vr7, vr20, vr7, vr7 + identity_4x4_lsx vr9, vr9, vr20, vr9, vr9 + identity_4x4_lsx vr11, vr11, vr20, vr11, vr11 + + vsrari.h vr15, vr19, 4 + vsrari.h vr16, vr7, 4 + vsrari.h vr17, vr9, 4 + vsrari.h vr18, vr11, 4 + + vilvl.h vr4, vr16, vr15 + vilvh.h vr5, vr16, vr15 + vilvl.h vr11, vr5, vr4 + vilvh.h vr12, vr5, vr4 + vilvl.h vr4, vr18, vr17 + vilvh.h vr5, vr18, vr17 + vilvl.h vr13, vr5, vr4 + vilvh.h vr14, vr5, vr4 + vilvl.d vr15, vr13, vr11 + vilvh.d vr16, vr13, vr11 + vilvl.d vr17, vr14, vr12 + vilvh.d vr18, vr14, vr12 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +const iadst8_coeffs, align=4 + .word 4076, 401, 3612, 1931 + .word 2598, 3166, 1189, 3920 + // idct_coeffs + .word 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +.macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, out0, out1, out2, out3 + vmul.w \out0, \in0, \in4 + vmul.w \out1, \in0, \in5 + vmadd.w \out0, \in1, \in6 // t0a + vmsub.w \out1, \in1, \in7 // t1a + vmul.w \out2, \in2, \in8 + vmul.w \out3, \in2, \in9 + vmadd.w \out2, \in3, \in10 // t2a + vmsub.w \out3, \in3, \in11 // t3a + vssrarni.h.w \out1, \out0, 12 // t0a t1a + vssrarni.h.w \out3, \out2, 12 // t2a t3a +.endm + +.macro adst8x4_1d_lsx + la.local t0, iadst8_coeffs + + vldrepl.w vr20, t0, 0 // 4076 + vldrepl.w vr21, t0, 4 // 401 + vldrepl.w vr22, t0, 8 // 3612 + vldrepl.w vr23, t0, 12 // 1931 + + // vr13 t0a t1a vr15 t2a t3a + vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, 
vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15 + vldrepl.w vr20, t0, 16 // 2598 + vldrepl.w vr21, t0, 20 // 3166 + vldrepl.w vr22, t0, 24 // 1189 + vldrepl.w vr23, t0, 28 // 3920 + + // vr18 t4a t5a vr6 t6a t7a + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6 + + vsadd.h vr12, vr13, vr18 // t0 t1 + vsadd.h vr14, vr15, vr6 // t2 t3 + vssub.h vr16, vr13, vr18 // t4 t5 + vssub.h vr18, vr15, vr6 // t6 t7 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr7, vr16, 0 // t4 + vexth.w.h vr8, vr16 // t5 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr13 out0 out7 vr17 out1 out6 + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \ + vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19 + vshuf4i.d vr19, vr19, 0x01 + + vsadd.h vr13, vr12, vr14 // out0 out7 + vssub.h vr16, vr12, vr14 // t2 t3 + vsadd.h vr17, vr15, vr19 // out1 out6 + vssub.h vr18, vr15, vr19 // t6 t7 + + vexth.w.h vr20, vr13 // out7 + vsllwil.w.h vr21, vr17, 0 // out1 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out7 out1 + vilvl.d vr13, vr21, vr13 // out0 out7 + vilvh.d vr17, vr17, vr21 // out1 out6 + + vsllwil.w.h vr7, vr16, 0 // t2 + vexth.w.h vr8, vr16 // t3 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr15 out[3] out[4] vr18 out[2] out[5] + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \ + vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18 + + vexth.w.h vr20, vr18 // out5 + vsllwil.w.h vr21, vr15, 0 // out3 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out5 out3 + vilvl.d vr18, vr21, vr18 // out2 out5 + vilvh.d vr15, vr15, vr21 // out3 out4 +.endm + +function inv_txfm_add_adst_dct_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr2, vr5, vr4 + vilvh.w vr3, vr5, vr4 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_dct_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w 
vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr3, vr2 + vilvh.h vr5, vr3, vr2 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_adst_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr2, vr5, vr4 + vilvh.w vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // 0 1 2 3 4 5 6 7 in0 + vld vr1, a2, 16 // 8 9 10 11 12 13 14 15 in1 + vld vr2, a2, 32 // 16 17 18 19 20 21 22 23 in2 + vld vr3, a2, 48 // 24 25 26 27 28 29 30 31 in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h 
vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr2, 0 + vexth.w.h vr11, vr2 + vsllwil.w.h vr12, vr3, 0 + vexth.w.h vr13, vr3 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr0, 0 + vexth.w.h vr15, vr0 + vsllwil.w.h vr16, vr1, 0 + vexth.w.h vr17, vr1 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr2, vr5, vr4 + vilvh.w vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + 
la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx + la.local t0, idct_coeffs + + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr3, vr2 + vilvh.h vr5, vr3, vr2 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 // in0 + vexth.w.h vr11, vr0 // in1 + vsllwil.w.h vr12, vr1, 0 // in2 + vexth.w.h vr13, vr1 // in3 + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr2, 0 // in0 + vexth.w.h vr11, vr2 // in1 + vsllwil.w.h vr12, vr3, 0 // in2 + vexth.w.h vr13, vr3 // in3 + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr0, 0 + vexth.w.h vr15, vr0 + vsllwil.w.h vr16, vr1, 0 + vexth.w.h vr17, vr1 + 
adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_dct_identity_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr0, vr1, vr2, vr3 + + vshuf4i.d vr1, vr1, 0x01 + vshuf4i.d vr3, vr3, 0x01 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr3, vr2 + vilvh.h vr5, vr3, vr2 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + vilvl.d vr14, vr2, vr0 + vilvh.d vr15, vr2, vr0 + vilvl.d vr16, vr3, vr1 + vilvh.d vr17, vr3, vr1 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 + identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 + identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 + identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_identity_dct_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr19, vr7, vr9, vr11 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vilvl.h vr4, vr7, vr19 + vilvh.h vr5, vr7, vr19 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr11, vr9 + vilvh.h vr5, vr11, vr9 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \ + vr22, vr15, vr16, vr17, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx 
+ + vilvl.h vr20, vr15, vr13 + vilvl.h vr21, vr18, vr17 + vilvl.w vr0, vr21, vr20 + vilvh.w vr1, vr21, vr20 + vilvh.h vr20, vr15, vr13 + vilvh.h vr21, vr18, vr17 + vilvl.w vr2, vr21, vr20 + vilvh.w vr3, vr21, vr20 + vshuf4i.h vr0, vr0, 0x2d + vshuf4i.h vr1, vr1, 0x2d + vshuf4i.h vr2, vr2, 0x78 + vshuf4i.h vr3, vr3, 0x78 + vilvl.d vr14, vr0, vr2 // in0 + vilvh.d vr15, vr0, vr2 // in1 + vilvl.d vr16, vr1, vr3 // in2 + vilvh.d vr17, vr1, vr3 // in3 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr14, vr14, vr20, vr14, vr14 + identity_4x4_lsx vr15, vr15, vr20, vr15, vr15 + identity_4x4_lsx vr16, vr16, vr20, vr16, vr16 + identity_4x4_lsx vr17, vr17, vr20, vr17, vr17 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr19, vr7, vr9, vr11 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vilvl.h vr4, vr7, vr19 + vilvh.h vr5, vr7, vr19 + vilvl.h vr0, vr5, vr4 + vilvh.h vr1, vr5, vr4 + vilvl.h vr4, vr11, vr9 + vilvh.h vr5, vr11, vr9 + vilvl.h vr2, vr5, vr4 + vilvh.h vr3, vr5, vr4 + + la.local t0, iadst4_coeffs + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 // in0 + vexth.w.h vr11, vr0 // in1 + vsllwil.w.h vr12, vr1, 0 // in2 + vexth.w.h vr13, vr1 // in3 + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr16, vr15, vr14 +endfunc + +function inv_txfm_add_adst_identity_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0 8 16 24 1 9 17 25 in0 in1 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // 2 10 18 26 3 11 19 27 in2 in3 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // 4 12 20 28 5 13 21 29 in4 in5 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7 + + adst8x4_1d_lsx + + vilvl.h vr4, vr17, vr13 + vilvl.h vr5, vr15, vr18 + vilvl.w vr14, vr5, vr4 // in0 in1 + vilvh.w vr16, vr5, vr4 // in2 in3 + vilvh.h vr4, vr18, vr15 + vilvh.h vr5, vr13, vr17 + vilvl.w vr17, vr5, vr4 + vilvh.w vr18, vr5, vr4 + vilvl.d vr10, vr17, vr14 // in0 + vilvh.d vr11, vr17, vr14 // in1 + vilvl.d vr12, 
vr18, vr16 // in2 + vilvh.d vr13, vr18, vr16 // in3 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + + identity_4x4_lsx vr10, vr10, vr20, vr10, vr15 + identity_4x4_lsx vr11, vr11, vr20, vr11, vr16 + identity_4x4_lsx vr12, vr12, vr20, vr12, vr17 + identity_4x4_lsx vr13, vr13, vr20, vr13, vr18 + + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + vsrari.h vr18, vr18, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr15, vr16, vr17, vr18 +endfunc + +function inv_txfm_add_identity_adst_8x4_8bpc_lsx + vld vr0, a2, 0 // in0 + vld vr1, a2, 16 // in1 + vld vr2, a2, 32 // in2 + vld vr3, a2, 48 // in3 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 0 // 2896 + + rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7 + rect2_w4_lsx vr1, vr1, vr20, vr6, vr7 // in1 8 - 15 + rect2_w4_lsx vr2, vr2, vr20, vr8, vr9 // in2 16 - 23 + rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31 + + identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \ + vr0, vr1, vr2, vr3 + + vilvl.h vr4, vr1, vr0 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr1, vr0 // 1 3 5 7 9 11 13 15 + vilvl.h vr0, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr1, vr5, vr4 // 8 9 10 11 12 13 14 15 + vilvl.h vr4, vr3, vr2 // 0 2 4 6 8 10 12 14 + vilvh.h vr5, vr3, vr2 // 1 3 5 7 9 11 13 15 + vilvl.h vr2, vr5, vr4 // 0 1 2 3 4 5 6 7 + vilvh.h vr3, vr5, vr4 // 8 9 10 11 12 13 14 15 + + vreplgr2vr.h vr23, zero + vst vr23, a2, 0 + vst vr23, a2, 16 + vst vr23, a2, 32 + vst vr23, a2, 48 + + la.local t0, iadst4_coeffs + + vldrepl.w vr20, t0, 0 // 1321 + vldrepl.w vr21, t0, 4 // 3803 + vldrepl.w vr22, t0, 8 // 2482 + vldrepl.w vr23, t0, 12 // 3344 + + vsllwil.w.h vr10, vr0, 0 + vexth.w.h vr11, vr0 + vsllwil.w.h vr12, vr1, 0 + vexth.w.h vr13, vr1 + + adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 + + vsllwil.w.h vr14, vr2, 0 + vexth.w.h vr15, vr2 + vsllwil.w.h vr16, vr3, 0 + vexth.w.h vr17, vr3 + + adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17 + + vssrarni.h.w vr14, vr10, 12 + vssrarni.h.w vr15, vr11, 12 + vssrarni.h.w vr16, vr12, 12 + vssrarni.h.w vr17, vr13, 12 + + vsrari.h vr14, vr14, 4 + vsrari.h vr15, vr15, 4 + vsrari.h vr16, vr16, 4 + vsrari.h vr17, vr17, 4 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 +endfunc + +function inv_txfm_add_identity_identity_8x8_8bpc_lsx + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr14, 1 + vsllwil.w.h vr13, vr15, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr14, vr12, 1 // in6 + vssrarni.h.w vr15, vr13, 1 // in7 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \ + vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr12 vr13 + + vsllwil.w.h vr6, vr16, 1 + vsllwil.w.h vr7, vr17, 1 + vsllwil.w.h vr8, vr18, 1 + vsllwil.w.h vr9, vr19, 1 + vsllwil.w.h vr10, vr20, 1 + vsllwil.w.h 
vr11, vr21, 1 + vsllwil.w.h vr12, vr22, 1 + vsllwil.w.h vr13, vr23, 1 + +.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 + vexth.w.h \i, \i +.endr + +.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr16, vr6, 4 // in0 + vssrarni.h.w vr17, vr7, 4 // in1 + vssrarni.h.w vr18, vr8, 4 // in2 + vssrarni.h.w vr19, vr9, 4 // in3 + vssrarni.h.w vr20, vr10, 4 // in4 + vssrarni.h.w vr21, vr11, 4 // in5 + vssrarni.h.w vr22, vr12, 4 // in6 + vssrarni.h.w vr23, vr13, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + +endfunc + +.macro adst8x8_1d_lsx out0, out1, out2, out3 + la.local t0, iadst8_coeffs + + vldrepl.w vr20, t0, 0 // 4076 + vldrepl.w vr21, t0, 4 // 401 + vldrepl.w vr22, t0, 8 // 3612 + vldrepl.w vr23, t0, 12 // 1931 + + // vr13 t0a t1a vr15 t2a t3a + vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15 + vldrepl.w vr20, t0, 16 // 2598 + vldrepl.w vr21, t0, 20 // 3166 + vldrepl.w vr22, t0, 24 // 1189 + vldrepl.w vr23, t0, 28 // 3920 + + // vr18 t4a t5a vr6 t6a t7a + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \ + vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6 + + vsadd.h vr12, vr13, vr18 // t0 t1 + vsadd.h vr14, vr15, vr6 // t2 t3 + vssub.h vr9, vr13, vr18 // t4 t5 + vssub.h vr18, vr15, vr6 // t6 t7 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr7, vr9, 0 // t4 + vexth.w.h vr8, vr9 // t5 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr13 out0 out7 vr17 out1 out6 + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \ + vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19 + vshuf4i.d vr19, vr19, 0x01 + + vsadd.h vr13, vr12, vr14 // out0 out7 + vssub.h vr6, vr12, vr14 // t2 t3 + vsadd.h vr7, vr15, vr19 // out1 out6 + vssub.h vr18, vr15, vr19 // t6 t7 + + vexth.w.h vr20, vr13 // out7 + vsllwil.w.h vr21, vr7, 0 // out1 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out7 out1 + vilvl.d \out0, vr21, vr13 // out0 out7 + vilvh.d \out1, vr7, vr21 // out1 out6 + + vsllwil.w.h vr7, vr6, 0 // t2 + vexth.w.h vr8, vr6 // t3 + vsllwil.w.h vr10, vr18, 0 // t6 + vexth.w.h vr11, vr18 // t7 + + // vr15 out[3] out[4] vr18 out[2] out[5] + vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \ + vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18 + + vexth.w.h vr20, vr18 // out5 + vsllwil.w.h vr21, vr15, 0 // out3 + vneg.w vr20, vr20 + vneg.w vr21, vr21 + vssrarni.h.w vr21, vr20, 0 // out5 out3 + vilvl.d \out2, vr21, vr18 // out2 out5 + vilvh.d \out3, vr15, vr21 // out3 out4 +.endm + +function inv_txfm_add_adst_dct_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, 
zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr14, vr14, 0x1b + vshuf4i.h vr15, vr15, 0x1b + vshuf4i.h vr24, vr24, 0x1b + vshuf4i.h vr25, vr25, 0x1b + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr12 + vexth.w.h vr11, vr13 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr4, vr5, vr12, vr13 + + vshuf4i.d vr5, vr5, 0x01 + vshuf4i.d vr13, vr13, 0x01 + + vsllwil.w.h vr18, vr14, 0 + vsllwil.w.h vr19, vr15, 0 + vsllwil.w.h vr6, vr24, 0 + vsllwil.w.h vr7, vr25, 0 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr14, vr15, vr24, vr25 + + vshuf4i.d vr15, vr15, 0x01 + vshuf4i.d vr25, vr25, 0x01 + + vilvl.d vr20, vr14, vr4 + vilvh.d vr21, vr14, vr4 + vilvl.d vr22, vr15, vr5 + vilvh.d vr23, vr15, vr5 + vilvl.d vr16, vr24, vr12 + vilvh.d vr17, vr24, vr12 + vilvl.d vr18, vr25, vr13 + vilvh.d vr19, vr25, vr13 + +.irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_dct_adst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + + dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 + + vshuf4i.d vr27, vr27, 0x01 + vshuf4i.d vr29, vr29, 0x01 + + vilvl.h vr8, vr27, vr26 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr27, vr26 // 1 3 5 7 9 11 13 15 + vilvl.h vr26, vr9, vr8 // 0 - 7 in0 + vilvh.h vr27, vr9, vr8 // 8 - 15 in1 + vilvl.h vr8, vr29, vr28 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr29, vr28 // 1 3 5 7 9 11 13 15 + vilvl.h vr28, vr9, vr8 // 16 - 23 in2 + vilvh.h vr29, vr9, vr8 // 24 - 31 in3 + + vsrari.h vr26, vr26, 1 // in0low in1low + vsrari.h vr27, vr27, 1 // in2low in3low + vsrari.h vr28, vr28, 1 // in0high in1high + vsrari.h vr29, vr29, 1 // in2high in3high + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 + + vshuf4i.d vr13, vr13, 0x01 + vshuf4i.d vr15, vr15, 0x01 + + vilvl.h vr8, vr13, vr12 // 0 2 4 6 8 10 12 14 + 
vilvh.h vr9, vr13, vr12 // 1 3 5 7 9 11 13 15 + vilvl.h vr12, vr9, vr8 // 0 - 7 in0 + vilvh.h vr13, vr9, vr8 // 8 - 15 in1 + vilvl.h vr8, vr15, vr14 // 0 2 4 6 8 10 12 14 + vilvh.h vr9, vr15, vr14 // 1 3 5 7 9 11 13 15 + vilvl.h vr14, vr9, vr8 // 16 - 23 in2 + vilvh.h vr15, vr9, vr8 // 24 - 31 in3 + + vsrari.h vr0, vr12, 1 // in4low in5low + vsrari.h vr1, vr13, 1 // in6low in7low + vsrari.h vr2, vr14, 1 // in4high in5high + vsrari.h vr3, vr15, 1 // in6high in7high + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr0, 0 // in3 + vexth.w.h vr9, vr0 // in4 + vsllwil.w.h vr10, vr1, 0 // in5 + vexth.w.h vr11, vr1 // in6 + adst8x8_1d_lsx vr26, vr27, vr0, vr1 + + vsllwil.w.h vr18, vr28, 0 // in0 + vexth.w.h vr19, vr28 // in1 + vsllwil.w.h vr6, vr29, 0 // in2 + vexth.w.h vr7, vr29 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr28, vr29, vr16, vr17 + + vilvl.d vr4, vr28, vr26 // 0 ... 7 + vilvl.d vr5, vr29, vr27 // 8 ... 15 + vilvl.d vr6, vr16, vr0 // 16 ... 23 + vilvl.d vr7, vr17, vr1 // 24 ... 31 + vilvh.d vr14, vr17, vr1 // 32 ... 39 + vilvh.d vr15, vr16, vr0 // 40 ... 47 + vilvh.d vr16, vr29, vr27 // 48 ... 55 + vilvh.d vr17, vr28, vr26 // 56 ... 63 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_adst_adst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 // in0 + vexth.w.h vr19, vr1 // in1 + vexth.w.h vr6, vr2 // in2 + vexth.w.h vr7, vr3 // in3 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr16 // in5 + vexth.w.h vr11, vr17 // in6 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr4, vr4, 0x1b + vshuf4i.h vr5, vr5, 0x1b + vshuf4i.h vr24, vr24, 0x1b + vshuf4i.h vr25, vr25, 0x1b + + vsllwil.w.h vr18, vr14, 0 + vsllwil.w.h vr19, vr15, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vexth.w.h vr8, vr14 // in3 + vexth.w.h vr9, vr15 // in4 + vexth.w.h vr10, vr12 // in5 + vexth.w.h vr11, vr13 // in6 + + adst8x8_1d_lsx vr26, vr27, vr0, vr1 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr24, 0 + vsllwil.w.h vr7, vr25, 0 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr24 // in5 + vexth.w.h vr11, vr25 // in6 + + adst8x8_1d_lsx vr24, 
vr25, vr16, vr17 + + vilvl.d vr4, vr24, vr26 // 0 ... 7 + vilvl.d vr5, vr25, vr27 // 8 ... 15 + vilvl.d vr6, vr16, vr0 // 16 ... 23 + vilvl.d vr7, vr17, vr1 // 24 ... 31 + vilvh.d vr14, vr17, vr1 // 32 ... 39 + vilvh.d vr15, vr16, vr0 // 40 ... 47 + vilvh.d vr16, vr25, vr27 // 48 ... 55 + vilvh.d vr17, vr24, vr26 // 56 ... 63 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr0, vr20, vr21 + vilvh.w vr1, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr2, vr20, vr21 + vilvh.w vr3, vr20, vr21 + vshuf4i.h vr2, vr2, 0x1b + vshuf4i.h vr3, vr3, 0x1b + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr24, 0 // in0 + vexth.w.h vr19, vr24 // in1 + vsllwil.w.h vr6, vr25, 0 // in2 + vexth.w.h vr7, vr25 // in3 + vsllwil.w.h vr8, vr0, 0 // in4 + vexth.w.h vr9, vr0 // in5 + vsllwil.w.h vr10, vr1, 0 // in6 + vexth.w.h vr11, vr1 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvl.d vr20, vr0, vr4 // 0 ... 7 + vilvl.d vr21, vr1, vr5 // 8 ... 15 + vilvl.d vr22, vr2, vr16 // 16 ... 23 + vilvl.d vr23, vr3, vr17 // 24 ... 31 + vilvh.d vr14, vr3, vr17 // 32 ... 39 + vilvh.d vr15, vr2, vr16 // 40 ... 47 + vilvh.d vr16, vr1, vr5 // 48 ... 55 + vilvh.d vr17, vr0, vr4 // 56 ... 
63 + +.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr0, vr0, 0x1b + vshuf4i.h vr1, vr1, 0x1b + vshuf4i.h vr2, vr2, 0x1b + vshuf4i.h vr3, vr3, 0x1b + + vsllwil.w.h vr18, vr0, 0 // in0 + vsllwil.w.h vr19, vr1, 0 // in1 + vsllwil.w.h vr6, vr2, 0 // in2 + vsllwil.w.h vr7, vr3, 0 // in3 + vexth.w.h vr8, vr0 // in4 + vexth.w.h vr9, vr1 // in5 + vexth.w.h vr10, vr2 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr24, 0 // in0 + vsllwil.w.h vr19, vr25, 0 // in1 + vsllwil.w.h vr6, vr26, 0 // in2 + vsllwil.w.h vr7, vr27, 0 // in3 + vexth.w.h vr8, vr24 // in4 + vexth.w.h vr9, vr25 // in5 + vexth.w.h vr10, vr26 // in6 + vexth.w.h vr11, vr27 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr20, vr4, vr0 + vilvh.d vr21, vr5, vr1 + vilvh.d vr22, vr16, vr2 + vilvh.d vr23, vr17, vr3 + vilvl.d vr14, vr17, vr3 + vilvl.d vr15, vr16, vr2 + vilvl.d vr18, vr5, vr1 + vilvl.d vr19, vr4, vr0 + +.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr18, vr19 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 
+ vilvl.h vr21, vr14, vr15 + vilvl.w vr0, vr20, vr21 + vilvh.w vr1, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr2, vr20, vr21 + vilvh.w vr3, vr20, vr21 + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsrari.h vr24, vr24, 1 + vsrari.h vr25, vr25, 1 + vsrari.h vr26, vr26, 1 + vsrari.h vr27, vr27, 1 + vsrari.h vr14, vr0, 1 + vsrari.h vr15, vr1, 1 + vsrari.h vr16, vr2, 1 + vsrari.h vr17, vr3, 1 + + vsllwil.w.h vr18, vr26, 0 + vexth.w.h vr19, vr26 + vsllwil.w.h vr6, vr27, 0 + vexth.w.h vr7, vr27 + vsllwil.w.h vr8, vr16, 0 + vexth.w.h vr9, vr16 + vsllwil.w.h vr10, vr17, 0 + vexth.w.h vr11, vr17 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr26, vr27, vr16, vr17 + + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + vshuf4i.h vr16, vr16, 0x1b + vshuf4i.h vr17, vr17, 0x1b + + vsllwil.w.h vr18, vr24, 0 + vexth.w.h vr19, vr24 + vsllwil.w.h vr6, vr25, 0 + vexth.w.h vr7, vr25 + vsllwil.w.h vr8, vr14, 0 + vexth.w.h vr9, vr14 + vsllwil.w.h vr10, vr15, 0 + vexth.w.h vr11, vr15 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr24, vr25, vr14, vr15 + + vilvl.d vr4, vr24, vr26 + vilvh.d vr5, vr24, vr26 + vilvh.d vr6, vr25, vr27 + vilvl.d vr7, vr25, vr27 + vilvl.d vr24, vr14, vr16 + vilvh.d vr25, vr14, vr16 + vilvh.d vr26, vr15, vr17 + vilvl.d vr27, vr15, vr17 + +.irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr24, vr25, vr26, vr27 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 + vshuf4i.d vr27, vr27, 0x01 + vshuf4i.d vr29, vr29, 0x01 + + vilvl.h vr8, vr27, vr26 + vilvh.h vr9, vr27, vr26 + vilvl.h vr26, vr9, vr8 + vilvh.h vr27, vr9, vr8 + vilvl.h vr8, vr29, vr28 + vilvh.h vr9, vr29, vr28 + vilvl.h vr28, vr9, vr8 + vilvh.h vr29, vr9, vr8 + + vsrari.h vr26, vr26, 1 // in0low in1low + vsrari.h vr27, vr27, 1 // in2low in3low + vsrari.h vr28, vr28, 1 // in0high in1high + vsrari.h vr29, vr29, 1 // in2high in3high + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 + vshuf4i.d vr13, vr13, 0x01 + vshuf4i.d vr15, vr15, 0x01 + + vilvl.h vr8, vr13, vr12 + vilvh.h vr9, vr13, vr12 + vilvl.h vr12, vr9, vr8 + vilvh.h vr13, vr9, vr8 + vilvl.h vr8, vr15, vr14 + vilvh.h vr9, vr15, vr14 + 
vilvl.h vr14, vr9, vr8 + vilvh.h vr15, vr9, vr8 + + vsrari.h vr0, vr12, 1 + vsrari.h vr1, vr13, 1 + vsrari.h vr2, vr14, 1 + vsrari.h vr3, vr15, 1 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr28, 0 // in0 + vexth.w.h vr19, vr28 // in1 + vsllwil.w.h vr6, vr29, 0 // in2 + vexth.w.h vr7, vr29 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr0, 0 // in4 + vexth.w.h vr9, vr0 // in5 + vsllwil.w.h vr10, vr1, 0 // in6 + vexth.w.h vr11, vr1 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr26, vr4, vr0 + vilvh.d vr27, vr5, vr1 + vilvh.d vr28, vr16, vr2 + vilvh.d vr29, vr17, vr3 + vilvl.d vr20, vr17, vr3 + vilvl.d vr21, vr16, vr2 + vilvl.d vr22, vr5, vr1 + vilvl.d vr23, vr4, vr0 + +.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr26, vr27, vr28, vr29 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr0, vr20, vr21 + vilvh.w vr1, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr2, vr20, vr21 + vilvh.w vr3, vr20, vr21 + vshuf4i.h vr2, vr2, 0x1b + vshuf4i.h vr3, vr3, 0x1b + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr26, 0 // in0 + vexth.w.h vr19, vr26 // in1 + vsllwil.w.h vr6, vr27, 0 // in2 + vexth.w.h vr7, vr27 // in3 + vsllwil.w.h vr8, vr2, 0 // in4 + vexth.w.h vr9, vr2 // in5 + vsllwil.w.h vr10, vr3, 0 // in6 + vexth.w.h vr11, vr3 // in7 + adst8x8_1d_lsx vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr24, 0 // in0 + vexth.w.h vr19, vr24 // in1 + vsllwil.w.h vr6, vr25, 0 // in2 + vexth.w.h vr7, vr25 // in3 + vsllwil.w.h vr8, vr0, 0 // in4 + vexth.w.h vr9, vr0 // in5 + vsllwil.w.h vr10, vr1, 0 // in6 + vexth.w.h vr11, vr1 // in7 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr24, vr0, vr4 + vilvh.d vr25, vr1, vr5 + vilvh.d vr26, vr2, vr16 + vilvh.d vr27, vr3, vr17 + vilvl.d vr20, vr3, vr17 + vilvl.d 
vr21, vr2, vr16 + vilvl.d vr22, vr1, vr5 + vilvl.d vr23, vr0, vr4 + +.irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr24, vr25, vr26, vr27 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_dct_identity_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + dct_8x4_core_lsx1 vr26, vr27, vr28, vr29 + vshuf4i.d vr27, vr27, 0x01 + vshuf4i.d vr29, vr29, 0x01 + + vilvl.h vr8, vr27, vr26 + vilvh.h vr9, vr27, vr26 + vilvl.h vr26, vr9, vr8 + vilvh.h vr27, vr9, vr8 + vilvl.h vr8, vr29, vr28 + vilvh.h vr9, vr29, vr28 + vilvl.h vr28, vr9, vr8 + vilvh.h vr29, vr9, vr8 + + vsrari.h vr26, vr26, 1 // in0low in1low + vsrari.h vr27, vr27, 1 // in2low in3low + vsrari.h vr28, vr28, 1 // in0high in1high + vsrari.h vr29, vr29, 1 // in2high in3high + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + dct_8x4_core_lsx1 vr12, vr13, vr14, vr15 + + vshuf4i.d vr13, vr13, 0x01 + vshuf4i.d vr15, vr15, 0x01 + + vilvl.h vr8, vr13, vr12 + vilvh.h vr9, vr13, vr12 + vilvl.h vr12, vr9, vr8 + vilvh.h vr13, vr9, vr8 + vilvl.h vr8, vr15, vr14 + vilvh.h vr9, vr15, vr14 + vilvl.h vr14, vr9, vr8 + vilvh.h vr15, vr9, vr8 + + vsrari.h vr20, vr12, 1 + vsrari.h vr21, vr13, 1 + vsrari.h vr22, vr14, 1 + vsrari.h vr23, vr15, 1 + + vreplgr2vr.h vr19, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr19, a2, \i +.endr + // identity8 + vsllwil.w.h vr10, vr26, 1 + vsllwil.w.h vr11, vr27, 1 + vsllwil.w.h vr16, vr28, 1 + vsllwil.w.h vr17, vr29, 1 + vsllwil.w.h vr6, vr20, 1 + vsllwil.w.h vr7, vr21, 1 + vsllwil.w.h vr18, vr22, 1 + vsllwil.w.h vr19, vr23, 1 + +.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 + vexth.w.h \i, \i +.endr + +.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr16, vr10, 4 // in0 + vssrarni.h.w vr28, vr26, 4 // in1 + vssrarni.h.w vr17, vr11, 4 // in2 + vssrarni.h.w vr29, vr27, 4 // in3 + vssrarni.h.w vr18, vr6, 4 // in4 + vssrarni.h.w vr22, vr20, 4 // in5 + vssrarni.h.w vr19, vr7, 4 // in6 + vssrarni.h.w vr23, vr21, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr28, vr17, vr29 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr18, vr22, vr19, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_identity_dct_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, 
vr5, vr24, vr25 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr24, 1 + vsllwil.w.h vr13, vr25, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vslli.w \i, \i, 1 +.endr + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr24, vr12, 1 // in6 + vssrarni.h.w vr25, vr13, 1 // in7 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + + // dct4 in0 in2 in4 in6 + vsllwil.w.h vr18, vr4, 0 + vsllwil.w.h vr19, vr5, 0 + vsllwil.w.h vr6, vr12, 0 + vsllwil.w.h vr7, vr13, 0 + vsllwil.w.h vr8, vr14, 0 + vsllwil.w.h vr9, vr15, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + dct_8x4_core_lsx1 vr16, vr17, vr26, vr27 + + vexth.w.h vr18, vr4 + vexth.w.h vr19, vr5 + vexth.w.h vr6, vr12 + vexth.w.h vr7, vr13 + vexth.w.h vr8, vr14 + vexth.w.h vr9, vr15 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vldrepl.w vr22, t0, 0 // 2896 + dct_8x4_core_lsx1 vr4, vr5, vr24, vr25 + + vilvl.d vr8, vr4, vr16 + vilvh.d vr9, vr4, vr16 + vilvh.d vr6, vr5, vr17 + vilvl.d vr7, vr5, vr17 + vilvl.d vr16, vr24, vr26 + vilvh.d vr17, vr24, vr26 + vilvh.d vr18, vr25, vr27 + vilvl.d vr19, vr25, vr27 + +.irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr8, vr9, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr24, vr20, vr21 + vilvh.w vr25, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr26, vr20, vr21 + vilvh.w vr27, vr20, vr21 + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + + vexth.w.h vr18, vr0 // in0 + vexth.w.h vr19, vr1 // in1 + vexth.w.h vr6, vr2 // in2 + vexth.w.h vr7, vr3 // in3 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr16 // in5 + vexth.w.h vr11, vr17 // in6 + adst8x8_1d_lsx vr12, vr13, vr14, vr15 + + vilvl.h vr20, vr12, vr13 + vilvl.h vr21, vr14, vr15 + vilvl.w vr16, vr20, vr21 + vilvh.w vr17, vr20, vr21 + vilvh.h vr20, vr12, vr13 + vilvh.h vr21, vr14, vr15 + vilvl.w vr18, vr20, vr21 + vilvh.w vr19, vr20, vr21 + vshuf4i.h vr18, 
vr18, 0x1b + vshuf4i.h vr19, vr19, 0x1b + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 + vsrari.h \i, \i, 1 +.endr + + // identity8 + vsllwil.w.h vr20, vr24, 1 + vsllwil.w.h vr21, vr25, 1 + vsllwil.w.h vr12, vr26, 1 + vsllwil.w.h vr13, vr27, 1 + vsllwil.w.h vr22, vr16, 1 + vsllwil.w.h vr23, vr17, 1 + vsllwil.w.h vr14, vr18, 1 + vsllwil.w.h vr15, vr19, 1 + +.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 + vexth.w.h \i, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr20, vr12, 4 // in0 + vssrarni.h.w vr24, vr26, 4 // in1 + vssrarni.h.w vr21, vr13, 4 // in2 + vssrarni.h.w vr25, vr27, 4 // in3 + vssrarni.h.w vr22, vr14, 4 // in4 + vssrarni.h.w vr16, vr18, 4 // in5 + vssrarni.h.w vr23, vr15, 4 // in6 + vssrarni.h.w vr17, vr19, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr24, vr21, vr25 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr22, vr16, vr23, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr24, 1 + vsllwil.w.h vr13, vr25, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr24, vr12, 1 // in6 + vssrarni.h.w vr25, vr13, 1 // in7 + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr0, 0 // in0 + vsllwil.w.h vr19, vr1, 0 // in1 + vsllwil.w.h vr6, vr2, 0 // in2 + vsllwil.w.h vr7, vr3, 0 // in3 + vsllwil.w.h vr8, vr4, 0 // in3 + vsllwil.w.h vr9, vr5, 0 // in4 + vsllwil.w.h vr10, vr24, 0 // in5 + vsllwil.w.h vr11, vr25, 0 // in6 + adst8x8_1d_lsx vr26, vr27, vr28, vr29 + + vexth.w.h vr18, vr0 // in0 + vexth.w.h vr19, vr1 // in1 + vexth.w.h vr6, vr2 // in2 + vexth.w.h vr7, vr3 // in3 + vexth.w.h vr8, vr4 // in3 + vexth.w.h vr9, vr5 // in4 + vexth.w.h vr10, vr24 // in5 + vexth.w.h vr11, vr25 // in6 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvh.d vr4, vr0, vr26 + vilvh.d vr5, vr1, vr27 + vilvh.d vr6, vr2, vr28 + vilvh.d vr7, vr3, vr29 + vilvl.d vr14, vr3, vr29 + vilvl.d vr15, vr2, vr28 + vilvl.d vr16, vr1, vr27 + vilvl.d vr17, vr0, vr26 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 + +endfunc + +function 
inv_txfm_add_adst_identity_8x8_8bpc_lsx + addi.d sp, sp, -32 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17 + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr16, 0 + vsllwil.w.h vr11, vr17, 0 + adst8x8_1d_lsx vr24, vr25, vr26, vr27 + + vexth.w.h vr18, vr0 + vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr16 + vexth.w.h vr11, vr17 + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + +.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3 + vsrari.h \i, \i, 1 +.endr + + LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \ + vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17 + + vshuf4i.h vr26, vr26, 0x1b + vshuf4i.h vr27, vr27, 0x1b + vshuf4i.h vr22, vr22, 0x1b + vshuf4i.h vr23, vr23, 0x1b + + // identity8 + vsllwil.w.h vr16, vr24, 1 + vsllwil.w.h vr17, vr25, 1 + vsllwil.w.h vr10, vr20, 1 + vsllwil.w.h vr11, vr21, 1 + vsllwil.w.h vr18, vr26, 1 + vsllwil.w.h vr19, vr27, 1 + vsllwil.w.h vr14, vr22, 1 + vsllwil.w.h vr15, vr23, 1 + +.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23 + vexth.w.h \i, \i +.endr + +.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr18, vr16, 4 // in0 + vssrarni.h.w vr19, vr17, 4 // in1 + vssrarni.h.w vr14, vr10, 4 // in2 + vssrarni.h.w vr15, vr11, 4 // in3 + vssrarni.h.w vr26, vr24, 4 // in4 + vssrarni.h.w vr27, vr25, 4 // in5 + vssrarni.h.w vr22, vr20, 4 // in6 + vssrarni.h.w vr23, vr21, 4 // in7 + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr18, vr19, vr14, vr15 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr26, vr27, vr22, vr23 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + addi.d sp, sp, 32 +endfunc + +function inv_txfm_add_identity_adst_8x8_8bpc_lsx + addi.d sp, sp, -48 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + + // identity8 + vsllwil.w.h vr6, vr0, 1 + vsllwil.w.h vr7, vr1, 1 + vsllwil.w.h vr8, vr2, 1 + vsllwil.w.h vr9, vr3, 1 + vsllwil.w.h vr10, vr4, 1 + vsllwil.w.h vr11, vr5, 1 + vsllwil.w.h vr12, vr24, 1 + vsllwil.w.h vr13, vr25, 1 + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vexth.w.h \i, \i +.endr + +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w vr0, vr6, 1 // in0 + vssrarni.h.w vr1, vr7, 1 // in1 + vssrarni.h.w vr2, vr8, 1 // in2 + vssrarni.h.w vr3, vr9, 1 // in3 + vssrarni.h.w vr4, vr10, 1 // in4 + vssrarni.h.w vr5, vr11, 1 // in5 + vssrarni.h.w vr24, vr12, 1 // in6 + vssrarni.h.w vr25, vr13, 1 // in7 + + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \ + vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13 + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + vsllwil.w.h vr18, vr0, 0 + vsllwil.w.h vr19, vr1, 0 + vsllwil.w.h vr6, vr2, 0 + vsllwil.w.h vr7, vr3, 0 + vsllwil.w.h vr8, vr4, 0 + vsllwil.w.h vr9, vr5, 0 + vsllwil.w.h vr10, vr24, 0 + vsllwil.w.h vr11, vr25, 0 + adst8x8_1d_lsx vr26, vr27, vr28, vr29 + + vexth.w.h vr18, vr0 + 
vexth.w.h vr19, vr1 + vexth.w.h vr6, vr2 + vexth.w.h vr7, vr3 + vexth.w.h vr8, vr4 + vexth.w.h vr9, vr5 + vexth.w.h vr10, vr24 + vexth.w.h vr11, vr25 + + adst8x8_1d_lsx vr0, vr1, vr2, vr3 + + vilvl.d vr4, vr0, vr26 // 0 ... 7 + vilvl.d vr5, vr1, vr27 // 8 ... 15 + vilvl.d vr6, vr2, vr28 // 16 ... 23 + vilvl.d vr7, vr3, vr29 // 24 ... 31 + vilvh.d vr14, vr3, vr29 // 32 ... 39 + vilvh.d vr15, vr2, vr28 // 40 ... 47 + vilvh.d vr16, vr1, vr27 // 48 ... 55 + vilvh.d vr17, vr0, vr26 // 56 ... 63 + +.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr16, vr17 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + addi.d sp, sp, 48 +endfunc + +.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w \out0, vr22, \in2 + vmul.w \out1, vr23, \in2 + vsllwil.w.h vr22, \in1, 0 + vexth.w.h vr23, \in1 + vmadd.w \out0, vr22, \in3 + vmadd.w \out1, vr23, \in3 +.endm + +.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w \out0, vr22, \in2 + vmul.w \out1, vr23, \in2 + vsllwil.w.h vr22, \in1, 0 + vexth.w.h vr23, \in1 + vmsub.w \out0, vr22, \in3 + vmsub.w \out1, vr23, \in3 +.endm + +.macro rect2_lsx in0, in1, out0 + vsllwil.w.h vr22, \in0, 0 // in1 + vexth.w.h \in0, \in0 // in1 + vmul.w vr22, vr22, \in1 + vmul.w \out0, \in0, \in1 + vssrarni.h.w \out0, vr22, 12 +.endm + +.macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7, rect2 + + la.local t0, idct_coeffs + +.ifc \rect2, rect2_lsx + vldrepl.w vr23, t0, 0 // 2896 +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + rect2_lsx \i, vr23, \i +.endr +.endif + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + + vmul_vmadd_w \in2, \in6, vr21, vr20, vr8, vr9 + vssrarni.h.w vr9, vr8, 12 // t3 + vmul_vmsub_w \in2, \in6, vr20, vr21, vr8, vr10 + vssrarni.h.w vr10, vr8, 12 // t2 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w \in0, \in4, vr20, vr20, vr8, \in2 + vssrarni.h.w \in2, vr8, 12 // t0 + vmul_vmsub_w \in0, \in4, vr20, vr20, vr8, \in6 + vssrarni.h.w \in6, vr8, 12 // t1 + + vsadd.h vr8, \in2, vr9 // c[0] + vssub.h vr9, \in2, vr9 // c[3] + vsadd.h \in0, \in6, vr10 // c[1] + vssub.h vr10, \in6, vr10 // c[2] + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w \in1, \in7, vr21, vr20, \in2, \in4 + vssrarni.h.w \in4, \in2, 12 // t7a + vmul_vmsub_w \in1, \in7, vr20, vr21, \in2, \in6 + vssrarni.h.w \in6, \in2, 12 // t4a + + vldrepl.w vr20, t0, 24 // 3406 + vldrepl.w vr21, t0, 28 // 2276 + vmul_vmadd_w \in5, \in3, vr21, vr20, \in2, \in1 + vssrarni.h.w \in1, \in2, 12 // t6a + vmul_vmsub_w \in5, \in3, vr20, vr21, \in2, \in7 + vssrarni.h.w \in7, \in2, 12 // t5a + + vsadd.h \in3, \in6, \in7 // t4 + vssub.h \in6, \in6, \in7 // t5a + vsadd.h \in5, \in4, \in1 // t7 + vssub.h \in4, \in4, \in1 // t6a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w \in4, \in6, vr20, vr20, \in2, \in1 + vssrarni.h.w \in1, \in2, 12 // t6 + vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 + vssrarni.h.w \in7, \in2, 12 // t5 + + vsadd.h \out0, vr8, \in5 // c[0] + vssub.h \out7, vr8, \in5 // c[7] + vsadd.h \out1, \in0, \in1 // c[1] + vssub.h \out6, \in0, \in1 // c[6] + vsadd.h \out2, vr10, \in7 // c[2] + vssub.h \out5, vr10, \in7 // c[5] + vsadd.h \out3, 
vr9, \in3 // c[3] + vssub.h \out4, vr9, \in3 // c[4] +.endm + +function inv_txfm_add_dct_dct_8x8_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x8 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 + vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift + vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + b .DCT_DCT_8X8_END + +.NO_HAS_DCONLY_8x8: + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112 + vst vr23, a2, \i +.endr + + dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2 + +.irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 + +.DCT_DCT_8X8_END: + +endfunc + +.macro dct_8x16_core_lsx + dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 32 // 401 + vldrepl.w vr21, t0, 36 // 4076 + vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10 + vssrarni.h.w vr10, vr0, 12 // t15a + vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29 + vssrarni.h.w vr29, vr0, 12 // t8a + + vldrepl.w vr20, t0, 40 // 3166 -> 1583 + vldrepl.w vr21, t0, 44 // 2598 -> 1299 + vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t14a + vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t9a + + vldrepl.w vr20, t0, 48 // 1931 + vldrepl.w vr21, t0, 52 // 3612 + vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24 + vssrarni.h.w vr24, vr0, 12 // t13a + vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25 + vssrarni.h.w vr25, vr0, 12 // t10a + + vldrepl.w vr20, t0, 56 // 3920 + vldrepl.w vr21, t0, 60 // 1189 + vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t12a + vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t11a + + // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 + vsadd.h vr28, vr29, vr31 // t8 + vssub.h vr19, vr29, vr31 // t9 + vssub.h vr29, vr27, vr25 // t10 + vsadd.h vr9, vr27, vr25 // t11 + vsadd.h vr31, vr26, vr24 // t12 + vssub.h vr25, vr26, vr24 // t13 + vssub.h vr27, vr10, vr30 // t14 + vsadd.h vr24, vr10, vr30 // t15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t14a + vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t9a + + vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 + vneg.w vr0, vr0 + vneg.w 
vr19, vr19 + vssrarni.h.w vr19, vr0, 12 // t10a + vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t13a + + vsadd.h vr25, vr28, vr9 // t8a + vssub.h vr29, vr28, vr9 // t11a + vssub.h vr28, vr24, vr31 // t12a + vsadd.h vr10, vr24, vr31 // t15a + vsadd.h vr9, vr30, vr19 // t9 + vssub.h vr31, vr30, vr19 // t10 + vssub.h vr30, vr26, vr27 // t13 + vsadd.h vr24, vr26, vr27 // t14 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t13a + vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t10a + + vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t12 + vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t11 + + // vr11 vr12 ... vr18 + vsadd.h vr28, vr14, vr31 // c[3] + vssub.h vr29, vr14, vr31 // c[12] + vsadd.h vr20, vr15, vr30 // c[4] + vssub.h vr21, vr15, vr30 // c[11] + vsadd.h vr14, vr16, vr27 // c[5] + vssub.h vr23, vr16, vr27 // c[10] + vsadd.h vr15, vr17, vr9 // c[6] + vssub.h vr30, vr17, vr9 // c[9] + vsadd.h vr16, vr18, vr25 // c[7] + vssub.h vr27, vr18, vr25 // c[8] + vsadd.h vr17, vr13, vr26 // c[2] + vssub.h vr26, vr13, vr26 // c[13] + vsadd.h vr18, vr12, vr24 // c[1] + vssub.h vr25, vr12, vr24 // c[14] + vsadd.h vr22, vr11, vr10 // c[0] + vssub.h vr24, vr11, vr10 // c[15] +.endm + +function inv_txfm_add_dct_dct_8x16_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x16 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 + vmul.w vr2, vr0, vr2 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift + vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 + + b .DCT_DCT_8X16_END + +.NO_HAS_DCONLY_8x16: + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx + + vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr8, vr9, vr10, vr20, 
vr21, vr22, vr23, vr31 + + dct_8x16_core_lsx + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr22, vr18, vr17, vr28 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr14, vr15, vr16 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr27, vr30, vr23, vr21 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr29, vr26, vr25, vr24 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +.DCT_DCT_8X16_END: +endfunc + +.macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2 + + la.local t0, idct_coeffs + +.ifc \rect2, rect2_lsx + vldrepl.w vr23, t0, 0 // 2896 +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + rect2_lsx \i, vr23, \i +.endr +.endif + vsllwil.w.h vr8, \in0, 1 + vsllwil.w.h vr9, \in1, 1 + vsllwil.w.h vr10, \in2, 1 + vsllwil.w.h vr11, \in3, 1 + vsllwil.w.h vr12, \in4, 1 + vsllwil.w.h vr13, \in5, 1 + vsllwil.w.h vr14, \in6, 1 + vsllwil.w.h vr15, \in7, 1 + +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vexth.w.h \i, \i +.endr + +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vslli.w \i, \i, 1 +.endr + + vssrarni.h.w \in0, vr8, 1 + vssrarni.h.w \in1, vr9, 1 + vssrarni.h.w \in2, vr10, 1 + vssrarni.h.w \in3, vr11, 1 + vssrarni.h.w \in4, vr12, 1 + vssrarni.h.w \in5, vr13, 1 + vssrarni.h.w \in6, vr14, 1 + vssrarni.h.w \in7, vr15, 1 +.endm + +.macro identity_8x16_core_lsx in0, out0 + vsadd.h vr10, \in0, \in0 + vsllwil.w.h vr8, \in0, 0 + vexth.w.h \out0, \in0 + vmul.w vr8, vr8, vr20 + vmul.w \out0, \out0, vr20 + vssrarni.h.w \out0, vr8, 11 + vsadd.h \out0, \out0, vr10 +.endm + +function inv_txfm_add_identity_identity_8x16_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx + + vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27 + + identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + + LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \ + vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21 + + LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \ + vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21 + + li.w t0, 1697 + vreplgr2vr.w vr20, t0 + +.irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \ + vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27 + identity_8x16_core_lsx \i, \i + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr14, vr15, vr22, vr23 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr16, vr18, vr24, vr26 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr28, vr29, vr30, vr31 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr17, vr19, vr25, vr27 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + 
fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +.macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7, rect2 + + la.local t0, iadst8_coeffs + +.ifc \rect2, rect2_lsx + vldrepl.w vr23, t0, 32 // 2896 +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + rect2_lsx \i, vr23, \i +.endr +.endif + + vldrepl.w vr20, t0, 0 // 4076 + vldrepl.w vr21, t0, 4 // 401 + + vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9 + vssrarni.h.w vr9, vr8, 12 // t0a low + vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10 + vssrarni.h.w vr10, vr8, 12 // t1a low + + vldrepl.w vr20, t0, 8 // 3612 + vldrepl.w vr21, t0, 12 // 1931 + vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0 + vssrarni.h.w vr0, vr8, 12 // t2a low + vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7 + vssrarni.h.w vr7, vr8, 12 // t3a low + + vldrepl.w vr20, t0, 16 // 2598 -> 1299 + vldrepl.w vr21, t0, 20 // 3166 -> 1583 + vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2 + vssrarni.h.w vr2, vr8, 12 // t4a low + vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5 + vssrarni.h.w vr5, vr8, 12 // t5a low + + vldrepl.w vr20, t0, 24 // 1189 + vldrepl.w vr21, t0, 28 // 3920 + vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3 + vssrarni.h.w vr3, vr8, 12 // t6a low + vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4 + vssrarni.h.w vr4, vr8, 12 // t7a low + + vsadd.h vr1, vr9, vr2 // t0 + vssub.h vr6, vr9, vr2 // t4 + vsadd.h vr8, vr10, vr5 // t1 + vssub.h vr2, vr10, vr5 // t5 + vsadd.h vr9, vr0, vr3 // t2 + vssub.h vr5, vr0, vr3 // t6 + vsadd.h vr10, vr7, vr4 // t3 + vssub.h vr0, vr7, vr4 // t7 + + vldrepl.w vr20, t0, 40 // 1567 + vldrepl.w vr21, t0, 44 // 3784 + vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4 + vssrarni.h.w vr4, vr3, 12 // t4a low + vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7 + vssrarni.h.w vr7, vr3, 12 // t5a low + + vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2 + vssrarni.h.w vr2, vr3, 12 // t7a low + vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6 + vssrarni.h.w vr6, vr3, 12 // t6a low + + vsadd.h \out0, vr1, vr9 // out[0] + vssub.h vr5, vr1, vr9 // t2 + vsadd.h vr3, vr8, vr10 // out[7] + vssub.h vr1, vr8, vr10 // t3 + vexth.w.h vr9, vr3 + vsllwil.w.h vr21, vr3, 0 + vneg.w \out7, vr9 + vneg.w vr21, vr21 + vssrarni.h.w \out7, vr21, 0 // out[7] + + vsadd.h vr8, vr4, vr6 // out[1] + vssub.h vr10, vr4, vr6 // t6 + vexth.w.h vr20, vr8 + vsllwil.w.h vr21, vr8, 0 + vneg.w \out1, vr20 + vneg.w vr21, vr21 + vssrarni.h.w \out1, vr21, 0 // out[1] + vsadd.h \out6, vr7, vr2 // out[6] + vssub.h vr4, vr7, vr2 // t7 + + vldrepl.w vr20, t0, 32 // 2896 + vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6 + vssrarni.h.w vr6, vr9, 12 // out[3] + vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4 + vssrarni.h.w \out4, vr9, 12 // out[4] + + vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2 + vssrarni.h.w \out2, vr9, 12 // out[2] + vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5 + vssrarni.h.w vr5, vr9, 12 // out[5] + + vexth.w.h vr20, vr6 + vsllwil.w.h vr21, vr6, 0 + vneg.w \out3, vr20 + vneg.w vr21, vr21 + vssrarni.h.w \out3, vr21, 0 // out[3] + + vexth.w.h vr20, vr5 + vsllwil.w.h vr21, vr5, 0 + vneg.w \out5, vr20 + vneg.w vr21, vr21 + vssrarni.h.w \out5, vr21, 0 // out[5] +.endm + +function inv_txfm_add_adst_dct_8x16_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, 
vr17, vr18, rect2_lsx + + vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 32 // 401 + vldrepl.w vr21, t0, 36 // 4076 + vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10 + vssrarni.h.w vr10, vr0, 12 // t15a + vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29 + vssrarni.h.w vr29, vr0, 12 // t8a + + vldrepl.w vr20, t0, 40 // 3166 -> 1583 + vldrepl.w vr21, t0, 44 // 2598 -> 1299 + vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t14a + vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t9a + + vldrepl.w vr20, t0, 48 // 1931 + vldrepl.w vr21, t0, 52 // 3612 + vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24 + vssrarni.h.w vr24, vr0, 12 // t13a + vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25 + vssrarni.h.w vr25, vr0, 12 // t10a + + vldrepl.w vr20, t0, 56 // 3920 + vldrepl.w vr21, t0, 60 // 1189 + vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t12a + vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t11a + + // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 + vsadd.h vr28, vr29, vr31 // t8 + vssub.h vr19, vr29, vr31 // t9 + vssub.h vr29, vr27, vr25 // t10 + vsadd.h vr9, vr27, vr25 // t11 + vsadd.h vr31, vr26, vr24 // t12 + vssub.h vr25, vr26, vr24 // t13 + vssub.h vr27, vr10, vr30 // t14 + vsadd.h vr24, vr10, vr30 // t15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t14a + vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t9a + + vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 + vneg.w vr0, vr0 + vneg.w vr19, vr19 + vssrarni.h.w vr19, vr0, 12 // t10a + vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t13a + + vsadd.h vr25, vr28, vr9 // t8a + vssub.h vr29, vr28, vr9 // t11a + vssub.h vr28, vr24, vr31 // t12a + vsadd.h vr10, vr24, vr31 // t15a + vsadd.h vr9, vr30, vr19 // t9 + vssub.h vr31, vr30, vr19 // t10 + vssub.h vr30, vr26, vr27 // t13 + vsadd.h vr24, vr26, vr27 // t14 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 + vssrarni.h.w vr26, vr0, 12 // t13a + vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27 + vssrarni.h.w vr27, vr0, 12 // t10a + + vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 + vssrarni.h.w vr31, vr0, 12 // t12 + vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30 + vssrarni.h.w vr30, vr0, 12 // t11 + + // vr11 vr12 ... 
vr18 + vsadd.h vr28, vr14, vr31 // c[3] + vssub.h vr29, vr14, vr31 // c[12] + vsadd.h vr20, vr15, vr30 // c[4] + vssub.h vr21, vr15, vr30 // c[11] + vsadd.h vr14, vr16, vr27 // c[5] + vssub.h vr23, vr16, vr27 // c[10] + vsadd.h vr15, vr17, vr9 // c[6] + vssub.h vr30, vr17, vr9 // c[9] + vsadd.h vr16, vr18, vr25 // c[7] + vssub.h vr27, vr18, vr25 // c[8] + vsadd.h vr17, vr13, vr26 // c[2] + vssub.h vr26, vr13, vr26 // c[13] + vsadd.h vr18, vr12, vr24 // c[1] + vssub.h vr25, vr12, vr24 // c[14] + vsadd.h vr22, vr11, vr10 // c[0] + vssub.h vr24, vr11, vr10 // c[15] + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 4 +.endr + + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr22, vr18, vr17, vr28 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr20, vr14, vr15, vr16 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr27, vr30, vr23, vr21 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr29, vr26, vr25, vr24 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +const iadst16_coeffs, align=4 + .word 4091, 201, 3973, 995 + .word 3703, 1751, 3290, 2440 + .word 2751, 3035, 2106, 3513 + .word 1380, 3857, 601, 4052 +endconst + +.macro adst16_core_lsx transpose8x8, shift, vst + la.local t0, iadst16_coeffs + vldrepl.w vr20, t0, 0 // 4091 + vldrepl.w vr21, t0, 4 // 201 + + vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18 + vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19 + vssrarni.h.w vr18, vr16, 12 // t0 + vssrarni.h.w vr19, vr17, 12 // t1 + + vldrepl.w vr20, t0, 8 // 3973 + vldrepl.w vr21, t0, 12 // 995 + vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0 + vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15 + vssrarni.h.w vr0, vr16, 12 // t2 + vssrarni.h.w vr15, vr17, 12 // t3 + + vldrepl.w vr20, t0, 16 // 3703 + vldrepl.w vr21, t0, 20 // 1751 + vmul_vmadd_w vr11, vr4, vr20, vr21, vr16, vr2 + vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13 + vssrarni.h.w vr2, vr16, 12 // t4 + vssrarni.h.w vr13, vr17, 12 // t5 + + vldrepl.w vr20, t0, 24 // 3290 -> 1645 + vldrepl.w vr21, t0, 28 // 2440 -> 1220 + vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4 + vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11 + vssrarni.h.w vr4, vr16, 12 // t6 + vssrarni.h.w vr11, vr17, 12 // t7 + + vldrepl.w vr20, t0, 32 // 2751 + vldrepl.w vr21, t0, 36 // 3035 + vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6 + vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9 + vssrarni.h.w vr6, vr16, 12 // t8 + vssrarni.h.w vr9, vr17, 12 // t9 + + vldrepl.w vr20, t0, 40 // 2106 + vldrepl.w vr21, t0, 44 // 3513 + vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7 + vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8 + vssrarni.h.w vr7, vr16, 12 // t10 + vssrarni.h.w vr8, vr17, 12 // t11 + + vldrepl.w vr20, t0, 48 // 1380 + vldrepl.w vr21, t0, 52 // 3857 + vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5 + vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10 + vssrarni.h.w vr5, vr16, 12 // t12 + vssrarni.h.w vr10, vr17, 12 // t13 + + vldrepl.w vr20, t0, 56 // 601 + vldrepl.w vr21, t0, 60 // 4052 + vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3 + vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12 + vssrarni.h.w vr3, vr16, 12 // t14 + vssrarni.h.w vr12, vr17, 12 // t15 + + vsadd.h vr1, vr18, vr6 // t0a + vssub.h vr14, vr18, vr6 // t8a + vsadd.h vr16, vr19, vr9 // t1a + vssub.h vr17, vr19, vr9 // t9a + vsadd.h vr6, vr0, vr7 // t2a + vssub.h 
vr18, vr0, vr7 // t10a + vsadd.h vr9, vr15, vr8 // t3a + vssub.h vr19, vr15, vr8 // t11a + vsadd.h vr0, vr2, vr5 // t4a + vssub.h vr7, vr2, vr5 // t12a + vsadd.h vr8, vr13, vr10 // t5a + vssub.h vr15, vr13, vr10 // t13a + vsadd.h vr2, vr4, vr3 // t6a + vssub.h vr5, vr4, vr3 // t14a + vsadd.h vr10, vr11, vr12 // t7a + vssub.h vr13, vr11, vr12 // t15a + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11 + vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12 + vssrarni.h.w vr11, vr3, 12 // t8 + vssrarni.h.w vr12, vr4, 12 // t9 + + vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14 + vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17 + vssrarni.h.w vr14, vr3, 12 // t13 + vssrarni.h.w vr17, vr4, 12 // t12 + + vldrepl.w vr20, t0, 24 // 3406 + vldrepl.w vr21, t0, 28 // 2276 + vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7 + vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15 + vssrarni.h.w vr7, vr3, 12 // t10 + vssrarni.h.w vr15, vr4, 12 // t11 + + vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18 + vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19 + vssrarni.h.w vr18, vr3, 12 // t15 + vssrarni.h.w vr19, vr4, 12 // t14 + + vsadd.h vr5, vr1, vr0 // t0 + vssub.h vr13, vr1, vr0 // t4 + vsadd.h vr3, vr16, vr8 // t1 + vssub.h vr4, vr16, vr8 // t5 + vsadd.h vr0, vr6, vr2 // t2 + vssub.h vr1, vr6, vr2 // t6 + vsadd.h vr8, vr9, vr10 // t3 + vssub.h vr16, vr9, vr10 // t7 + vsadd.h vr2, vr11, vr17 // t8a + vssub.h vr6, vr11, vr17 // t12a + vsadd.h vr9, vr12, vr14 // t9a + vssub.h vr10, vr12, vr14 // t13a + vsadd.h vr11, vr7, vr19 // t10a + vssub.h vr17, vr7, vr19 // t14a + vsadd.h vr12, vr15, vr18 // t11a + vssub.h vr14, vr15, vr18 // t15a + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18 + vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19 + vssrarni.h.w vr18, vr7, 12 // t4a + vssrarni.h.w vr19, vr15, 12 // t5a + + vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4 + vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13 + vssrarni.h.w vr4, vr7, 12 // t7a + vssrarni.h.w vr13, vr15, 12 // t6a + + vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1 + vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16 + vssrarni.h.w vr1, vr7, 12 // t12 + vssrarni.h.w vr16, vr15, 12 // t13 + + vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6 + vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10 + vssrarni.h.w vr6, vr7, 12 // t15 + vssrarni.h.w vr10, vr15, 12 // t14 + + vsadd.h vr14, vr5, vr0 // out[0] + vssub.h vr17, vr5, vr0 // t2a + vssub.h vr7, vr3, vr8 // t3a + vsadd.h vr15, vr3, vr8 // out[15] + vsllwil.w.h vr22, vr15, 0 + vexth.w.h vr15, vr15 + vneg.w vr22, vr22 + vneg.w vr15, vr15 + vssrarni.h.w vr15, vr22, 0 // out[15] + vsadd.h vr14, vr5, vr0 // out[0] + vssub.h vr17, vr5, vr0 // t2a + vssub.h vr7, vr3, vr8 // t3a + + vsadd.h vr3, vr19, vr4 // out[12] + vssub.h vr8, vr19, vr4 // t7 + vssub.h vr0, vr18, vr13 // t6 + vsadd.h vr5, vr18, vr13 // out[3] + vsllwil.w.h vr22, vr5, 0 + vexth.w.h vr5, vr5 + vneg.w vr22, vr22 + vneg.w vr5, vr5 + vssrarni.h.w vr5, vr22, 0 // out[3] + + vsadd.h vr13, vr9, vr12 // out[14] + vssub.h vr19, vr9, vr12 // t11 + vssub.h vr4, vr2, vr11 // t10 + vsadd.h vr18, vr2, vr11 // out[1] + vsllwil.w.h vr22, vr18, 0 + vexth.w.h vr18, vr18 + vneg.w vr22, vr22 + vneg.w vr18, vr18 + vssrarni.h.w vr18, vr22, 0 // out[1] + + vsadd.h vr2, vr1, vr10 // out[2] + vssub.h vr11, vr1, vr10 // t14a + vssub.h vr12, vr16, vr6 // t15a + vsadd.h vr9, vr16, vr6 // out[13] + vsllwil.w.h vr22, vr9, 0 
+ vexth.w.h vr9, vr9 + vneg.w vr22, vr22 + vneg.w vr9, vr9 + vssrarni.h.w vr9, vr22, 0 // out[13] + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10 + vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1 + vssrarni.h.w vr10, vr6, 12 // out[7] + + vsllwil.w.h vr7, vr10, 0 + vexth.w.h vr10, vr10 + vneg.w vr7, vr7 + vneg.w vr10, vr10 + vssrarni.h.w vr10, vr7, 0 + vssrarni.h.w vr1, vr16, 12 // out[8] + + vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17 + vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7 + vssrarni.h.w vr17, vr16, 12 // out[11] + + vsllwil.w.h vr0, vr17, 0 + vexth.w.h vr17, vr17 + vneg.w vr0, vr0 + vneg.w vr17, vr17 + vssrarni.h.w vr17, vr0, 0 + vssrarni.h.w vr7, vr6, 12 // out[4] + + vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0 + vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8 + vssrarni.h.w vr0, vr16, 12 // out[9] + + vsllwil.w.h vr4, vr0, 0 + vexth.w.h vr0, vr0 + vneg.w vr4, vr4 + vneg.w vr0, vr0 + vssrarni.h.w vr0, vr4, 0 + vssrarni.h.w vr8, vr6, 12 // out[6] + + vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4 + vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19 + vssrarni.h.w vr4, vr6, 12 // out[5] + + vsllwil.w.h vr24, vr4, 0 + vexth.w.h vr4, vr4 + vneg.w vr24, vr24 + vneg.w vr4, vr4 + vssrarni.h.w vr4, vr24, 0 + vssrarni.h.w vr19, vr16, 12 // out[10] + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 + + LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ + vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + vsrari.h \i, \i, \shift +.endr +.endif + +.ifnb \vst + vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 +.endif +// out0 out1 out2 out3 out4 out5 out6 out7 +// vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 +// out8 out9 out10 out11 out12 out13 out14 out15 +// vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 +.endm // adst16_core_lsx + +.macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7 + fld.d f20, t2, 0 + fldx.d f21, t2, a1 + fld.d f22, t3, 0 + fldx.d f23, t3, a1 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + fld.d f24, t2, 0 + fldx.d f25, t2, a1 + fld.d f26, t3, 0 + fldx.d f27, t3, a1 + +.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27 + vsllwil.hu.bu \i, \i, 0 +.endr + +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vsrari.h \i, \i, 4 +.endr + + vadd.h vr20, vr20, \in0 + vadd.h vr21, vr21, \in1 + vadd.h vr22, vr22, \in2 + vadd.h vr23, vr23, \in3 + vadd.h vr24, vr24, \in4 + vadd.h vr25, vr25, \in5 + vadd.h vr26, vr26, \in6 + vadd.h vr27, vr27, \in7 + + vssrani.bu.h vr21, vr20, 0 + vssrani.bu.h vr23, vr22, 0 + vssrani.bu.h vr25, vr24, 0 + vssrani.bu.h vr27, vr26, 0 + + vstelm.d vr21, t4, 0, 0 + vstelm.d vr21, t5, 0, 1 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + vstelm.d vr23, t4, 0, 0 + vstelm.d vr23, t5, 0, 1 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + vstelm.d vr25, t4, 0, 0 + vstelm.d vr25, t5, 0, 1 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + vstelm.d vr27, t4, 0, 0 + vstelm.d vr27, t5, 0, 1 + +.endm // adst16_core_finish_lsx + +function inv_txfm_add_dct_adst_8x16_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + 
vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx + + vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + vsrari.h \i, \i, 1 +.endr + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31 + + LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ + vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31 + + adst16_core_lsx , , + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +.macro malloc_space number + li.w t0, \number + sub.d sp, sp, t0 + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 +.endm + +.macro free_space number + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + li.w t0, \number + add.d sp, sp, t0 + addi.d sp, sp, 64 +.endm + +.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11 + vsllwil.hu.bu vr10, \in0, 0 + vexth.hu.bu vr0, \in0 + vsllwil.hu.bu vr11, \in1, 0 + vexth.hu.bu vr1, \in1 + vsllwil.hu.bu vr12, \in2, 0 + vexth.hu.bu vr2, \in2 + vsllwil.hu.bu vr13, \in3, 0 + vexth.hu.bu vr3, \in3 + vadd.h vr10, vr10, \in4 + vadd.h vr0, vr0, \in5 + vadd.h vr11, vr11, \in6 + vadd.h vr1, vr1, \in7 + vadd.h vr12, vr12, \in8 + vadd.h vr2, vr2, \in9 + vadd.h vr13, vr13, \in10 + vadd.h vr3, vr3, \in11 + vssrani.bu.h vr0, vr10, 0 + vssrani.bu.h vr1, vr11, 0 + vssrani.bu.h vr2, vr12, 0 + vssrani.bu.h vr3, vr13, 0 + vst vr0, a0, 0 + vstx vr1, a0, a1 + vst vr2, t2, 0 + vstx vr3, t2, a1 +.endm + +.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift + +.ifnb \shift +.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 + vsrari.h \i, \i, \shift +.endr +.endif + + vld vr0, a0, 0 + vldx vr1, a0, a1 + vld vr2, t2, 0 + vldx vr3, t2, a1 + DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \ + \in4, \in5, \in6, \in7 +.endm + +function inv_txfm_add_dct_dct_16x8_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_16x8 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + alsl.d t2, a1, a0, 1 + vmul.w vr2, vr2, vr0 + vldx vr1, a0, a1 + vsrari.w vr2, vr2, 8 + vldx vr3, t2, a1 + vsrari.w vr2, vr2, 1 // (dc + rnd) >> shift + vmadd.w vr5, vr2, vr0 + vld vr0, a0, 0 
+ vssrarni.h.w vr5, vr5, 12 + vld vr2, t2, 0 + + DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + b .DCT_DCT_16x8_END + +.NO_HAS_DCONLY_16x8: + malloc_space 512 + + vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + la.local t0, idct_coeffs + + vldrepl.w vr23, t0, 0 //2896 +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + rect2_lsx \i, vr23, \i +.endr + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \ + vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 1 +.endr + + vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \ + vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2 + + dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \ + vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4 + + free_space 512 + +.DCT_DCT_16x8_END: + +endfunc + +function inv_txfm_add_adst_dct_16x8_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + la.local t0, idct_coeffs + + vldrepl.w vr23, t0, 0 //2896 +.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + rect2_lsx \i, vr23, \i +.endr + + adst16_core_lsx , 1, + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \ + vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 + + LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \ + vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \ + vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 + vst vr23, a2, \i +.endr + + dct_8x8_core_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \ + vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2 + + dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \ + vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2 + + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, 
vr24, 4 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +function inv_txfm_add_dct_dct_16x16_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_16x16 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + alsl.d t2, a1, a0, 1 + vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift + vldx vr1, a0, a1 + vmadd.w vr5, vr2, vr0 + vldx vr3, t2, a1 + vssrarni.h.w vr5, vr5, 12 + vld vr0, a0, 0 + vld vr2, t2, 0 + + DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5, + + b .DCT_DCT_16x16_END + +.NO_HAS_DCONLY_16x16: + + malloc_space 512 + + vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 + vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + alsl.d t2, a1, a0, 1 + vld vr4, sp, 64 + vld vr5, sp, 80 + vld vr6, sp, 96 + vld vr7, sp, 112 + VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 128 + vld vr5, sp, 144 + vld vr6, sp, 160 + vld vr7, sp, 
176 + VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 320 + vld vr5, sp, 336 + vld vr6, sp, 352 + vld vr7, sp, 368 + VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 384 + vld vr5, sp, 400 + vld vr6, sp, 416 + vld vr7, sp, 432 + VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4 + + free_space 512 + +.DCT_DCT_16x16_END: +endfunc + +function inv_txfm_add_adst_adst_16x16_8bpc_lsx + + malloc_space 256+256 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + addi.d t2, a2, 16 + addi.d t1, t1, 256 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + vreplgr2vr.h vr23, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr23, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + addi.d a0, a0, 8 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + free_space 256+256 +endfunc + +function inv_txfm_add_adst_dct_16x16_8bpc_lsx + malloc_space 256+256 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + addi.d t2, a2, 16 + addi.d t1, t1, 256 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr23, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 + vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, 
vr28, vr29, vr30 + + dct_8x16_core_lsx + + alsl.d t2, a1, a0, 1 + vld vr4, sp, 64 + vld vr5, sp, 80 + vld vr6, sp, 96 + vld vr7, sp, 112 + VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 128 + vld vr5, sp, 144 + vld vr6, sp, 160 + vld vr7, sp, 176 + VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 320 + vld vr5, sp, 336 + vld vr6, sp, 352 + vld vr7, sp, 368 + VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 384 + vld vr5, sp, 400 + vld vr6, sp, 416 + vld vr7, sp, 432 + VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4 + + free_space 256+256 +endfunc + +function inv_txfm_add_dct_adst_16x16_8bpc_lsx + malloc_space 256+256 + + vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + addi.d a0, a0, 8 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr14, vr18, vr2, vr5, 
vr7, vr4, vr8, vr10 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15 + + free_space 256+256 +endfunc + +const shufb + .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +endconst + +function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx + malloc_space 256+256 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + addi.d t2, a2, 16 + addi.d t1, t1, 256 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx transpose8x8, 2, vst_x16 + + vreplgr2vr.h vr23, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr23, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + la.local t0, shufb + vld vr0, t0, 0 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vshuf.b \i, \i, \i, vr0 +.endr + + vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 + vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + la.local t0, shufb + vld vr0, t0, 0 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vshuf.b \i, \i, \i, vr0 +.endr + + alsl.d t2, a1, a0, 1 + vld vr4, sp, 64 + vld vr5, sp, 80 + vld vr6, sp, 96 + vld vr7, sp, 112 + VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 128 + vld vr5, sp, 144 + vld vr6, sp, 160 + vld vr7, sp, 176 + VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 320 + vld vr5, sp, 336 + vld vr6, sp, 352 + vld vr7, sp, 368 + VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4 + + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + vld vr4, sp, 384 + vld vr5, sp, 400 + vld vr6, sp, 416 + vld vr7, sp, 432 + VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4 + + free_space 256+256 +endfunc + +function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx + malloc_space 256+256 + + vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + 
dct_8x16_core_lsx + + LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + +.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + vsrari.h \i, \i, 2 +.endr + + vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + // out0 out1 out2 out3 out4 out5 out6 out7 + // vr14 vr18 vr2 vr5 vr7 vr4 vr8 vr10 + // out8 out9 out10 out11 out12 out13 out14 out15 + // vr1 vr0 vr19 vr17 vr3 vr9 vr13 vr15 + + la.local t0, shufb + vld vr31, t0, 0 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14 + + addi.d t2, sp, 64+128 + + vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + + adst16_core_lsx , , + + addi.d a0, a0, 8 + + la.local t0, shufb + vld vr31, t0, 0 + + addi.d t2, a0, 0 + alsl.d t3, a1, a0, 1 + addi.d t4, a0, 0 + add.d t5, a1, a0 + + adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1 + + alsl.d t2, a1, t2, 2 + alsl.d t3, a1, t3, 2 + + alsl.d t4, a1, t4, 1 + alsl.d t5, a1, t5, 1 + + adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14 + + free_space 256+256 + +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_8x32 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr5, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 + vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift + vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 + alsl.d t2, a1, a0, 1 + vmadd.w vr5, vr2, vr0 + vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 + vssrarni.h.w vr5, vr5, 12 + vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 + + DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 + +.rept 7 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, a0, 1 + + VLD_DST_ADD_W8 vr5, vr5, vr5, vr5 +.endr + + b .DCT_DCT_8X32_END + +.NO_HAS_DCONLY_8x32: + malloc_space 512 + + vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + 
dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2 + +.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + vsrari.h \i, \i, 2 +.endr + + LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vreplgr2vr.h vr31, zero + +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \ + 464, 480, 496 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + addi.d t3, sp, 64 + + vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + // in1 in3 in5 in7 in9 in11 in13 in15 + // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + // in17 in19 in21 in23 in25 in27 in29 in31 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 64 // 201 + vldrepl.w vr21, t0, 68 // 4091 + + vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 + vssrarni.h.w vr9, vr8, 12 // t31a + vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 + vssrarni.h.w vr10, vr11, 12 // t16a + + vldrepl.w vr20, t0, 72 // 3035 + vldrepl.w vr21, t0, 76 // 2751 + vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0 + vssrarni.h.w vr0, vr11, 12 // t30a + vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 + vssrarni.h.w vr30, vr11, 12 // t17a + + vldrepl.w vr20, t0, 80 // 1751 + vldrepl.w vr21, t0, 84 // 3703 + vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 + vssrarni.h.w vr7, vr8, 12 // t29a + vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19 + vssrarni.h.w vr19, vr8, 12 // t18a + + vldrepl.w vr20, t0, 88 // 3857 + vldrepl.w vr21, t0, 92 // 1380 + vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 + vssrarni.h.w vr4, vr8, 12 // t28a + vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26 + vssrarni.h.w vr26, vr8, 12 // t19a + + vldrepl.w vr20, t0, 96 // 995 + vldrepl.w vr21, t0, 100 // 3973 + vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 + vssrarni.h.w vr3, vr8, 12 // t27a + vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27 + vssrarni.h.w vr27, vr8, 12 // t20a + + 
vldrepl.w vr20, t0, 104 // 3513 + vldrepl.w vr21, t0, 108 // 2106 + vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 + vssrarni.h.w vr2, vr8, 12 // t26a + vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28 + vssrarni.h.w vr28, vr8, 12 // t21a + + vldrepl.w vr20, t0, 112 // 2440 -> 1220 + vldrepl.w vr21, t0, 116 // 3290 -> 1645 + vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 + vssrarni.h.w vr5, vr8, 12 // t25a + vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25 + vssrarni.h.w vr25, vr8, 12 // t22a + + vldrepl.w vr20, t0, 120 // 4052 + vldrepl.w vr21, t0, 124 // 601 + vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 + vssrarni.h.w vr6, vr8, 12 // t24a + vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24 + vssrarni.h.w vr24, vr8, 12 // t23a + + vsadd.h vr1, vr10, vr30 // t16 + vssub.h vr29, vr10, vr30 // t17 + vssub.h vr8, vr26, vr19 // t18 + vsadd.h vr31, vr26, vr19 // t19 + vsadd.h vr10, vr27, vr28 // t20 + vssub.h vr30, vr27, vr28 // t21 + vssub.h vr19, vr24, vr25 // t22 + vsadd.h vr26, vr24, vr25 // t23 + vsadd.h vr27, vr6, vr5 // t24 + vssub.h vr28, vr6, vr5 // t25 + vssub.h vr24, vr3, vr2 // t26 + vsadd.h vr25, vr3, vr2 // t27 + vsadd.h vr5, vr4, vr7 // t28 + vssub.h vr6, vr4, vr7 // t29 + vssub.h vr2, vr9, vr0 // t30 + vsadd.h vr3, vr9, vr0 // t31 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 + vssrarni.h.w vr7, vr4, 12 // t30a + vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0 + vssrarni.h.w vr0, vr4, 12 // t17a + vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 + vneg.w vr4, vr4 + vneg.w vr9, vr9 + vssrarni.h.w vr9, vr4, 12 // t18a + vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2 + vssrarni.h.w vr2, vr4, 12 // t29a + + vldrepl.w vr20, t0, 24 // 3406 -> 1703 + vldrepl.w vr21, t0, 28 // 2276 -> 1138 + vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 + vssrarni.h.w vr29, vr4, 12 // t26a + vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6 + vssrarni.h.w vr6, vr4, 12 // t21a + + vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 + vneg.w vr4, vr4 + vneg.w vr8, vr8 + vssrarni.h.w vr8, vr4, 12 // t22a + vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24 + vssrarni.h.w vr24, vr4, 12 // t25a + + vsadd.h vr4, vr1, vr31 // t16a + vssub.h vr30, vr1, vr31 // t19a + vsadd.h vr19, vr0, vr9 // t17 + vssub.h vr28, vr0, vr9 // t18 + vssub.h vr1, vr26, vr10 // t20a + vsadd.h vr31, vr26, vr10 // t23a + vssub.h vr0, vr8, vr6 // t21 + vsadd.h vr9, vr8, vr6 // t22 + vsadd.h vr10, vr27, vr25 // t24a + vssub.h vr26, vr27, vr25 // t27a + vsadd.h vr6, vr24, vr29 // t25 + vssub.h vr8, vr24, vr29 // t26 + vssub.h vr25, vr3, vr5 // t28a + vsadd.h vr27, vr3, vr5 // t31a + vssub.h vr24, vr7, vr2 // t29 + vsadd.h vr29, vr7, vr2 // t30 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 + vssrarni.h.w vr5, vr3, 12 // t29a + vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2 + vssrarni.h.w vr2, vr3, 12 // 18a + + vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 + vssrarni.h.w vr7, vr3, 12 // t28 + vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24 + vssrarni.h.w vr24, vr3, 12 // t19 + + vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 + vneg.w vr3, vr3 + vneg.w vr28, vr28 + vssrarni.h.w vr28, vr3, 12 // t20 + vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25 + vssrarni.h.w vr25, vr3, 12 // t27 + + vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 + vneg.w vr3, vr3 + vneg.w vr30, vr30 + vssrarni.h.w vr30, vr3, 12 // t21a + vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1 + vssrarni.h.w vr1, vr3, 12 // t26a + + vsadd.h vr3, vr4, vr31 // t16 + vssub.h vr26, vr4, vr31 // 
t23 + vsadd.h vr0, vr19, vr9 // t17a + vssub.h vr8, vr19, vr9 // t22a + vsadd.h vr4, vr2, vr30 // t18 + vssub.h vr31, vr2, vr30 // t21 + vsadd.h vr9, vr24, vr28 // t19a + vssub.h vr19, vr24, vr28 // t20a + vssub.h vr2, vr27, vr10 // t24 + vsadd.h vr30, vr27, vr10 // t31 + vssub.h vr24, vr29, vr6 // t25a + vsadd.h vr28, vr29, vr6 // t30a + vssub.h vr10, vr5, vr1 // t26 + vsadd.h vr27, vr5, vr1 // t29 + vssub.h vr6, vr7, vr25 // t27a + vsadd.h vr29, vr7, vr25 // t28a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 + vssrarni.h.w vr5, vr1, 12 // t20 + vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7 + vssrarni.h.w vr7, vr1, 12 // t27 + + vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 + vssrarni.h.w vr25, vr1, 12 // t21a + vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6 + vssrarni.h.w vr6, vr1, 12 // t26a + + vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 + vssrarni.h.w vr19, vr1, 12 // t22 + vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10 + vssrarni.h.w vr10, vr1, 12 // t25 + + vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 + vssrarni.h.w vr31, vr1, 12 // t23a + vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8 + vssrarni.h.w vr8, vr1, 12 // t24a + + // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 + // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 + + vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr30 // c[0] + vssub.h vr2, vr11, vr30 // c[31] + vsadd.h vr24, vr12, vr28 // c[1] + vssub.h vr26, vr12, vr28 // c[30] + vsadd.h vr11, vr13, vr27 // c[2] + vssub.h vr30, vr13, vr27 // c[29] + vsadd.h vr12, vr14, vr29 // c[3] + vssub.h vr28, vr14, vr29 // c[28] + vsadd.h vr13, vr15, vr7 // c[4] + vssub.h vr27, vr15, vr7 // c[27] + vsadd.h vr14, vr16, vr6 // c[5] + vssub.h vr29, vr16, vr6 // c[26] + vsadd.h vr7, vr17, vr10 // c[6] + vssub.h vr15, vr17, vr10 // c[25] + vsadd.h vr6, vr18, vr8 // c[7] + vssub.h vr16, vr18, vr8 // c[24] + +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, 4 +.endr + + vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + vld_x8 t3, 256, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr31 // c[8] + vssub.h vr2, vr11, vr31 // c[23] + vsadd.h vr24, vr12, vr19 // c[9] + vssub.h vr26, vr12, vr19 // c[22] + vsadd.h vr11, vr13, vr25 // c[10] + vssub.h vr30, vr13, vr25 // c[21] + vsadd.h vr12, vr14, vr5 // c[11] + vssub.h vr28, vr14, vr5 // c[20] + vsadd.h vr13, vr15, vr9 // c[12] + vssub.h vr27, vr15, vr9 // c[19] + vsadd.h vr14, vr16, vr4 // c[13] + vssub.h vr29, vr16, vr4 // c[18] + vsadd.h vr7, vr17, vr0 // c[14] + vssub.h vr15, vr17, vr0 // c[17] + vsadd.h vr6, vr18, vr3 // c[15] + vssub.h vr16, vr18, vr3 // c[16] + +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, 4 +.endr + + vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + alsl.d t2, a1, a0, 1 + addi.d t3, sp, 64 + + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+256 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, 
t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, t3, 64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+384 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, t3, 64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, sp, 64+128 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + addi.d t3, t3, 64 + alsl.d a0, a1, a0, 2 + alsl.d t2, a1, t2, 2 + vld vr4, t3, 0 + vld vr5, t3, 16 + vld vr6, t3, 32 + vld vr7, t3, 48 + VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 + + free_space 512 +.DCT_DCT_8X32_END: +endfunc + +.macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \ + vst_start3, transpose8x8, shift + + // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + // in1 in3 in5 in7 in9 in11 in13 in15 + // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + // in17 in19 in21 in23 in25 in27 in29 in31 + + la.local t0, idct_coeffs + vldrepl.w vr20, t0, 64 // 201 + vldrepl.w vr21, t0, 68 // 4091 + + vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 + vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 + vssrarni.h.w vr9, vr8, 12 // t31a + vssrarni.h.w vr10, vr11, 12 // t16a + + vldrepl.w vr20, t0, 72 // 3035 + vldrepl.w vr21, t0, 76 // 2751 + vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0 + vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 + vssrarni.h.w vr0, vr8, 12 // t30a + vssrarni.h.w vr30, vr11, 12 // t17a + + vldrepl.w vr20, t0, 80 // 1751 + vldrepl.w vr21, t0, 84 // 3703 + vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 + vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19 + vssrarni.h.w vr7, vr8, 12 // t29a + vssrarni.h.w vr19, vr11, 12 // t18a + + vldrepl.w vr20, t0, 88 // 3857 + vldrepl.w vr21, t0, 92 // 1380 + vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 + vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26 + vssrarni.h.w vr4, vr8, 12 // t28a + vssrarni.h.w vr26, vr11, 12 // t19a + + vldrepl.w vr20, t0, 96 // 995 + vldrepl.w vr21, t0, 100 // 3973 + vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 + vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 + vssrarni.h.w vr3, vr8, 12 // t27a + vssrarni.h.w vr27, vr11, 12 // t20a + + vldrepl.w vr20, t0, 104 // 3513 + vldrepl.w vr21, t0, 108 // 2106 + vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 + vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 + vssrarni.h.w vr2, vr8, 12 // t26a + vssrarni.h.w vr28, vr11, 12 // t21a + + vldrepl.w vr20, t0, 112 // 2440 -> 1220 + vldrepl.w vr21, t0, 116 // 3290 -> 1645 + vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 + vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 + vssrarni.h.w vr5, vr8, 12 // t25a + vssrarni.h.w vr25, vr11, 12 // t22a + + vldrepl.w vr20, t0, 120 // 4052 + vldrepl.w vr21, t0, 124 // 601 + vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 + vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 + vssrarni.h.w vr6, vr8, 12 // t24a + vssrarni.h.w vr24, vr11, 12 // t23a + + vsadd.h vr1, vr10, vr30 // t16 + vssub.h vr29, vr10, vr30 // t17 + vssub.h vr8, vr26, vr19 // t18 + vsadd.h vr31, vr26, vr19 // t19 + vsadd.h vr10, vr27, vr28 // t20 + vssub.h vr30, vr27, vr28 // t21 + vssub.h vr19, vr24, vr25 // t22 + vsadd.h vr26, vr24, vr25 // t23 + vsadd.h 
vr27, vr6, vr5 // t24 + vssub.h vr28, vr6, vr5 // t25 + vssub.h vr24, vr3, vr2 // t26 + vsadd.h vr25, vr3, vr2 // t27 + vsadd.h vr5, vr4, vr7 // t28 + vssub.h vr6, vr4, vr7 // t29 + vssub.h vr2, vr9, vr0 // t30 + vsadd.h vr3, vr9, vr0 // t31 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 + vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 + vssrarni.h.w vr7, vr4, 12 // t30a + vssrarni.h.w vr0, vr11, 12 // t17a + vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 + vneg.w vr4, vr4 + vneg.w vr9, vr9 + vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 + vssrarni.h.w vr9, vr4, 12 // t18a + vssrarni.h.w vr2, vr11, 12 // t29a + + vldrepl.w vr20, t0, 24 // 3406 -> 1703 + vldrepl.w vr21, t0, 28 // 2276 -> 1138 + vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 + vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 + vssrarni.h.w vr29, vr4, 12 // t26a + vssrarni.h.w vr6, vr11, 12 // t21a + + vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 + vneg.w vr4, vr4 + vneg.w vr8, vr8 + vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 + vssrarni.h.w vr8, vr4, 12 // t22a + vssrarni.h.w vr24, vr11, 12 // t25a + + vsadd.h vr4, vr1, vr31 // t16a + vssub.h vr30, vr1, vr31 // t19a + vsadd.h vr19, vr0, vr9 // t17 + vssub.h vr28, vr0, vr9 // t18 + vssub.h vr1, vr26, vr10 // t20a + vsadd.h vr31, vr26, vr10 // t23a + vssub.h vr0, vr8, vr6 // t21 + vsadd.h vr9, vr8, vr6 // t22 + vsadd.h vr10, vr27, vr25 // t24a + vssub.h vr26, vr27, vr25 // t27a + vsadd.h vr6, vr24, vr29 // t25 + vssub.h vr8, vr24, vr29 // t26 + vssub.h vr25, vr3, vr5 // t28a + vsadd.h vr27, vr3, vr5 // t31a + vssub.h vr24, vr7, vr2 // t29 + vsadd.h vr29, vr7, vr2 // t30 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 + vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 + vssrarni.h.w vr5, vr3, 12 // t29a + vssrarni.h.w vr2, vr11, 12 // 18a + + vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 + vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 + vssrarni.h.w vr7, vr3, 12 // t28 + vssrarni.h.w vr24, vr11, 12 // t19 + + vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 + vneg.w vr3, vr3 + vneg.w vr28, vr28 + vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 + vssrarni.h.w vr28, vr3, 12 // t20 + vssrarni.h.w vr25, vr11, 12 // t27 + + vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 + vneg.w vr3, vr3 + vneg.w vr30, vr30 + vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 + vssrarni.h.w vr30, vr3, 12 // t21a + vssrarni.h.w vr1, vr11, 12 // t26a + + vsadd.h vr3, vr4, vr31 // t16 + vssub.h vr26, vr4, vr31 // t23 + vsadd.h vr0, vr19, vr9 // t17a + vssub.h vr8, vr19, vr9 // t22a + vsadd.h vr4, vr2, vr30 // t18 + vssub.h vr31, vr2, vr30 // t21 + vsadd.h vr9, vr24, vr28 // t19a + vssub.h vr19, vr24, vr28 // t20a + vssub.h vr2, vr27, vr10 // t24 + vsadd.h vr30, vr27, vr10 // t31 + vssub.h vr24, vr29, vr6 // t25a + vsadd.h vr28, vr29, vr6 // t30a + vssub.h vr10, vr5, vr1 // t26 + vsadd.h vr27, vr5, vr1 // t29 + vssub.h vr6, vr7, vr25 // t27a + vsadd.h vr29, vr7, vr25 // t28a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 + vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 + vssrarni.h.w vr5, vr1, 12 // t20 + vssrarni.h.w vr7, vr11, 12 // t27 + + vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 + vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 + vssrarni.h.w vr25, vr1, 12 // t21a + vssrarni.h.w vr6, vr11, 12 // t26a + + vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 + vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 + vssrarni.h.w vr19, vr1, 12 // t22 + 
vssrarni.h.w vr10, vr11, 12 // t25 + + vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 + vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 + vssrarni.h.w vr31, vr1, 12 // t23a + vssrarni.h.w vr8, vr11, 12 // t24a + + // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 + // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 + + vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr30 // c[0] + vssub.h vr2, vr11, vr30 // c[31] + vsadd.h vr24, vr12, vr28 // c[1] + vssub.h vr26, vr12, vr28 // c[30] + vsadd.h vr11, vr13, vr27 // c[2] + vssub.h vr30, vr13, vr27 // c[29] + vsadd.h vr12, vr14, vr29 // c[3] + vssub.h vr28, vr14, vr29 // c[28] + vsadd.h vr13, vr15, vr7 // c[4] + vssub.h vr27, vr15, vr7 // c[27] + vsadd.h vr14, vr16, vr6 // c[5] + vssub.h vr29, vr16, vr6 // c[26] + vsadd.h vr7, vr17, vr10 // c[6] + vssub.h vr15, vr17, vr10 // c[25] + vsadd.h vr6, vr18, vr8 // c[7] + vssub.h vr16, vr18, vr8 // c[24] + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr31 // c[8] + vssub.h vr2, vr11, vr31 // c[23] + vsadd.h vr24, vr12, vr19 // c[9] + vssub.h vr26, vr12, vr19 // c[22] + vsadd.h vr11, vr13, vr25 // c[10] + vssub.h vr30, vr13, vr25 // c[21] + vsadd.h vr12, vr14, vr5 // c[11] + vssub.h vr28, vr14, vr5 // c[20] + vsadd.h vr13, vr15, vr9 // c[12] + vssub.h vr27, vr15, vr9 // c[19] + vsadd.h vr14, vr16, vr4 // c[13] + vssub.h vr29, vr16, vr4 // c[18] + vsadd.h vr7, vr17, vr0 // c[14] + vssub.h vr15, vr17, vr0 // c[17] + vsadd.h vr6, vr18, vr3 // c[15] + vssub.h vr16, vr18, vr3 // c[16] + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ + vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 +.endif + +.ifnb \shift +.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 +.endm + +function inv_txfm_add_dct_dct_32x32_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_32x32 + + ld.h t2, a2, 0 // dc + vldi vr0, 0x8b5 // 181 + vreplgr2vr.w vr1, t2 + vldi vr20, 0x880 // 128 + vmul.w vr2, vr0, vr1 // dc * 181 + st.h zero, a2, 0 + add.d t0, a0, a1 + vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 + vld vr3, t0, 16 + vsrari.w vr2, vr2, 2 // (dc + rnd) >> shift + 
vld vr1, a0, 16 + vmadd.w vr20, vr2, vr0 + vld vr2, t0, 0 + vssrarni.h.w vr20, vr20, 12 + vld vr0, a0, 0 + + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, t0, 0 + vst vr15, t0, 16 + +.rept 15 + alsl.d a0, a1, a0, 1 + add.d t0, a0, a1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, t0, 0 + vld vr3, t0, 16 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, t0, 0 + vst vr15, t0, 16 +.endr + + b .DCT_DCT_32X32_END +.NO_HAS_DCONLY_32x32: + + malloc_space 2560 // 32*32*2+512 + + addi.d t1, sp, 64 + addi.d t2, a2, 0 + addi.d t3, sp, 1024 + addi.d t3, t3, 1024 + addi.d t3, t3, 64 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 + +.rept 3 + addi.d t2, t2, 16 + addi.d t1, t1, 512 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2 +.endr + + vreplgr2vr.h vr31, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 + vst vr31, a2, \i +.endr + + addi.d t2, sp, 64 + addi.d t1, sp, 64 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + 
vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 + +.rept 3 + addi.d t2, t2, 16 + addi.d t1, t1, 16 + + vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x16_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ + vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 + + dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4 +.endr + + addi.d t2, sp, 64 + +.rept 16 + add.d t0, a0, a1 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, t0, 0 + vld vr3, t0, 16 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 + vadd.h vr8, vr4, vr8 + vadd.h vr9, vr0, vr9 + vadd.h vr10, vr5, vr10 + vadd.h vr11, vr1, vr11 + vadd.h vr12, vr6, vr12 + vadd.h vr13, vr2, vr13 + vadd.h vr14, vr7, vr14 + vadd.h vr15, vr3, vr15 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, t0, 0 + vst vr15, t0, 16 + + alsl.d a0, a1, a0, 1 + addi.d t2, t2, 128 +.endr + + free_space 2560 // 32*32*2+512 + +.DCT_DCT_32X32_END: +endfunc + +.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 + + // in0 in1 in2 in3 + // dct4 in0 in2 + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vsllwil.w.h vr22, \in2, 0 + vexth.w.h vr23, \in2 + vmul.w vr8, vr22, vr20 + vmul.w vr10, vr23, vr20 + vmul.w \in2, vr22, vr21 + vmul.w vr9, vr23, vr21 + vssrarni.h.w vr10, vr8, 12 // t2 + vssrarni.h.w vr9, \in2, 12 // t3 + + vldrepl.w vr20, t0, 0 // 2896 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w vr8, vr22, vr20 + vmul.w \in2, vr23, vr20 + vssrarni.h.w \in2, vr8, 12 + + vsadd.h vr8, \in2, vr9 // c[0] + vssub.h vr9, \in2, vr9 // c[3] + vsadd.h \in0, \in2, vr10 // c[1] + vssub.h vr10, \in2, vr10 // c[2] + + // inv_dct8_1d_internal_c tx64 + // in1 in3 + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + + vsllwil.w.h vr22, \in1, 0 + vexth.w.h vr23, \in1 + vmul.w \in2, vr22, vr21 + vmul.w \in4, vr23, vr21 + vmul.w \in1, vr22, vr20 + vmul.w \in6, vr23, vr20 + vssrarni.h.w \in4, \in2, 12 // t7a + vssrarni.h.w \in6, \in1, 12 // t4a + + vldrepl.w vr20, t0, 24 // 3406 + vldrepl.w vr21, t0, 28 // 2276 + + vsllwil.w.h vr22, \in3, 0 + vexth.w.h vr23, \in3 + vneg.w vr21, vr21 + vmul.w \in2, vr22, vr20 + vmul.w \in1, vr23, vr20 + vmul.w \in3, vr22, vr21 + vmul.w \in7, vr23, vr21 + vssrarni.h.w \in1, \in2, 12 // t6a + vssrarni.h.w \in7, \in3, 12 // t5a + + vsadd.h \in3, \in6, \in7 // t4 + vssub.h \in6, \in6, \in7 // t5a + vsadd.h \in5, \in4, \in1 // t7 + vssub.h \in4, \in4, \in1 // t6a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1 + vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 + vssrarni.h.w \in1, vr21, 12 // t6 + vssrarni.h.w \in7, \in2, 12 // t5 + + vsadd.h \out0, vr8, \in5 // c[0] + vssub.h \out7, vr8, \in5 // c[7] + vsadd.h \out1, \in0, \in1 // c[1] + vssub.h \out6, \in0, \in1 // c[6] + vsadd.h \out2, vr10, 
\in7 // c[2] + vssub.h \out5, vr10, \in7 // c[5] + vsadd.h \out3, vr9, \in3 // c[3] + vssub.h \out4, vr9, \in3 // c[4] +.endm + +.macro dct_8x16_tx64_core_lsx + dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \ + vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + // in1 in3 in5 in7 in9 in11 in13 in15 + // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30 + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 32 // 401 + vldrepl.w vr21, t0, 36 // 4076 + vsllwil.w.h vr22, vr1, 0 + vexth.w.h vr23, vr1 + vmul.w vr0, vr22, vr21 + vmul.w vr10, vr23, vr21 + vmul.w vr1, vr22, vr20 + vmul.w vr29, vr23, vr20 + vssrarni.h.w vr10, vr0, 12 // t15a + vssrarni.h.w vr29, vr1, 12 // t8a + + vldrepl.w vr20, t0, 40 // 3166 -> 1583 + vldrepl.w vr21, t0, 44 // 2598 -> 1299 + vsllwil.w.h vr22, vr7, 0 + vexth.w.h vr23, vr7 + vneg.w vr21, vr21 + vmul.w vr0, vr22, vr20 + vmul.w vr30, vr23, vr20 + vmul.w vr7, vr22, vr21 + vmul.w vr31, vr23, vr21 + vssrarni.h.w vr30, vr0, 12 // t14a + vssrarni.h.w vr31, vr7, 12 // t9a + + vldrepl.w vr20, t0, 48 // 1931 + vldrepl.w vr21, t0, 52 // 3612 + vsllwil.w.h vr22, vr5, 0 + vexth.w.h vr23, vr5 + vmul.w vr0, vr22, vr21 + vmul.w vr24, vr23, vr21 + vmul.w vr5, vr22, vr20 + vmul.w vr25, vr23, vr20 + vssrarni.h.w vr24, vr0, 12 // t13a + vssrarni.h.w vr25, vr5, 12 // t10a + + vldrepl.w vr20, t0, 56 // 3920 + vldrepl.w vr21, t0, 60 // 1189 + vsllwil.w.h vr22, vr3, 0 + vexth.w.h vr23, vr3 + vneg.w vr21, vr21 + vmul.w vr0, vr22, vr20 + vmul.w vr26, vr23, vr20 + vmul.w vr3, vr22, vr21 + vmul.w vr27, vr23, vr21 + vssrarni.h.w vr26, vr0, 12 // t12a + vssrarni.h.w vr27, vr3, 12 // t11a + + // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 + vsadd.h vr28, vr29, vr31 // t8 + vssub.h vr19, vr29, vr31 // t9 + vssub.h vr29, vr27, vr25 // t10 + vsadd.h vr9, vr27, vr25 // t11 + vsadd.h vr31, vr26, vr24 // t12 + vssub.h vr25, vr26, vr24 // t13 + vssub.h vr27, vr10, vr30 // t14 + vsadd.h vr24, vr10, vr30 // t15 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 + vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 + vssrarni.h.w vr26, vr0, 12 // t14a + vssrarni.h.w vr30, vr1, 12 // t9a + + vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 + vneg.w vr0, vr0 + vneg.w vr19, vr19 + vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 + vssrarni.h.w vr19, vr0, 12 // t10a + vssrarni.h.w vr27, vr1, 12 // t13a + + vsadd.h vr25, vr28, vr9 // t8a + vssub.h vr29, vr28, vr9 // t11a + vssub.h vr28, vr24, vr31 // t12a + vsadd.h vr10, vr24, vr31 // t15a + vsadd.h vr9, vr30, vr19 // t9 + vssub.h vr31, vr30, vr19 // t10 + vssub.h vr30, vr26, vr27 // t13 + vsadd.h vr24, vr26, vr27 // t14 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 + vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 + vssrarni.h.w vr26, vr0, 12 // t13a + vssrarni.h.w vr27, vr1, 12 // t10a + + vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 + vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 + vssrarni.h.w vr31, vr0, 12 // t12 + vssrarni.h.w vr30, vr1, 12 // t11 + + // vr11 vr12 ... 
vr18 + vsadd.h vr28, vr14, vr31 // c[3] + vssub.h vr29, vr14, vr31 // c[12] + vsadd.h vr20, vr15, vr30 // c[4] + vssub.h vr21, vr15, vr30 // c[11] + vsadd.h vr14, vr16, vr27 // c[5] + vssub.h vr23, vr16, vr27 // c[10] + vsadd.h vr15, vr17, vr9 // c[6] + vssub.h vr30, vr17, vr9 // c[9] + vsadd.h vr16, vr18, vr25 // c[7] + vssub.h vr27, vr18, vr25 // c[8] + vsadd.h vr17, vr13, vr26 // c[2] + vssub.h vr26, vr13, vr26 // c[13] + vsadd.h vr18, vr12, vr24 // c[1] + vssub.h vr25, vr12, vr24 // c[14] + vsadd.h vr22, vr11, vr10 // c[0] + vssub.h vr24, vr11, vr10 // c[15] +.endm // dct_8x16_tx64_core_lsx + +.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 + vsllwil.w.h vr22, \in0, 0 + vexth.w.h vr23, \in0 + vmul.w \tmp0, vr22, \in1 + vmul.w \out0, vr23, \in1 + vmul.w \tmp1, vr22, \in2 + vmul.w \out1, vr23, \in2 + vssrarni.h.w \out0, \tmp0, 12 + vssrarni.h.w \out1, \tmp1, 12 +.endm + +const idct64_coeffs, align=4 + .word 101, 4095, 2967, -2824 + .word 1660, 3745, 3822, -1474 + .word 4076, 401, 4017, 799 + + .word 4036, -700, 2359, 3349 + .word 3461, -2191, 897, 3996 + .word -3166, -2598, -799, -4017 + + .word 501, 4065, 3229, -2520 + .word 2019, 3564, 3948, -1092 + .word 3612, 1931, 2276, 3406 + + .word 4085, -301, 2675, 3102 + .word 3659, -1842, 1285, 3889 + .word -3920, -1189, -3406, -2276 +endconst + +// in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a +// in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a +// in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a +// in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + +.macro dct64_step1_lsx + + vldrepl.w vr20, t0, 0 // 101 + vldrepl.w vr21, t0, 4 // 4095 + vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a + + vldrepl.w vr20, t0, 8 // 2967 + vldrepl.w vr21, t0, 12 // -2824 + vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a + + vldrepl.w vr20, t0, 16 // 1660 + vldrepl.w vr21, t0, 20 // 3745 + vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a + + vldrepl.w vr20, t0, 24 // 3822 + vldrepl.w vr21, t0, 28 // -1474 + vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a + + vsadd.h vr0, vr8, vr11 // t32 + vssub.h vr1, vr8, vr11 // t33 + vssub.h vr2, vr15, vr12 // t34 + vsadd.h vr3, vr15, vr12 // t35 + vsadd.h vr4, vr14, vr13 // t60 + vssub.h vr5, vr14, vr13 // t61 + vssub.h vr6, vr9, vr10 // t62 + vsadd.h vr7, vr9, vr10 // t63 + + vldrepl.w vr20, t0, 32 // 4076 + vldrepl.w vr21, t0, 36 // 401 + vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 + vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 + vssrarni.h.w vr10, vr9, 12 // t62a + vssrarni.h.w vr11, vr13, 12 // t33a + + vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 + vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 + vneg.w vr9, vr9 + vneg.w vr1, vr1 + vssrarni.h.w vr6, vr13, 12 // t61a + vssrarni.h.w vr1, vr9, 12 // t34a + + vsadd.h vr2, vr0, vr3 // t32a + vssub.h vr5, vr0, vr3 // t35a + vsadd.h vr9, vr11, vr1 // t33 + vssub.h vr13, vr11, vr1 // t34 + vssub.h vr0, vr7, vr4 // t60a + vsadd.h vr3, vr7, vr4 // t63a + vssub.h vr1, vr10, vr6 // t61 + vsadd.h vr11, vr10, vr6 // t62 + + vldrepl.w vr20, t0, 40 // 4017 + vldrepl.w vr21, t0, 44 // 799 + + vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 + vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7 + vssrarni.h.w vr4, vr8, 12 // t61a + vssrarni.h.w vr7, vr12, 12 // t34a + + vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 + vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 + vssrarni.h.w vr6, vr8, 12 // t60 + vssrarni.h.w vr10, vr12, 12 // t35 + + vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, 
vr4, vr11, vr3 +.endm // dct64_step1 + + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a +.macro dct64_step2_lsx + vld vr0, t5, 0 // t32a + vld vr2, t4, 0 // t63a + vld vr3, t5, 16*8 // t56a + vld vr1, t4, 16*8 // t39a + vld vr4, t5, 16*16 // t40a + vld vr6, t4, 16*16 // t55a + vld vr7, t5, 16*24 // t48a + vld vr5, t4, 16*24 // t47a + + vsadd.h vr8, vr0, vr1 // t32 + vssub.h vr9, vr0, vr1 // t39 + vsadd.h vr10, vr2, vr3 // t63 + vssub.h vr11, vr2, vr3 // t56 + vssub.h vr12, vr5, vr4 // t40 + vsadd.h vr13, vr5, vr4 // t47 + vsadd.h vr14, vr7, vr6 // t48 + vssub.h vr15, vr7, vr6 // t55 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 + vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 + vssrarni.h.w vr2, vr0, 12 // t56a + vssrarni.h.w vr3, vr1, 12 // t39a + + vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 + vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 + vneg.w vr0, vr0 + vneg.w vr4, vr4 + vssrarni.h.w vr5, vr1, 12 // t55a + vssrarni.h.w vr4, vr0, 12 // t40a + + vsadd.h vr9, vr8, vr13 // t32a + vssub.h vr11, vr8, vr13 // t47a + vsadd.h vr6, vr3, vr4 // t39 + vssub.h vr7, vr3, vr4 // t40 + vssub.h vr12, vr10, vr14 // t48a + vsadd.h vr15, vr10, vr14 // t63a + vssub.h vr0, vr2, vr5 // t55 + vsadd.h vr1, vr2, vr5 // t56 + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 + vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 + vssrarni.h.w vr13, vr8, 12 // t40a + vssrarni.h.w vr4, vr3, 12 // t55a + vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 + vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 + vssrarni.h.w vr10, vr8, 12 // t47 + vssrarni.h.w vr14, vr3, 12 // t48 + + // t32a t39 t40a t47 t48 t55a t56 t63a + // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 + vst vr9, t5, 0 // t32a + vst vr6, t4, 0 // t39 + vst vr13, t5, 16*8 // t40a + vst vr10, t4, 16*8 // t47 + vst vr14, t5, 16*16 // t48 + vst vr4, t4, 16*16 // t55a + vst vr1, t5, 16*24 // t56 + vst vr15, t4, 16*24 // t63a +.endm // dct64_step2_lsx + +.macro dct64_step3_lsx + // t0 t1 t2 t3 t4 t5 t6 t7 + vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 + + vld vr9, t5, 16*24 // t56 + vld vr6, t5, 16*24+16 // t57a + vld vr13, t5, 16*24+32 // t58 + vld vr10, t5, 16*24+48 // t59a + vld vr14, t4, 16*24-48 // t60 + vld vr4, t4, 16*24-32 // t61a + vld vr1, t4, 16*24-16 // t62 + vld vr15, t4, 16*24 // t63a + + vsadd.h vr20, vr2, vr15 // c[0] + vssub.h vr21, vr2, vr15 // c[63] + vsadd.h vr22, vr3, vr1 // c[1] + vssub.h vr23, vr3, vr1 // c[62] + vsadd.h vr24, vr7, vr4 // c[2] + vssub.h vr25, vr7, vr4 // c[61] + vsadd.h vr26, vr8, vr14 // c[3] + vssub.h vr27, vr8, vr14 // c[60] + + vsadd.h vr28, vr11, vr10 // c[4] + vssub.h vr29, vr11, vr10 // c[59] + vsadd.h vr30, vr12, vr13 // c[5] + vssub.h vr31, vr12, vr13 // c[58] + vsadd.h vr2, vr16, vr6 // c[6] + vssub.h vr15, vr16, vr6 // c[57] + vsadd.h vr1, vr17, vr9 // c[7] + vssub.h vr3, vr17, vr9 // c[56] +.endm // dct64_step3_lsx + +.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 + + dct64_step3_lsx + +.ifnb \transpose8x8 + LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ + vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ + vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 + + LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ + vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ + vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 
+.endif + +.ifnb \shift +.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ + vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + vsrari.h \i, \i, \shift +.endr +.endif + + vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + +.endm // dct64_step4_lsx + +.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7 + + fld.d f4, t0, 0 + fldx.d f5, t0, a1 + fld.d f6, t6, 0 + fldx.d f7, t6, a1 + alsl.d t0, a1, t0, 2 + alsl.d t6, a1, t6, 2 + fld.d f8, t0, 0 + fldx.d f9, t0, a1 + fld.d f10, t6, 0 + fldx.d f11, t6, a1 + +.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11 + vsllwil.hu.bu \i, \i, 0 +.endr + + vsrari.h vr20, \in0, 4 + vsrari.h vr22, \in1, 4 + vsrari.h vr24, \in2, 4 + vsrari.h vr26, \in3, 4 + vsrari.h vr28, \in4, 4 + vsrari.h vr30, \in5, 4 + vsrari.h vr2, \in6, 4 + vsrari.h vr1, \in7, 4 + + vadd.h vr4, vr4, vr20 + vadd.h vr5, vr5, vr22 + vadd.h vr6, vr6, vr24 + vadd.h vr7, vr7, vr26 + vadd.h vr8, vr8, vr28 + vadd.h vr9, vr9, vr30 + vadd.h vr10, vr10, vr2 + vadd.h vr11, vr11, vr1 + + vssrani.bu.h vr5, vr4, 0 + vssrani.bu.h vr7, vr6, 0 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + + vstelm.d vr5, t1, 0, 0 + vstelm.d vr5, t2, 0, 1 + + alsl.d t1, a1, t1, 1 + alsl.d t2, a1, t2, 1 + vstelm.d vr7, t1, 0, 0 + vstelm.d vr7, t2, 0, 1 + + alsl.d t1, a1, t1, 1 + alsl.d t2, a1, t2, 1 + vstelm.d vr9, t1, 0, 0 + vstelm.d vr9, t2, 0, 1 + + alsl.d t1, a1, t1, 1 + alsl.d t2, a1, t2, 1 + vstelm.d vr11, t1, 0, 0 + vstelm.d vr11, t2, 0, 1 +.endm // dct64_step5_lsx + +.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1 + vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + dct_8x16_tx64_core_lsx + + vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ + vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 + + vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + + la.local t0, idct_coeffs + + vldrepl.w vr20, t0, 64 // 201 + vldrepl.w vr21, t0, 68 // 4091 + vsllwil.w.h vr22, vr0, 0 + vexth.w.h vr23, vr0 + vmul.w vr8, vr22, vr21 + vmul.w vr9, vr23, vr21 + vmul.w vr0, vr22, vr20 + vmul.w vr10, vr23, vr20 + vssrarni.h.w vr9, vr8, 12 // t31a + vssrarni.h.w vr10, vr0, 12 // t16a + + vldrepl.w vr20, t0, 72 // 3035 + vldrepl.w vr21, t0, 76 // 2751 + vsllwil.w.h vr22, vr7, 0 + vexth.w.h vr23, vr7 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr0, vr23, vr20 + vmul.w vr7, vr22, vr21 + vmul.w vr30, vr23, vr21 + vssrarni.h.w vr0, vr8, 12 // t30a + vssrarni.h.w vr30, vr7, 12 // t17a + + vldrepl.w vr20, t0, 80 // 1751 + vldrepl.w vr21, t0, 84 // 3703 + vsllwil.w.h vr22, vr4, 0 + vexth.w.h vr23, vr4 + vmul.w vr8, vr22, vr21 + vmul.w vr7, vr23, vr21 + vmul.w vr4, vr22, vr20 + vmul.w vr19, vr23, vr20 + vssrarni.h.w vr7, vr8, 12 // t29a + vssrarni.h.w vr19, vr4, 12 // t18a + + vldrepl.w vr20, t0, 88 // 3857 + vldrepl.w vr21, t0, 92 // 1380 + vsllwil.w.h vr22, vr3, 0 + vexth.w.h vr23, vr3 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr4, vr23, vr20 + vmul.w vr3, vr22, vr21 + vmul.w vr26, vr23, vr21 + vssrarni.h.w vr4, vr8, 12 // t28a + vssrarni.h.w vr26, vr3, 12 // t19a + + vldrepl.w vr20, t0, 96 // 995 + vldrepl.w vr21, t0, 100 // 3973 + vsllwil.w.h vr22, vr2, 0 + vexth.w.h vr23, vr2 + vmul.w vr8, vr22, vr21 + vmul.w vr3, vr23, vr21 + vmul.w vr2, vr22, vr20 + vmul.w vr27, vr23, vr20 + vssrarni.h.w vr3, vr8, 12 // t27a + vssrarni.h.w vr27, vr2, 12 // t20a + + vldrepl.w vr20, t0, 104 // 3513 + vldrepl.w vr21, t0, 108 // 2106 + 
vsllwil.w.h vr22, vr5, 0 + vexth.w.h vr23, vr5 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr2, vr23, vr20 + vmul.w vr5, vr22, vr21 + vmul.w vr28, vr23, vr21 + vssrarni.h.w vr2, vr8, 12 // t26a + vssrarni.h.w vr28, vr5, 12 // t21a + + vldrepl.w vr20, t0, 112 // 2440 -> 1220 + vldrepl.w vr21, t0, 116 // 3290 -> 1645 + vsllwil.w.h vr22, vr6, 0 + vexth.w.h vr23, vr6 + vmul.w vr8, vr22, vr21 + vmul.w vr5, vr23, vr21 + vmul.w vr6, vr22, vr20 + vmul.w vr25, vr23, vr20 + vssrarni.h.w vr5, vr8, 12 // t25a + vssrarni.h.w vr25, vr6, 12 // t22a + + vldrepl.w vr20, t0, 120 // 4052 + vldrepl.w vr21, t0, 124 // 601 + vsllwil.w.h vr22, vr1, 0 + vexth.w.h vr23, vr1 + vneg.w vr21, vr21 + vmul.w vr8, vr22, vr20 + vmul.w vr6, vr23, vr20 + vmul.w vr1, vr22, vr21 + vmul.w vr24, vr23, vr21 + vssrarni.h.w vr6, vr8, 12 // t24a + vssrarni.h.w vr24, vr1, 12 // t23a + + vsadd.h vr1, vr10, vr30 // t16 + vssub.h vr29, vr10, vr30 // t17 + vssub.h vr8, vr26, vr19 // t18 + vsadd.h vr31, vr26, vr19 // t19 + vsadd.h vr10, vr27, vr28 // t20 + vssub.h vr30, vr27, vr28 // t21 + vssub.h vr19, vr24, vr25 // t22 + vsadd.h vr26, vr24, vr25 // t23 + vsadd.h vr27, vr6, vr5 // t24 + vssub.h vr28, vr6, vr5 // t25 + vssub.h vr24, vr3, vr2 // t26 + vsadd.h vr25, vr3, vr2 // t27 + vsadd.h vr5, vr4, vr7 // t28 + vssub.h vr6, vr4, vr7 // t29 + vssub.h vr2, vr9, vr0 // t30 + vsadd.h vr3, vr9, vr0 // t31 + + vldrepl.w vr20, t0, 16 // 799 + vldrepl.w vr21, t0, 20 // 4017 + vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 + vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 + vssrarni.h.w vr7, vr4, 12 // t30a + vssrarni.h.w vr0, vr11, 12 // t17a + vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 + vneg.w vr4, vr4 + vneg.w vr9, vr9 + vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 + vssrarni.h.w vr9, vr4, 12 // t18a + vssrarni.h.w vr2, vr11, 12 // t29a + + vldrepl.w vr20, t0, 24 // 3406 -> 1703 + vldrepl.w vr21, t0, 28 // 2276 -> 1138 + vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 + vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 + vssrarni.h.w vr29, vr4, 12 // t26a + vssrarni.h.w vr6, vr11, 12 // t21a + + vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 + vneg.w vr4, vr4 + vneg.w vr8, vr8 + vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 + vssrarni.h.w vr8, vr4, 12 // t22a + vssrarni.h.w vr24, vr11, 12 // t25a + + vsadd.h vr4, vr1, vr31 // t16a + vssub.h vr30, vr1, vr31 // t19a + vsadd.h vr19, vr0, vr9 // t17 + vssub.h vr28, vr0, vr9 // t18 + vssub.h vr1, vr26, vr10 // t20a + vsadd.h vr31, vr26, vr10 // t23a + vssub.h vr0, vr8, vr6 // t21 + vsadd.h vr9, vr8, vr6 // t22 + vsadd.h vr10, vr27, vr25 // t24a + vssub.h vr26, vr27, vr25 // t27a + vsadd.h vr6, vr24, vr29 // t25 + vssub.h vr8, vr24, vr29 // t26 + vssub.h vr25, vr3, vr5 // t28a + vsadd.h vr27, vr3, vr5 // t31a + vssub.h vr24, vr7, vr2 // t29 + vsadd.h vr29, vr7, vr2 // t30 + + vldrepl.w vr20, t0, 8 // 1567 + vldrepl.w vr21, t0, 12 // 3784 + vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 + vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 + vssrarni.h.w vr5, vr3, 12 // t29a + vssrarni.h.w vr2, vr11, 12 // 18a + + vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 + vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 + vssrarni.h.w vr7, vr3, 12 // t28 + vssrarni.h.w vr24, vr11, 12 // t19 + + vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 + vneg.w vr3, vr3 + vneg.w vr28, vr28 + vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 + vssrarni.h.w vr28, vr3, 12 // t20 + vssrarni.h.w vr25, vr11, 12 // t27 + + vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 + vneg.w vr3, vr3 + vneg.w vr30, vr30 + vmul_vmsub_w vr8, vr0, vr20, 
vr21, vr11, vr1 + vssrarni.h.w vr30, vr3, 12 // t21a + vssrarni.h.w vr1, vr11, 12 // t26a + + vsadd.h vr3, vr4, vr31 // t16 + vssub.h vr26, vr4, vr31 // t23 + vsadd.h vr0, vr19, vr9 // t17a + vssub.h vr8, vr19, vr9 // t22a + vsadd.h vr4, vr2, vr30 // t18 + vssub.h vr31, vr2, vr30 // t21 + vsadd.h vr9, vr24, vr28 // t19a + vssub.h vr19, vr24, vr28 // t20a + vssub.h vr2, vr27, vr10 // t24 + vsadd.h vr30, vr27, vr10 // t31 + vssub.h vr24, vr29, vr6 // t25a + vsadd.h vr28, vr29, vr6 // t30a + vssub.h vr10, vr5, vr1 // t26 + vsadd.h vr27, vr5, vr1 // t29 + vssub.h vr6, vr7, vr25 // t27a + vsadd.h vr29, vr7, vr25 // t28a + + vldrepl.w vr20, t0, 0 // 2896 + vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 + vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 + vssrarni.h.w vr5, vr1, 12 // t20 + vssrarni.h.w vr7, vr11, 12 // t27 + + vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 + vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 + vssrarni.h.w vr25, vr1, 12 // t21a + vssrarni.h.w vr6, vr11, 12 // t26a + + vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 + vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 + vssrarni.h.w vr19, vr1, 12 // t22 + vssrarni.h.w vr10, vr11, 12 // t25 + + vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 + vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 + vssrarni.h.w vr31, vr1, 12 // t23a + vssrarni.h.w vr8, vr11, 12 // t24a + + // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 + // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 + + vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr30 // c[0] + vssub.h vr2, vr11, vr30 // c[31] + vsadd.h vr24, vr12, vr28 // c[1] + vssub.h vr26, vr12, vr28 // c[30] + vsadd.h vr11, vr13, vr27 // c[2] + vssub.h vr30, vr13, vr27 // c[29] + vsadd.h vr12, vr14, vr29 // c[3] + vssub.h vr28, vr14, vr29 // c[28] + vsadd.h vr13, vr15, vr7 // c[4] + vssub.h vr27, vr15, vr7 // c[27] + vsadd.h vr14, vr16, vr6 // c[5] + vssub.h vr29, vr16, vr6 // c[26] + vsadd.h vr7, vr17, vr10 // c[6] + vssub.h vr15, vr17, vr10 // c[25] + vsadd.h vr6, vr18, vr8 // c[7] + vssub.h vr16, vr18, vr8 // c[24] + + vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 + + vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 + + vsadd.h vr1, vr11, vr31 // c[8] + vssub.h vr2, vr11, vr31 // c[23] + vsadd.h vr24, vr12, vr19 // c[9] + vssub.h vr26, vr12, vr19 // c[22] + vsadd.h vr11, vr13, vr25 // c[10] + vssub.h vr30, vr13, vr25 // c[21] + vsadd.h vr12, vr14, vr5 // c[11] + vssub.h vr28, vr14, vr5 // c[20] + vsadd.h vr13, vr15, vr9 // c[12] + vssub.h vr27, vr15, vr9 // c[19] + vsadd.h vr14, vr16, vr4 // c[13] + vssub.h vr29, vr16, vr4 // c[18] + vsadd.h vr7, vr17, vr0 // c[14] + vssub.h vr15, vr17, vr0 // c[17] + vsadd.h vr6, vr18, vr3 // c[15] + vssub.h vr16, vr18, vr3 // c[16] + + vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 + + vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 +.endm // dct_8x32_tx64_new_lsx + +function inv_txfm_add_dct_dct_64x64_8bpc_lsx + bnez a3, .NO_HAS_DCONLY_64x64 + + ld.h t2, a2, 0 + vldi vr0, 0x8b5 + vreplgr2vr.w vr1, t2 + vldi vr20, 0x880 + vmul.w vr2, vr0, vr1 + st.h zero, a2, 0 + vsrari.w vr2, vr2, 8 + vld vr3, a0, 48 + vsrari.w vr2, vr2, 2 + vld vr1, a0, 16 + vmadd.w vr20, vr2, vr0 + vld vr2, a0, 32 + vssrarni.h.w vr20, vr20, 12 + vld vr0, a0, 0 + + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + 
vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, a0, 32 + vst vr15, a0, 48 + +.rept 63 + add.d a0, a0, a1 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr0, vr0 + vexth.hu.bu vr1, vr1 + vexth.hu.bu vr2, vr2 + vexth.hu.bu vr3, vr3 + vadd.h vr8, vr4, vr20 + vadd.h vr9, vr0, vr20 + vadd.h vr10, vr5, vr20 + vadd.h vr11, vr1, vr20 + vadd.h vr12, vr6, vr20 + vadd.h vr13, vr2, vr20 + vadd.h vr14, vr7, vr20 + vadd.h vr15, vr3, vr20 + vssrani.bu.h vr9, vr8, 0 + vssrani.bu.h vr11, vr10, 0 + vssrani.bu.h vr13, vr12, 0 + vssrani.bu.h vr15, vr14, 0 + vst vr9, a0, 0 + vst vr11, a0, 16 + vst vr13, a0, 32 + vst vr15, a0, 48 +.endr + b .DCT_DCT_64X64_END +.NO_HAS_DCONLY_64x64: + + malloc_space 64*32*2+512+512 + + addi.d t7, sp, 64 + +.macro dct64x64_core1_lsx in0, in1, in2 + addi.d t2, a2, \in0 + addi.d t7, t7, \in1 + li.w t4, 64*32*2+64 + add.d t3, sp, t4 + addi.d t6, t3, 512 + add.d t5, t6, zero + + dct_8x32_tx64_new_lsx 0, 256, 128, 256 + + la.local t0, idct64_coeffs + + addi.d t2, a2, \in2 // 32 ... + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + vld vr0, t2, 128*0 // in1 + vld vr1, t2, 128*15 // in31 + vld vr2, t2, 128*8 // in17 + vld vr3, t2, 128*7 // in15 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + vld vr0, t2, 128*3 // in7 + vld vr1, t2, 128*12 // in25 + vld vr2, t2, 128*11 // in23 + vld vr3, t2, 128*4 // in9 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + vld vr0, t2, 128*2 // in5 + vld vr1, t2, 128*13 // in27 + vld vr2, t2, 128*10 // in21 + vld vr3, t2, 128*5 // in11 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vld vr0, t2, 128*1 // in3 + vld vr1, t2, 128*14 // in29 + vld vr2, t2, 128*9 // in19 + vld vr3, t2, 128*6 // in13 + dct64_step1_lsx + + la.local t0, idct_coeffs + addi.d t4, t5, 16*7 + // t32a/t39/t40a/t47/t48/t55a/t56/t63a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t33/t38a/t41/t46a/t49a/t54/t57a/t62 + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t34a/t37/t42a/t45/t50/t53a/t58/t61a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t35/t36a/t43/t44a/t51a/t52/t59a/t60 + dct64_step2_lsx + + li.w t4, 64*32*2+64+512 + add.d t5, t4, sp + addi.d t4, t5, 16*7 + dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128 + + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128 + + addi.d t5, t5, -16*8 + addi.d t4, t4, -16*8 + addi.d t3, t3, 128 + dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128 + + addi.d t5, t5, -16*8 + addi.d t4, t4, -16*8 + addi.d t3, t3, 128 + dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128 +.endm + + dct64x64_core1_lsx 0, 0, 64 + + dct64x64_core1_lsx 16, 128*8, 64+16 + + dct64x64_core1_lsx 32, 128*8, 64+16*2 + + dct64x64_core1_lsx 48, 128*8, 64+16*3 + + vreplgr2vr.h vr31, zero +.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 
128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032 + vst vr31, a2, \i +.endr + +.macro dct64x64_core2_lsx in0, in1 + addi.d t2, sp, 64+\in0 + addi.d t7, sp, 64+\in0 + li.w t4, 64*32*2+64 + add.d t3, sp, t4 + addi.d t6, t3, 512 + add.d t5, t6, zero + + addi.d t2, t2, 1024 + addi.d t2, t2, 1024 + dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512 + + la.local t0, idct64_coeffs + + addi.d t2, sp, 64+64*2+\in0 + addi.d t4, t2, 256*7 + addi.d t4, t4, 256 + + vld vr0, t2, 256*0 // in1 + vld vr1, t4, 256*7 // in31 + vld vr2, t4, 256*0 // in17 + vld vr3, t2, 256*7 // in15 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + vld vr0, t2, 256*3 // in7 + vld vr1, t4, 256*4 // in25 + vld vr2, t4, 256*3 // in23 + vld vr3, t2, 256*4 // in9 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + vld vr0, t2, 256*2 // in5 + vld vr1, t4, 256*5 // in27 + vld vr2, t4, 256*2 // in21 + vld vr3, t2, 256*5 // in11 + dct64_step1_lsx + + addi.d t0, t0, 48 + addi.d t6, t6, 128 + vld vr0, t2, 256*1 // in3 + vld vr1, t4, 256*6 // in29 + vld vr2, t4, 256*1 // in19 + vld vr3, t2, 256*6 // in13 + dct64_step1_lsx + + la.local t0, idct_coeffs + addi.d t4, t5, 16*7 + // t32a/t39/t40a/t47/t48/t55a/t56/t63a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t33/t38a/t41/t46a/t49a/t54/t57a/t62 + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t34a/t37/t42a/t45/t50/t53a/t58/t61a + dct64_step2_lsx + + addi.d t5, t5, 16 + addi.d t4, t4, -16 + // t35/t36a/t43/t44a/t51a/t52/t59a/t60 + dct64_step2_lsx + + li.w t4, 64*32*2+64+512 + add.d t5, t4, sp + addi.d t4, t5, 16*7 + addi.d a0, a0, \in1 + // 0 - 7, 56 -63 + dct64_step3_lsx + + li.w t8, 0 + mul.w t0, t8, a1 + add.d t0, a0, t0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 56 + mul.w t0, t8, a1 + add.d t0, a0, t0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + + // 8 - 15, 48 - 55 + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step3_lsx + + li.w t8, 8 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 48 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 + + // 16 - 23, 40 - 47 + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step3_lsx + + li.w t8, 16 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 40 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, 
vr21 + + // 24 - 31, 32 - 39 + addi.d t3, t3, 128 + addi.d t4, t4, -16*8 + addi.d t5, t5, -16*8 + dct64_step3_lsx + + li.w t8, 24 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 + + li.w t8, 32 + mul.w t0, t8, a1 + add.d t0, t0, a0 + alsl.d t6, a1, t0, 1 + addi.d t1, t0, 0 + add.d t2, t0, a1 + dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 +.endm + + dct64x64_core2_lsx 16*0, 0 + + dct64x64_core2_lsx 16*1, 8 + + dct64x64_core2_lsx 16*2, 8 + + dct64x64_core2_lsx 16*3, 8 + + dct64x64_core2_lsx 16*4, 8 + + dct64x64_core2_lsx 16*5, 8 + + dct64x64_core2_lsx 16*6, 8 + + dct64x64_core2_lsx 16*7, 8 + + free_space 64*32*2+512+512 +.DCT_DCT_64X64_END: +endfunc diff --git a/third_party/dav1d/src/loongarch/itx.h b/third_party/dav1d/src/loongarch/itx.h new file mode 100644 index 0000000000..3ad444f534 --- /dev/null +++ b/third_party/dav1d/src/loongarch/itx.h @@ -0,0 +1,195 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOONGARCH_ITX_H +#define DAV1D_SRC_LOONGARCH_ITX_H + +#include "src/cpu.h" +#include "src/itx.h" + +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, 
lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx)); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx)); + +static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) { +#if BITDEPTH == 8 + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + + if (BITDEPTH != 8 ) return; + + c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx; + c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx; + + c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx; + + c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx; + c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx; + 
c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx; + + c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx; + c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx; + + c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx; + c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx; + c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx; + c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx; + + c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx; + c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx; + + c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx; + c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx; + + c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx; + + c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx; + + c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx; +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_ITX_H */ diff --git a/third_party/dav1d/src/loongarch/loongson_asm.S b/third_party/dav1d/src/loongarch/loongson_asm.S new file mode 100644 index 0000000000..a22072ba35 --- /dev/null +++ b/third_party/dav1d/src/loongarch/loongson_asm.S @@ -0,0 +1,776 @@ +/********************************************************************* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn) + * Shiyou Yin(yinshiyou-hf@loongson.cn) + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + *********************************************************************/ + +/* + * This file is a LoongArch assembly helper file and available under ISC + * license. It provides a large number of macros and alias to simplify + * writing assembly code, especially for LSX and LASX optimizations. + * + * Any one can modify it or add new features for his/her own purposes. + * Contributing a patch will be appreciated as it might be useful for + * others as well. Send patches to loongson contributor mentioned above. + * + * MAJOR version: Usage changes, incompatible with previous version. + * MINOR version: Add new macros/functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. + */ + +#define LML_VERSION_MAJOR 0 +#define LML_VERSION_MINOR 4 +#define LML_VERSION_MICRO 0 + +#define DEFAULT_ALIGN 5 + +/* Set prefix as needed. */ +#ifndef PRIVATE_PREFIX +#define PRIVATE_PREFIX dav1d_ +#endif + +#define PASTE(a,b) a ## b +#define CONCAT(a,b) PASTE(a,b) + +#ifdef PREFIX +#define ASM_PREF CONCAT(_,PRIVATE_PREFIX) +#else +#define ASM_PREF PRIVATE_PREFIX +#endif + +.macro function name, align=DEFAULT_ALIGN +.macro endfunc + jirl $r0, $r1, 0x0 + .size ASM_PREF\name, . - ASM_PREF\name + .purgem endfunc +.endm +.text ; +.align \align ; +.globl ASM_PREF\name ; +.type ASM_PREF\name, @function ; +ASM_PREF\name: ; +.endm + +.macro const name, align=DEFAULT_ALIGN + .macro endconst + .size \name, . 
- \name + .purgem endconst + .endm +.section .rodata +.align \align +\name: +.endm + +/* + *============================================================================ + * LoongArch register alias + *============================================================================ + */ + +#define a0 $a0 +#define a1 $a1 +#define a2 $a2 +#define a3 $a3 +#define a4 $a4 +#define a5 $a5 +#define a6 $a6 +#define a7 $a7 + +#define t0 $t0 +#define t1 $t1 +#define t2 $t2 +#define t3 $t3 +#define t4 $t4 +#define t5 $t5 +#define t6 $t6 +#define t7 $t7 +#define t8 $t8 + +#define s0 $s0 +#define s1 $s1 +#define s2 $s2 +#define s3 $s3 +#define s4 $s4 +#define s5 $s5 +#define s6 $s6 +#define s7 $s7 +#define s8 $s8 + +#define zero $zero +#define sp $sp +#define ra $ra + +#define fa0 $fa0 +#define fa1 $fa1 +#define fa2 $fa2 +#define fa3 $fa3 +#define fa4 $fa4 +#define fa5 $fa5 +#define fa6 $fa6 +#define fa7 $fa7 +#define ft0 $ft0 +#define ft1 $ft1 +#define ft2 $ft2 +#define ft3 $ft3 +#define ft4 $ft4 +#define ft5 $ft5 +#define ft6 $ft6 +#define ft7 $ft7 +#define ft8 $ft8 +#define ft9 $ft9 +#define ft10 $ft10 +#define ft11 $ft11 +#define ft12 $ft12 +#define ft13 $ft13 +#define ft14 $ft14 +#define ft15 $ft15 +#define fs0 $fs0 +#define fs1 $fs1 +#define fs2 $fs2 +#define fs3 $fs3 +#define fs4 $fs4 +#define fs5 $fs5 +#define fs6 $fs6 +#define fs7 $fs7 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +#define vr0 $vr0 +#define vr1 $vr1 +#define vr2 $vr2 +#define vr3 $vr3 +#define vr4 $vr4 +#define vr5 $vr5 +#define vr6 $vr6 +#define vr7 $vr7 +#define vr8 $vr8 +#define vr9 $vr9 +#define vr10 $vr10 +#define vr11 $vr11 +#define vr12 $vr12 +#define vr13 $vr13 +#define vr14 $vr14 +#define vr15 $vr15 +#define vr16 $vr16 +#define vr17 $vr17 +#define vr18 $vr18 +#define vr19 $vr19 +#define vr20 $vr20 +#define vr21 $vr21 +#define vr22 $vr22 +#define vr23 $vr23 +#define vr24 $vr24 +#define vr25 $vr25 +#define vr26 $vr26 +#define vr27 $vr27 +#define vr28 $vr28 +#define vr29 $vr29 +#define vr30 $vr30 +#define vr31 $vr31 + +#define xr0 $xr0 +#define xr1 $xr1 +#define xr2 $xr2 +#define xr3 $xr3 +#define xr4 $xr4 +#define xr5 $xr5 +#define xr6 $xr6 +#define xr7 $xr7 +#define xr8 $xr8 +#define xr9 $xr9 +#define xr10 $xr10 +#define xr11 $xr11 +#define xr12 $xr12 +#define xr13 $xr13 +#define xr14 $xr14 +#define xr15 $xr15 +#define xr16 $xr16 +#define xr17 $xr17 +#define xr18 $xr18 +#define xr19 $xr19 +#define xr20 $xr20 +#define xr21 $xr21 +#define xr22 $xr22 +#define xr23 $xr23 +#define xr24 $xr24 +#define xr25 $xr25 +#define xr26 $xr26 +#define xr27 $xr27 +#define xr28 $xr28 +#define xr29 $xr29 +#define xr30 $xr30 +#define xr31 $xr31 + +/* + *============================================================================ + * LSX/LASX synthesize instructions + *============================================================================ + */ + +/* + * Description : Dot product of byte vector elements + * Arguments : Inputs - vj, vk + * Outputs - vd + * Return Type - halfword + */ +.macro vdp2.h.bu vd, vj, vk + vmulwev.h.bu \vd, \vj, 
\vk + vmaddwod.h.bu \vd, \vj, \vk +.endm + +.macro vdp2.h.bu.b vd, vj, vk + vmulwev.h.bu.b \vd, \vj, \vk + vmaddwod.h.bu.b \vd, \vj, \vk +.endm + +.macro vdp2.w.h vd, vj, vk + vmulwev.w.h \vd, \vj, \vk + vmaddwod.w.h \vd, \vj, \vk +.endm + +.macro xvdp2.h.bu xd, xj, xk + xvmulwev.h.bu \xd, \xj, \xk + xvmaddwod.h.bu \xd, \xj, \xk +.endm + +.macro xvdp2.h.bu.b xd, xj, xk + xvmulwev.h.bu.b \xd, \xj, \xk + xvmaddwod.h.bu.b \xd, \xj, \xk +.endm + +.macro xvdp2.w.h xd, xj, xk + xvmulwev.w.h \xd, \xj, \xk + xvmaddwod.w.h \xd, \xj, \xk +.endm + +/* + * Description : Dot product & addition of halfword vector elements + * Arguments : Inputs - vj, vk + * Outputs - vd + * Return Type - twice size of input + */ +.macro vdp2add.h.bu vd, vj, vk + vmaddwev.h.bu \vd, \vj, \vk + vmaddwod.h.bu \vd, \vj, \vk +.endm + +.macro vdp2add.h.bu.b vd, vj, vk + vmaddwev.h.bu.b \vd, \vj, \vk + vmaddwod.h.bu.b \vd, \vj, \vk +.endm + +.macro vdp2add.w.h vd, vj, vk + vmaddwev.w.h \vd, \vj, \vk + vmaddwod.w.h \vd, \vj, \vk +.endm + +.macro xvdp2add.h.bu.b xd, xj, xk + xvmaddwev.h.bu.b \xd, \xj, \xk + xvmaddwod.h.bu.b \xd, \xj, \xk +.endm + +.macro xvdp2add.w.h xd, xj, xk + xvmaddwev.w.h \xd, \xj, \xk + xvmaddwod.w.h \xd, \xj, \xk +.endm + +/* + * Description : Range element vj[i] to vk[i] ~ vj[i] + * clip: vj > vk ? vj : vk && vj < va ? vj : va + */ +.macro vclip.h vd, vj, vk, va + vmax.h \vd, \vj, \vk + vmin.h \vd, \vd, \va +.endm + +.macro vclip.w vd, vj, vk, va + vmax.w \vd, \vj, \vk + vmin.w \vd, \vd, \va +.endm + +.macro xvclip.h xd, xj, xk, xa + xvmax.h \xd, \xj, \xk + xvmin.h \xd, \xd, \xa +.endm + +.macro xvclip.w xd, xj, xk, xa + xvmax.w \xd, \xj, \xk + xvmin.w \xd, \xd, \xa +.endm + +/* + * Description : Range element vj[i] to 0 ~ 255 + * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0 + */ +.macro vclip255.h vd, vj + vmaxi.h \vd, \vj, 0 + vsat.hu \vd, \vd, 7 +.endm + +.macro vclip255.w vd, vj + vmaxi.w \vd, \vj, 0 + vsat.wu \vd, \vd, 7 +.endm + +.macro xvclip255.h xd, xj + xvmaxi.h \xd, \xj, 0 + xvsat.hu \xd, \xd, 7 +.endm + +.macro xvclip255.w xd, xj + xvmaxi.w \xd, \xj, 0 + xvsat.wu \xd, \xd, 7 +.endm + +/* + * Description : Store elements of vector + * vd : Data vector to be stroed + * rk : Address of data storage + * ra : Offset of address + * si : Index of data in vd + */ +.macro vstelmx.b vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.b \vd, \rk, 0, \si +.endm + +.macro vstelmx.h vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.h \vd, \rk, 0, \si +.endm + +.macro vstelmx.w vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.w \vd, \rk, 0, \si +.endm + +.macro vstelmx.d vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.d \vd, \rk, 0, \si +.endm + +.macro vmov xd, xj + vor.v \xd, \xj, \xj +.endm + +.macro xmov xd, xj + xvor.v \xd, \xj, \xj +.endm + +.macro xvstelmx.d xd, rk, ra, si + add.d \rk, \rk, \ra + xvstelm.d \xd, \rk, 0, \si +.endm + +/* + *============================================================================ + * LSX/LASX custom macros + *============================================================================ + */ + +/* + * Load 4 float, double, V128, v256 elements with stride. 
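+ *
+ * In scalar terms each variant performs four loads at src, src + stride,
+ * src + stride2 and src + stride3 (callers typically pass 2*stride and
+ * 3*stride for the last two offsets). An illustrative C equivalent of the
+ * 64-bit variant, assuming <stdint.h>, <stddef.h> and <string.h>; the helper
+ * name is not part of this file:
+ *
+ *   static inline void fldd_loadx_4_ref(const void *src, ptrdiff_t stride,
+ *                                       ptrdiff_t stride2, ptrdiff_t stride3,
+ *                                       uint64_t out[4])
+ *   {
+ *       const uint8_t *s = src;
+ *       memcpy(&out[0], s,           8);
+ *       memcpy(&out[1], s + stride,  8);
+ *       memcpy(&out[2], s + stride2, 8);
+ *       memcpy(&out[3], s + stride3, 8);
+ *   }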
+ */ +.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + fld.s \out0, \src, 0 + fldx.s \out1, \src, \stride + fldx.s \out2, \src, \stride2 + fldx.s \out3, \src, \stride3 +.endm + +.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + fld.d \out0, \src, 0 + fldx.d \out1, \src, \stride + fldx.d \out2, \src, \stride2 + fldx.d \out3, \src, \stride3 +.endm + +.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + vld \out0, \src, 0 + vldx \out1, \src, \stride + vldx \out2, \src, \stride2 + vldx \out3, \src, \stride3 +.endm + +.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + xvld \out0, \src, 0 + xvldx \out1, \src, \stride + xvldx \out2, \src, \stride2 + xvldx \out3, \src, \stride3 +.endm + +/* + * Description : Transpose 4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + vilvl.h \tmp0, \in1, \in0 + vilvl.h \tmp1, \in3, \in2 + vilvl.w \out0, \tmp1, \tmp0 + vilvh.w \out2, \tmp1, \tmp0 + vilvh.d \out1, \out0, \out0 + vilvh.d \out3, \out0, \out2 +.endm + +/* + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + */ +.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + + vilvl.w \tmp0, \in1, \in0 + vilvh.w \out1, \in1, \in0 + vilvl.w \tmp1, \in3, \in2 + vilvh.w \out3, \in3, \in2 + + vilvl.d \out0, \tmp1, \tmp0 + vilvl.d \out2, \out3, \out1 + vilvh.d \out3, \out3, \out1 + vilvh.d \out1, \tmp1, \tmp0 +.endm + +/* + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \ + tmp3, tmp4, tmp5, tmp6, tmp7 + vilvl.h \tmp0, \in6, \in4 + vilvl.h \tmp1, \in7, \in5 + vilvl.h \tmp2, \in2, \in0 + vilvl.h \tmp3, \in3, \in1 + + vilvl.h \tmp4, \tmp1, \tmp0 + vilvh.h \tmp5, \tmp1, \tmp0 + vilvl.h \tmp6, \tmp3, \tmp2 + vilvh.h \tmp7, \tmp3, \tmp2 + + vilvh.h \tmp0, \in6, \in4 + vilvh.h \tmp1, \in7, \in5 + vilvh.h \tmp2, \in2, \in0 + vilvh.h \tmp3, \in3, \in1 + + vpickev.d \out0, \tmp4, \tmp6 + vpickod.d \out1, \tmp4, \tmp6 + vpickev.d \out2, \tmp5, \tmp7 + vpickod.d \out3, \tmp5, \tmp7 + + vilvl.h \tmp4, \tmp1, \tmp0 + vilvh.h \tmp5, \tmp1, \tmp0 + vilvl.h \tmp6, \tmp3, \tmp2 + vilvh.h \tmp7, \tmp3, \tmp2 + + vpickev.d \out4, \tmp4, \tmp6 + vpickod.d \out5, \tmp4, \tmp6 + vpickev.d \out6, \tmp5, \tmp7 + vpickod.d \out7, \tmp5, \tmp7 +.endm + +/* + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7,\ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + xvilvl.b \tmp0, \in2, \in0 + xvilvl.b \tmp1, \in3, \in1 + xvilvl.b \tmp2, \in6, \in4 + xvilvl.b \tmp3, \in7, \in5 + xvilvl.b \tmp4, \in10, \in8 + xvilvl.b 
\tmp5, \in11, \in9 + xvilvl.b \tmp6, \in14, \in12 + xvilvl.b \tmp7, \in15, \in13 + xvilvl.b \out0, \tmp1, \tmp0 + xvilvh.b \out1, \tmp1, \tmp0 + xvilvl.b \out2, \tmp3, \tmp2 + xvilvh.b \out3, \tmp3, \tmp2 + xvilvl.b \out4, \tmp5, \tmp4 + xvilvh.b \out5, \tmp5, \tmp4 + xvilvl.b \out6, \tmp7, \tmp6 + xvilvh.b \out7, \tmp7, \tmp6 + xvilvl.w \tmp0, \out2, \out0 + xvilvh.w \tmp2, \out2, \out0 + xvilvl.w \tmp4, \out3, \out1 + xvilvh.w \tmp6, \out3, \out1 + xvilvl.w \tmp1, \out6, \out4 + xvilvh.w \tmp3, \out6, \out4 + xvilvl.w \tmp5, \out7, \out5 + xvilvh.w \tmp7, \out7, \out5 + xvilvl.d \out0, \tmp1, \tmp0 + xvilvh.d \out1, \tmp1, \tmp0 + xvilvl.d \out2, \tmp3, \tmp2 + xvilvh.d \out3, \tmp3, \tmp2 + xvilvl.d \out4, \tmp5, \tmp4 + xvilvh.d \out5, \tmp5, \tmp4 + xvilvl.d \out6, \tmp7, \tmp6 + xvilvh.d \out7, \tmp7, \tmp6 +.endm + +/* + * Description : Transpose 4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.h \tmp0, \in1, \in0 + xvilvl.h \tmp1, \in3, \in2 + xvilvl.w \out0, \tmp1, \tmp0 + xvilvh.w \out2, \tmp1, \tmp0 + xvilvh.d \out1, \out0, \out0 + xvilvh.d \out3, \out0, \out2 +.endm + +/* + * Description : Transpose 4x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.h \tmp0, \in2, \in0 + xvilvl.h \tmp1, \in3, \in1 + xvilvl.h \out2, \tmp1, \tmp0 + xvilvh.h \out3, \tmp1, \tmp0 + + xvilvl.d \out0, \out2, \out2 + xvilvh.d \out1, \out2, \out2 + xvilvl.d \out2, \out3, \out3 + xvilvh.d \out3, \out3, \out3 +.endm + +/* + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + xvilvl.h \tmp0, \in6, \in4 + xvilvl.h \tmp1, \in7, \in5 + xvilvl.h \tmp2, \in2, \in0 + xvilvl.h \tmp3, \in3, \in1 + + xvilvl.h \tmp4, \tmp1, \tmp0 + xvilvh.h \tmp5, \tmp1, \tmp0 + xvilvl.h \tmp6, \tmp3, \tmp2 + xvilvh.h \tmp7, \tmp3, \tmp2 + + xvilvh.h \tmp0, \in6, \in4 + xvilvh.h \tmp1, \in7, \in5 + xvilvh.h \tmp2, \in2, \in0 + xvilvh.h \tmp3, \in3, \in1 + + xvpickev.d \out0, \tmp4, \tmp6 + xvpickod.d \out1, \tmp4, \tmp6 + xvpickev.d \out2, \tmp5, \tmp7 + xvpickod.d \out3, \tmp5, \tmp7 + + xvilvl.h \tmp4, \tmp1, \tmp0 + xvilvh.h \tmp5, \tmp1, \tmp0 + xvilvl.h \tmp6, \tmp3, \tmp2 + xvilvh.h \tmp7, \tmp3, \tmp2 + + xvpickev.d \out4, \tmp4, \tmp6 + xvpickod.d \out5, \tmp4, \tmp6 + xvpickev.d \out6, \tmp5, \tmp7 + xvpickod.d \out7, \tmp5, \tmp7 +.endm + +/* + * Description : Transpose 2x4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1, tmp2 + xvilvh.h \tmp1, \in0, \in1 + xvilvl.h \out1, \in0, \in1 + xvilvh.h \tmp0, \in2, \in3 + xvilvl.h \out3, \in2, \in3 + + xvilvh.w \tmp2, \out3, \out1 + xvilvl.w \out3, \out3, \out1 + + xvilvl.w \out2, \tmp0, \tmp1 + xvilvh.w \tmp1, \tmp0, \tmp1 + + xvilvh.d \out0, \out2, \out3 + xvilvl.d \out2, \out2, \out3 + xvilvh.d \out1, \tmp1, \tmp2 + xvilvl.d \out3, \tmp1, \tmp2 
+.endm + +/* + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13 + * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14 + * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15 + * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16 + */ +.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + + xvilvl.w \tmp0, \in1, \in0 + xvilvh.w \out1, \in1, \in0 + xvilvl.w \tmp1, \in3, \in2 + xvilvh.w \out3, \in3, \in2 + + xvilvl.d \out0, \tmp1, \tmp0 + xvilvl.d \out2, \out3, \out1 + xvilvh.d \out3, \out3, \out1 + xvilvh.d \out1, \tmp1, \tmp0 +.endm + +/* + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * in0 : 1,2,3,4,5,6,7,8 + * in1 : 2,2,3,4,5,6,7,8 + * in2 : 3,2,3,4,5,6,7,8 + * in3 : 4,2,3,4,5,6,7,8 + * in4 : 5,2,3,4,5,6,7,8 + * in5 : 6,2,3,4,5,6,7,8 + * in6 : 7,2,3,4,5,6,7,8 + * in7 : 8,2,3,4,5,6,7,8 + * + * out0 : 1,2,3,4,5,6,7,8 + * out1 : 2,2,2,2,2,2,2,2 + * out2 : 3,3,3,3,3,3,3,3 + * out3 : 4,4,4,4,4,4,4,4 + * out4 : 5,5,5,5,5,5,5,5 + * out5 : 6,6,6,6,6,6,6,6 + * out6 : 7,7,7,7,7,7,7,7 + * out7 : 8,8,8,8,8,8,8,8 + */ +.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\ + out0, out1, out2, out3, out4, out5, out6, out7,\ + tmp0, tmp1, tmp2, tmp3 + xvilvl.w \tmp0, \in2, \in0 + xvilvl.w \tmp1, \in3, \in1 + xvilvh.w \tmp2, \in2, \in0 + xvilvh.w \tmp3, \in3, \in1 + xvilvl.w \out0, \tmp1, \tmp0 + xvilvh.w \out1, \tmp1, \tmp0 + xvilvl.w \out2, \tmp3, \tmp2 + xvilvh.w \out3, \tmp3, \tmp2 + + xvilvl.w \tmp0, \in6, \in4 + xvilvl.w \tmp1, \in7, \in5 + xvilvh.w \tmp2, \in6, \in4 + xvilvh.w \tmp3, \in7, \in5 + xvilvl.w \out4, \tmp1, \tmp0 + xvilvh.w \out5, \tmp1, \tmp0 + xvilvl.w \out6, \tmp3, \tmp2 + xvilvh.w \out7, \tmp3, \tmp2 + + xmov \tmp0, \out0 + xmov \tmp1, \out1 + xmov \tmp2, \out2 + xmov \tmp3, \out3 + xvpermi.q \out0, \out4, 0x02 + xvpermi.q \out1, \out5, 0x02 + xvpermi.q \out2, \out6, 0x02 + xvpermi.q \out3, \out7, 0x02 + xvpermi.q \out4, \tmp0, 0x31 + xvpermi.q \out5, \tmp1, 0x31 + xvpermi.q \out6, \tmp2, 0x31 + xvpermi.q \out7, \tmp3, 0x31 +.endm + +/* + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Example : LASX_TRANSPOSE4x4_D + * in0 : 1,2,3,4 + * in1 : 1,2,3,4 + * in2 : 1,2,3,4 + * in3 : 1,2,3,4 + * + * out0 : 1,1,1,1 + * out1 : 2,2,2,2 + * out2 : 3,3,3,3 + * out3 : 4,4,4,4 + */ +.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.d \tmp0, \in1, \in0 + xvilvh.d \out1, \in1, \in0 + xvilvh.d \tmp1, \in3, \in2 + xvilvl.d \out2, \in3, \in2 + + xvor.v \out0, \tmp0, \tmp0 + xvor.v \out3, \tmp1, \tmp1 + + xvpermi.q \out0, \out2, 0x02 + xvpermi.q \out2, \tmp0, 0x31 + xvpermi.q \out3, \out1, 0x31 + xvpermi.q \out1, \tmp1, 0x02 +.endm diff --git a/third_party/dav1d/src/loongarch/loopfilter.S b/third_party/dav1d/src/loongarch/loopfilter.S new file mode 100644 index 0000000000..e71d5a712e --- /dev/null +++ b/third_party/dav1d/src/loongarch/loopfilter.S @@ -0,0 +1,1108 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/loongson_asm.S" + +.macro FILTER_W4 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -2 + fld.s f6, t5, 0 //p1 p0 q0 q1 + fldx.s f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.s f8, t5, 0 + fldx.s f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvl.h vr6, vr7, vr6 //p1p1p1p1 + vbsrl.v vr7, vr6, 4 //p0p0p0p0 + vbsrl.v vr8, vr7, 4 //q0q0q0q0 + vbsrl.v vr9, vr8, 4 //q1q1q1q1 +.else + sub.d t5, a0, a1 + fld.s f7, t5, 0 + sub.d t5, t5, a1 + fld.s f6, t5, 0 + fld.s f8, a0, 0 + fldx.s f9, a0, a1 +.endif + + vabsd.bu vr10, vr6, vr7 // (p1 - p0) + vabsd.bu vr11, vr9, vr8 // (q1 - q0) + vabsd.bu vr12, vr7, vr8 // (p0 - q0) + vabsd.bu vr13, vr6, vr9 // (p1 - q1) + + vmax.bu vr14, vr10, vr11 + vsle.bu vr15, vr14, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I + vsadd.bu vr16, vr12, vr12 + vsrli.b vr17, vr13, 1 + vsadd.bu vr16, vr16, vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) + vsle.bu vr16, vr16, vr3 + vand.v vr20, vr15, vr16 //fm + + vpickve2gr.wu t5, vr20, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W4 + + vslt.bu vr16, vr2, vr14 //hev + + vsllwil.h.b vr30, vr20, 0 //expand fm to w + vsllwil.w.h vr30, vr30, 0 + + vsllwil.hu.bu vr17, vr6, 0 + vsllwil.hu.bu vr18, vr9, 0 + vsub.h vr17, vr17, vr18 + vssrarni.b.h vr17, vr17, 0 //f = iclip_diff(p1 - q1) + + vand.v vr17, vr17, vr16 + vsllwil.h.b vr18, vr17, 0 + + vsllwil.hu.bu vr10, vr8, 0 + vsllwil.hu.bu vr11, vr7, 0 + vsub.h vr10, vr10, vr11 + + vsadd.h vr11, vr10, vr10 + vsadd.h vr10, vr10, vr11 //3 * (q0 - p0) + vsadd.h vr10, vr10, vr18 //f = iclip_diff(3 * (q0 - p0) + f); + vssrani.b.h vr10, vr10, 0 + vsllwil.h.b vr10, vr10, 0 + + vaddi.hu vr11, vr10, 4 + vaddi.hu vr12, vr10, 3 + li.w t5, 127 + vreplgr2vr.h vr13, t5 + vmin.h vr11, vr11, vr13 + vmin.h vr12, vr12, vr13 + vsrai.h vr11, vr11, 3 //f1 + vsrai.h vr12, vr12, 3 //f2 + + vsllwil.hu.bu vr13, vr7, 0 //p0 + vsllwil.hu.bu vr14, vr8, 0 //q0 + vsadd.h vr13, vr13, vr12 + vssub.h vr14, vr14, vr11 + vssrani.bu.h vr13, vr13, 0 //dst-1 + vssrani.bu.h vr14, vr14, 0 //dst+0 + + vsrari.h vr15, vr11, 1 //f + vsllwil.hu.bu vr18, vr6, 0 //p1 + vsllwil.hu.bu vr19, vr9, 0 //q1 + vsadd.h vr18, vr18, vr15 + vssub.h vr19, vr19, vr15 + vssrani.bu.h vr18, vr18, 0 //dst-2 + vssrani.bu.h vr19, 
vr19, 0 //dst+1 + vbitsel.v vr26, vr18, vr6, vr16 + vbitsel.v vr29, vr19, vr9, vr16 + + vbitsel.v vr6, vr6, vr26, vr20 + vbitsel.v vr7, vr7, vr13, vr20 + vbitsel.v vr8, vr8, vr14, vr20 + vbitsel.v vr9, vr9, vr29, vr20 + +.ifc \DIR, h + vilvl.b vr6, vr7, vr6 + vilvl.b vr9, vr9, vr8 + vilvl.h vr6, vr9, vr6 + + addi.d t5, a0, -2 + vstelm.w vr6, t5, 0, 0 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 1 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 2 + add.d t5, t5, a1 + vstelm.w vr6, t5, 0, 3 +.else + fst.s f8, a0, 0 + fstx.s f9, a0, a1 + sub.d t5, a0, a1 + fst.s f7, t5, 0 + sub.d t5, t5, a1 + fst.s f6, t5, 0 +.endif +.END_FILTER_\DIR\()\TYPE\()_W4: +.endm + +.macro FILTER_W6 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -3 + fld.d f6, t5, 0 //p2 p1 p0 q0 q1 q2 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvh.h vr10, vr7, vr6 + vilvl.h vr6, vr7, vr6 + + vbsrl.v vr7, vr6, 4 //p1 + vbsrl.v vr8, vr7, 4 //p0 + vbsrl.v vr9, vr8, 4 //q0 + vbsrl.v vr11, vr10, 4 //q2 +.else + alsl.d t5, a1, a1, 1 + sub.d t5, a0, t5 + fld.d f6, t5, 0 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f10, t5, 0 + fldx.d f11, t5, a1 +.endif + + vabsd.bu vr12, vr7, vr8 //abs(p1-p0) + vabsd.bu vr13, vr10, vr9 //abs(q1-q0) + vmax.bu vr14, vr12, vr13 + vslt.bu vr2, vr2, vr14 //hev + vabsd.bu vr12, vr6, vr7 //abs(p2-p1) + vmax.bu vr12, vr12, vr14 + vabsd.bu vr13, vr11, vr10 //abs(q2-q1) + vmax.bu vr12, vr12, vr13 + vsle.bu vr0, vr12, vr4 // <=I + + vabsd.bu vr13, vr8, vr9 //abs(p0-q0) + vsadd.bu vr13, vr13, vr13 + vabsd.bu vr15, vr7, vr10 + vsrli.b vr15, vr15, 1 + vsadd.bu vr13, vr13, vr15 + vsle.bu vr13, vr13, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E + vand.v vr0, vr0, vr13 //fm + + vpickve2gr.wu t5, vr0, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W6 + + vabsd.bu vr12, vr6, vr8 //abs(p2-p0) + vabsd.bu vr13, vr11, vr9 //abs(q2-q0) + vmax.bu vr12, vr12, vr14 + vmax.bu vr12, vr12, vr13 + vxor.v vr13, vr13, vr13 + vaddi.bu vr13, vr13, 1 + vsle.bu vr1, vr12, vr13 //flat8in + + //6789 10 11 --expand to h + vsllwil.hu.bu vr12, vr6, 0 + vsllwil.hu.bu vr13, vr7, 0 + vsllwil.hu.bu vr14, vr8, 0 + vsllwil.hu.bu vr15, vr9, 0 + vsllwil.hu.bu vr16, vr10, 0 + vsllwil.hu.bu vr17, vr11, 0 + + //dst-2 + vsadd.hu vr18, vr12, vr12 + vsadd.hu vr18, vr18, vr12 + vsadd.hu vr18, vr18, vr13 + vsadd.hu vr18, vr18, vr13 + vsadd.hu vr18, vr18, vr14 + vsadd.hu vr18, vr18, vr14 + vsadd.hu vr18, vr18, vr15 + + //dst-1 + vsadd.hu vr19, vr18, vr15 + vsadd.hu vr19, vr19, vr16 + vssub.hu vr19, vr19, vr12 + vssub.hu vr19, vr19, vr12 + + //dst+0 + vsadd.hu vr20, vr19, vr17 + vsadd.hu vr20, vr20, vr16 + vssub.hu vr20, vr20, vr12 + vssub.hu vr20, vr20, vr13 + + //dst+1 + vsadd.hu vr21, vr20, vr17 + vsadd.hu vr21, vr21, vr17 + vssub.hu vr21, vr21, vr13 + vssub.hu vr21, vr21, vr14 + + vsrari.h vr18, vr18, 3 + vsrari.h vr19, vr19, 3 + vsrari.h vr20, vr20, 3 + vsrari.h vr21, vr21, 3 + + vsub.h vr22, vr13, vr16 + vssrani.b.h vr22, vr22, 0 + vand.v vr22, vr22, vr2 + vsllwil.h.b vr22, vr22, 0 //f = iclip_diff(p1 - q1); + + vsub.h vr23, vr15, vr14 + vsadd.h vr24, vr23, vr23 + vsadd.h vr23, vr23, vr24 + vsadd.h vr23, vr23, vr22 + vssrani.b.h vr23, vr23, 0 + vsllwil.h.b vr23, vr23, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr24, vr23, 4 + vaddi.hu vr25, vr23, 3 + li.w t5, 127 + vreplgr2vr.h vr3, t5 + vmin.h vr24, vr24, vr3 + vmin.h vr25, vr25, vr3 + vsrai.h vr24, vr24, 3 //f1 + vsrai.h vr25, vr25, 3 //f2 + + vsadd.h vr26, 
vr14, vr25 //dst-1 + vssub.h vr27, vr15, vr24 //dst+0 + + vsrari.h vr24, vr24, 1 + vsadd.h vr28, vr13, vr24 + vssub.h vr29, vr16, vr24 + vsllwil.h.b vr2, vr2, 0 + vbitsel.v vr28, vr28, vr13, vr2 //dst-2 + vbitsel.v vr29, vr29, vr16, vr2 //dst+1 + + //flat8in + vsllwil.h.b vr1, vr1, 0 + vbitsel.v vr18, vr28, vr18, vr1 + vbitsel.v vr19, vr26, vr19, vr1 + vbitsel.v vr20, vr27, vr20, vr1 + vbitsel.v vr21, vr29, vr21, vr1 + + vssrani.bu.h vr18, vr18, 0 + vssrani.bu.h vr19, vr19, 0 + vssrani.bu.h vr20, vr20, 0 + vssrani.bu.h vr21, vr21, 0 + + vbitsel.v vr7, vr7, vr18, vr0 //p1 + vbitsel.v vr8, vr8, vr19, vr0 //p0 + vbitsel.v vr9, vr9, vr20, vr0 //q0 + vbitsel.v vr10, vr10, vr21, vr0 //q1 + +.ifc \DIR, h + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr10, vr9 + vilvl.h vr7, vr9, vr7 + + addi.d t5, a0, -2 + vstelm.w vr7, t5, 0, 0 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 1 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 2 + add.d t5, t5, a1 + vstelm.w vr7, t5, 0, 3 +.else + fst.s f9, a0, 0 + fstx.s f10, a0, a1 + sub.d t5, a0, a1 + fst.s f8, t5, 0 + sub.d t5, t5, a1 + fst.s f7, t5, 0 +.endif +.END_FILTER_\DIR\()\TYPE\()_W6: +.endm + +.macro FILTER_W8 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -4 + fld.d f6, t5, 0 //p3 p2 p1 p0 q0 q1 q2 q3 + fldx.d f7, t5, a1 + alsl.d t5, a1, t5, 1 + fld.d f8, t5, 0 + fldx.d f9, t5, a1 + + vilvl.b vr6, vr7, vr6 + vilvl.b vr7, vr9, vr8 + vilvh.h vr10, vr7, vr6 //q0 + vilvl.h vr6, vr7, vr6 //p3 + vbsrl.v vr7, vr6, 4 //p2 + vbsrl.v vr8, vr6, 8 //p1 + vbsrl.v vr9, vr6, 12 //p0 + vbsrl.v vr11, vr10, 4 //q1 + vbsrl.v vr12, vr10, 8 //q2 + vbsrl.v vr13, vr10, 12 //q3 +.else + fld.s f10, a0, 0 + fldx.s f11, a0, a1 + add.d t5, a0, a1 + fldx.s f12, t5, a1 + add.d t5, t5, a1 + fldx.s f13, t5, a1 + sub.d t5, a0, a1 + fld.s f9, t5, 0 + sub.d t5, t5, a1 + fld.s f8, t5, 0 + sub.d t5, t5, a1 + fld.s f7, t5, 0 + sub.d t5, t5, a1 + fld.s f6, t5, 0 +.endif + + vabsd.bu vr14, vr8, vr9 //p1-p0 + vabsd.bu vr15, vr11, vr10 //q1-q0 + vabsd.bu vr16, vr9, vr10 //p0-q0 + vabsd.bu vr17, vr8, vr11 //p1-q1 + vabsd.bu vr18, vr7, vr8 //p2-p1 + vabsd.bu vr19, vr12, vr11 //q2-q1 + vabsd.bu vr20, vr6, vr7 //p3-p2 + vabsd.bu vr21, vr13, vr12 //q3-q2 + + vmax.bu vr22, vr14, vr15 + vsle.bu vr23, vr22, vr4 //abs(p1 - p0) <= I && abs(q1 - q0) <= I + vsadd.bu vr16, vr16, vr16 + vsrli.b vr17, vr17, 1 + vsadd.bu vr16, vr16, vr17 + vsle.bu vr16, vr16, vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E + vand.v vr16, vr16, vr23 //fm + + vpickve2gr.wu t5, vr16, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W8 + + vmax.bu vr23, vr18, vr19 + vmax.bu vr23, vr23, vr20 + vmax.bu vr23, vr23, vr21 + vsle.bu vr23, vr23, vr4 + vand.v vr16, vr16, vr23 //fm + + vabsd.bu vr17, vr7, vr9 //abs(p2-p0) + vabsd.bu vr18, vr12, vr10 //abs(q2-q0) + vmax.bu vr17, vr17, vr14 + vmax.bu vr17, vr17, vr15 + vmax.bu vr17, vr17, vr18 + vabsd.bu vr18, vr6, vr9 //abs(p3 - p0) + vabsd.bu vr19, vr13, vr10 //abs(q3 - q0) + vmax.bu vr17, vr17, vr18 + vmax.bu vr17, vr17, vr19 + + vxor.v vr5, vr5, vr5 + vaddi.bu vr5, vr5, 1 //F + vsle.bu vr17, vr17, vr5 //flat8in + + vsllwil.hu.bu vr0, vr6, 0 //p3 + vsllwil.hu.bu vr1, vr7, 0 //p2 + vsllwil.hu.bu vr27, vr8, 0 //p1 + vsllwil.hu.bu vr3, vr9, 0 //p0 + vsllwil.hu.bu vr4, vr10, 0 //q0 + vsllwil.hu.bu vr5, vr11, 0 //q1 + vsllwil.hu.bu vr14, vr12, 0 //q2 + vsllwil.hu.bu vr15, vr13, 0 //q3 + + vsadd.hu vr18, vr0, vr0 //p3+p3 + vsadd.hu vr19, vr15, vr15 //q3+q3 + vsadd.hu vr20, vr0, vr1 //p3+p2 + vsadd.hu vr21, vr1, vr27 //p2+p1 + vsadd.hu vr28, vr27, vr3 //p1+p0 + vsadd.hu vr23, vr3, vr4 //p0+q0 + vsadd.hu vr24, vr4, vr5 
//q0+q1 + vsadd.hu vr25, vr5, vr14 //q1+q2 + vsadd.hu vr26, vr14, vr15 //q2+q3 + + // dst-3 + vsadd.hu vr29, vr18, vr20 + vsadd.hu vr29, vr29, vr21 + vsadd.hu vr29, vr29, vr23 + + // dst-2 + vsadd.hu vr30, vr18, vr21 + vsadd.hu vr30, vr30, vr28 + vsadd.hu vr30, vr30, vr24 + + // dst-1 + vsadd.hu vr31, vr20, vr28 + vsadd.hu vr31, vr31, vr23 + vsadd.hu vr31, vr31, vr25 + + // dst+0 + vsadd.hu vr18, vr21, vr23 + vsadd.hu vr18, vr18, vr24 + vsadd.hu vr18, vr18, vr26 + + //dst+1 + vsadd.hu vr20, vr28, vr24 + vsadd.hu vr20, vr20, vr25 + vsadd.hu vr20, vr20, vr19 + + //dst+2 + vsadd.hu vr21, vr23, vr25 + vsadd.hu vr21, vr21, vr26 + vsadd.hu vr21, vr21, vr19 + + vssrarni.bu.h vr23, vr29, 3 + vssrarni.bu.h vr24, vr30, 3 + vssrarni.bu.h vr25, vr31, 3 + vssrarni.bu.h vr19, vr18, 3 + vssrarni.bu.h vr20, vr20, 3 + vssrarni.bu.h vr21, vr21, 3 + + // !flat8in + vslt.bu vr2, vr2, vr22 //hev + + vsub.h vr30, vr27, vr5 //p1-q1 + vssrani.b.h vr30, vr30, 0 + vand.v vr30, vr30, vr2 + vsllwil.h.b vr30, vr30, 0 + + vsub.h vr31, vr4, vr3 + vsadd.h vr0, vr31, vr31 + vsadd.h vr31, vr31, vr0 + vsadd.h vr31, vr31, vr30 + vssrani.b.h vr31, vr31, 0 + vsllwil.h.b vr31, vr31, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr14, vr31, 4 + vaddi.hu vr15, vr31, 3 + li.w t5, 127 + vreplgr2vr.h vr18, t5 + vmin.h vr14, vr14, vr18 + vmin.h vr15, vr15, vr18 + vsrai.h vr14, vr14, 3 //f1 + vsrai.h vr15, vr15, 3 //f2 + + vsadd.h vr3, vr3, vr15 + vssub.h vr4, vr4, vr14 + vssrani.bu.h vr3, vr3, 0 //dst-1 + vssrani.bu.h vr4, vr4, 0 //dst+0 + + vsrari.h vr14, vr14, 1 + vsadd.h vr18, vr27, vr14 + vssub.h vr26, vr5, vr14 + vssrani.bu.h vr18, vr18, 0 //dst-2 + vssrani.bu.h vr26, vr26, 0 //dst+1 + + vbitsel.v vr27, vr18, vr8, vr2 //dst-2 + vbitsel.v vr28, vr26, vr11, vr2 //dst+1 + + vbitsel.v vr23, vr7, vr23, vr17 //dst-3 (p2) + vbitsel.v vr24, vr27, vr24, vr17 //dst-2 + vbitsel.v vr25, vr3, vr25, vr17 //dst-1 + vbitsel.v vr19, vr4, vr19, vr17 //dst+0 + vbitsel.v vr20, vr28, vr20, vr17 //dst+1 + vbitsel.v vr21, vr12, vr21, vr17 //dst+2 + + vbitsel.v vr7, vr7, vr23, vr16 //-3 + vbitsel.v vr8, vr8, vr24, vr16 //-2 + vbitsel.v vr9, vr9, vr25, vr16 //-1 + vbitsel.v vr10, vr10, vr19, vr16 //+0 + vbitsel.v vr11, vr11, vr20, vr16 //+1 + vbitsel.v vr12, vr12, vr21, vr16 //+2 + +.ifc \DIR, h + vilvl.b vr6, vr7, vr6 + vilvl.b vr8, vr9, vr8 + vilvl.b vr10, vr11, vr10 + vilvl.b vr12, vr13, vr12 + vilvl.h vr6, vr8, vr6 //p3p2p1p0 -- -- -- + vilvl.h vr10, vr12, vr10 //q0q1q2q3 -- -- -- + vilvl.w vr0, vr10, vr6 //p3p2p1p0q0q1q2q3 -- + vilvh.w vr1, vr10, vr6 //-- + + addi.d t5, a0, -4 + vstelm.d vr0, t5, 0, 0 + add.d t5, t5, a1 + vstelm.d vr0, t5, 0, 1 + add.d t5, t5, a1 + vstelm.d vr1, t5, 0, 0 + add.d t5, t5, a1 + vstelm.d vr1, t5, 0, 1 +.else + alsl.d t5, a1, a1, 1 + sub.d t5, a0, t5 + fst.s f7, t5, 0 + fstx.s f8, t5, a1 + add.d t5, t5, a1 + fstx.s f9, t5, a1 + + fst.s f10, a0, 0 + add.d t5, a0, a1 + fst.s f11, t5, 0 + fstx.s f12, t5, a1 +.endif +.END_FILTER_\DIR\()\TYPE\()_W8: +.endm + +.macro FILTER_W16 DIR, TYPE +.ifc \DIR, h + addi.d t5, a0, -7 + vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6 + vldx vr7, t5, a1 + add.d t5, t5, a1 + vldx vr8, t5, a1 + add.d t5, t5, a1 + vldx vr9, t5, a1 + + vilvl.b vr10, vr7, vr6 + vilvh.b vr11, vr7, vr6 + vilvl.b vr12, vr9, vr8 + vilvh.b vr13, vr9, vr8 + vilvl.h vr6, vr12, vr10 + vilvh.h vr10, vr12, vr10 //p2--- + vilvl.h vr15, vr13, vr11 //q1--- + vilvh.h vr19, vr13, vr11 + + vbsrl.v vr7, vr6, 4 //p5--- + vbsrl.v vr8, vr6, 8 //p4--- + vbsrl.v vr9, vr6, 12 //p3--- + vbsrl.v vr12, vr10, 4 //p1--- + vbsrl.v 
vr13, vr10, 8 //p0--- + vbsrl.v vr14, vr10, 12 //q0--- + vbsrl.v vr16, vr15, 4 //q2--- + vbsrl.v vr17, vr15, 8 //q3--- + vbsrl.v vr18, vr15, 12 //q4--- + vbsrl.v vr20, vr19, 4 //q6--- +.else + slli.d t5, a1, 3 + sub.d t5, a0, t5 + fldx.s f6, t5, a1 //p6 + alsl.d t5, a1, t5, 1 + fld.s f7, t5, 0 //p5 + fldx.s f8, t5, a1 //p4 + alsl.d t5, a1, t5, 1 + fld.s f9, t5, 0 //p3 + fldx.s f10, t5, a1 //p2 + alsl.d t5, a1, t5, 1 + fld.s f12, t5, 0 //p1 + fldx.s f13, t5, a1 //p0 + alsl.d t5, a1, t5, 1 + fld.s f14, t5, 0 //q0 + fldx.s f15, t5, a1 //q1 + alsl.d t5, a1, t5, 1 + fld.s f16, t5, 0 //q2 + fldx.s f17, t5, a1 //q3 + alsl.d t5, a1, t5, 1 + fld.s f18, t5, 0 //q4 + fldx.s f19, t5, a1 //q5 + add.d t5, t5, a1 + fldx.s f20, t5, a1 //q6 + + //temp store + addi.d sp, sp, -96 + fst.d f7, sp, 0 + fst.d f8, sp, 8 + fst.d f9, sp, 16 + fst.d f10, sp, 24 + fst.d f12, sp, 32 + fst.d f13, sp, 40 + fst.d f14, sp, 48 + fst.d f15, sp, 56 + fst.d f16, sp, 64 + fst.d f17, sp, 72 + fst.d f18, sp, 80 + fst.d f19, sp, 88 +.endif + + vabsd.bu vr21, vr12, vr13 //abs(p1-p0) + vabsd.bu vr22, vr15, vr14 //abs(q1-q0) + vmax.bu vr0, vr21, vr22 + vslt.bu vr2, vr2, vr0 //hev + vabsd.bu vr1, vr10, vr12 //abs(p2-p1) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr16, vr15 //abs(q2-q1) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr9, vr10 //abs(p3-p2) + vmax.bu vr0, vr0, vr1 + vabsd.bu vr1, vr17, vr16 //abs(q3-q2) + vmax.bu vr0, vr0, vr1 + vsle.bu vr0, vr0, vr4 //vr4 released I + vabsd.bu vr1, vr13, vr14 //abs(p0-q0) + vsadd.bu vr1, vr1, vr1 + vabsd.bu vr4, vr12, vr15 //abs(p1-q1) + vsrli.b vr4, vr4, 1 + vsadd.bu vr1, vr1, vr4 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) + vsle.bu vr1, vr1, vr3 //vr3 released E + vand.v vr0, vr0, vr1 //fm + + vpickve2gr.wu t5, vr0, 0 + beqz t5, .END_FILTER_\DIR\()\TYPE\()_W16 + + vabsd.bu vr1, vr6, vr13 //abs(p6-p0) + vabsd.bu vr4, vr7, vr13 //abs(p5-p0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr8, vr13 //abs(p4-p0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr18, vr14 //abs(q4-q0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr19, vr14 //abs(q5-q0) + vmax.bu vr1, vr1, vr4 + vabsd.bu vr4, vr20, vr14 + vmax.bu vr1, vr1, vr4 + vxor.v vr5, vr5, vr5 + vaddi.bu vr5, vr5, 1 //F + vsle.bu vr1, vr1, vr5 //flat8out + + vabsd.bu vr3, vr10, vr13 //abs(p2-p0) + vmax.bu vr3, vr3, vr21 + vmax.bu vr3, vr3, vr22 + vabsd.bu vr4, vr16, vr14 //abs(q2-q0) + vmax.bu vr3, vr3, vr4 + vabsd.bu vr4, vr9, vr13 //abs(p3-p0) + vmax.bu vr3, vr3, vr4 + vabsd.bu vr4, vr17, vr14 //abs(q3-q0) + vmax.bu vr3, vr3, vr4 + vsle.bu vr3, vr3, vr5 //flatin released vr5 + + vsllwil.hu.bu vr6, vr6, 0 //p6 + vsllwil.hu.bu vr7, vr7, 0 //p5 + vsllwil.hu.bu vr8, vr8, 0 //p4 + vsllwil.hu.bu vr9, vr9, 0 //p3 + vsllwil.hu.bu vr10, vr10, 0 //p2 + vsllwil.hu.bu vr12, vr12, 0 //p1 + vsllwil.hu.bu vr13, vr13, 0 //p0 + vsllwil.hu.bu vr14, vr14, 0 //q0 + vsllwil.hu.bu vr15, vr15, 0 //q1 + vsllwil.hu.bu vr16, vr16, 0 //q2 + vsllwil.hu.bu vr17, vr17, 0 //q3 + vsllwil.hu.bu vr18, vr18, 0 //q4 + vsllwil.hu.bu vr19, vr19, 0 //q5 + vsllwil.hu.bu vr20, vr20, 0 //q6 + + //dst-6 + vslli.w vr21, vr6, 3 + vssub.hu vr21, vr21, vr6 + vsadd.hu vr21, vr21, vr7 + vsadd.hu vr21, vr21, vr7 + vsadd.hu vr21, vr21, vr8 + vsadd.hu vr21, vr21, vr8 + vsadd.hu vr21, vr21, vr9 + vsadd.hu vr21, vr21, vr10 + vsadd.hu vr21, vr21, vr12 + vsadd.hu vr21, vr21, vr13 + vsadd.hu vr21, vr21, vr14 + + //dst-5 + vsadd.hu vr22, vr21, vr15 + vsadd.hu vr22, vr22, vr9 + vssub.hu vr22, vr22, vr6 + vssub.hu vr22, vr22, vr6 + + //dst-4 + vsadd.hu vr23, vr22, vr16 + vsadd.hu vr23, vr23, vr10 + vssub.hu vr23, vr23, vr7 
+ vssub.hu vr23, vr23, vr6 + + //dst-3 + vsadd.hu vr24, vr23, vr12 + vsadd.hu vr24, vr24, vr17 + vssub.hu vr24, vr24, vr6 + vssub.hu vr24, vr24, vr8 + + //dst-2 + vsadd.hu vr25, vr24, vr18 + vsadd.hu vr25, vr25, vr13 + vssub.hu vr25, vr25, vr6 + vssub.hu vr25, vr25, vr9 + + //dst-1 + vsadd.hu vr26, vr25, vr19 + vsadd.hu vr26, vr26, vr14 + vssub.hu vr26, vr26, vr6 + vssub.hu vr26, vr26, vr10 + + //dst+0 + vsadd.hu vr27, vr26, vr20 + vsadd.hu vr27, vr27, vr15 + vssub.hu vr27, vr27, vr6 + vssub.hu vr27, vr27, vr12 + + //dst+1 + vsadd.hu vr28, vr27, vr20 + vsadd.hu vr28, vr28, vr16 + vssub.hu vr28, vr28, vr7 + vssub.hu vr28, vr28, vr13 + + //dst+2 + vsadd.hu vr29, vr28, vr20 + vsadd.hu vr29, vr29, vr17 + vssub.hu vr29, vr29, vr8 + vssub.hu vr29, vr29, vr14 + + //dst+3 + vsadd.hu vr30, vr29, vr20 + vsadd.hu vr30, vr30, vr18 + vssub.hu vr30, vr30, vr9 + vssub.hu vr30, vr30, vr15 + + //dst+4 + vsadd.hu vr31, vr30, vr20 + vsadd.hu vr31, vr31, vr19 + vssub.hu vr31, vr31, vr10 + vssub.hu vr31, vr31, vr16 + + //dst+5 + vsadd.hu vr11, vr31, vr20 + vsadd.hu vr11, vr11, vr20 + vssub.hu vr11, vr11, vr12 + vssub.hu vr11, vr11, vr17 + + vsrari.h vr21, vr21, 4 + vsrari.h vr22, vr22, 4 + vsrari.h vr23, vr23, 4 + vsrari.h vr24, vr24, 4 + vsrari.h vr25, vr25, 4 + vsrari.h vr26, vr26, 4 + vsrari.h vr27, vr27, 4 + vsrari.h vr28, vr28, 4 + vsrari.h vr29, vr29, 4 + vsrari.h vr30, vr30, 4 + vsrari.h vr31, vr31, 4 + vsrari.h vr11, vr11, 4 + + vand.v vr1, vr1, vr3 + vsllwil.h.b vr1, vr1, 0 //expand to h + //(flat8out & flat8in) + vbitsel.v vr21, vr7, vr21, vr1 //dst-6 + vbitsel.v vr22, vr8, vr22, vr1 //dst-5 + vbitsel.v vr23, vr9, vr23, vr1 //dst-4 + vbitsel.v vr30, vr17, vr30, vr1 //dst+3 + vbitsel.v vr31, vr18, vr31, vr1 //dst+4 + vbitsel.v vr11, vr19, vr11, vr1 //dst+5 + + //flat8in + //dst-3 + vslli.h vr4, vr9, 1 + vsadd.hu vr4, vr4, vr9 //p3*3 + vsadd.hu vr4, vr4, vr10 + vsadd.hu vr4, vr4, vr10 + vsadd.hu vr4, vr4, vr12 + vsadd.hu vr4, vr4, vr13 + vsadd.hu vr4, vr4, vr14 + + //dst-2 + vsadd.hu vr5, vr4, vr12 + vsadd.hu vr5, vr5, vr15 + vssub.hu vr5, vr5, vr9 + vssub.hu vr5, vr5, vr10 + + //dst-1 + vsadd.hu vr18, vr5, vr13 + vsadd.hu vr18, vr18, vr16 + vssub.hu vr18, vr18, vr9 + vssub.hu vr18, vr18, vr12 + + //dst+0 + vsadd.hu vr7, vr18, vr14 + vsadd.hu vr7, vr7, vr17 + vssub.hu vr7, vr7, vr9 + vssub.hu vr7, vr7, vr13 + + //dst+1 + vsadd.hu vr8, vr7, vr15 + vsadd.hu vr8, vr8, vr17 + vssub.hu vr8, vr8, vr10 + vssub.hu vr8, vr8, vr14 + + //dst+2 + vsadd.hu vr9, vr8, vr16 + vsadd.hu vr9, vr9, vr17 + vssub.hu vr9, vr9, vr12 + vssub.hu vr9, vr9, vr15 + + vsrari.h vr4, vr4, 3 + vsrari.h vr5, vr5, 3 + vsrari.h vr18, vr18, 3 + vsrari.h vr7, vr7, 3 + vsrari.h vr8, vr8, 3 + vsrari.h vr9, vr9, 3 + + //flat8out & flat8in + vbitsel.v vr24, vr4, vr24, vr1 //dst-3 + vbitsel.v vr25, vr5, vr25, vr1 //dst-2 + vbitsel.v vr26, vr18, vr26, vr1 //dst-1 + vbitsel.v vr27, vr7, vr27, vr1 //dst+0 + vbitsel.v vr28, vr8, vr28, vr1 //dst+1 + vbitsel.v vr29, vr9, vr29, vr1 //dst+2 + + //!flat8in + vsub.h vr17, vr12, vr15 //p1-q1 + vsllwil.h.b vr2, vr2, 0 + vand.v vr17, vr17, vr2 //&hev + vssrani.b.h vr17, vr17, 0 + vsllwil.h.b vr17, vr17, 0 + + vsub.h vr7, vr14, vr13 + vsadd.h vr8, vr7, vr7 + vsadd.h vr7, vr7, vr8 + vsadd.h vr7, vr7, vr17 + vssrani.b.h vr7, vr7, 0 + vsllwil.h.b vr17, vr7, 0 //f = iclip_diff(3 * (q0 - p0) + f); + + vaddi.hu vr7, vr17, 4 + vaddi.hu vr8, vr17, 3 + li.w t5, 127 + vreplgr2vr.h vr9, t5 + vmin.h vr7, vr7, vr9 + vmin.h vr8, vr8, vr9 + vsrai.h vr7, vr7, 3 //f1 + vsrai.h vr8, vr8, 3 //f2 + + vsadd.h vr4, vr13, vr8 
//dst-1 + vssub.h vr5, vr14, vr7 //dst+0 + + vsrari.h vr7, vr7, 1 + vsadd.h vr17, vr12, vr7 + vssub.h vr7, vr15, vr7 + vbitsel.v vr17, vr17, vr12, vr2 //dst-2 + vbitsel.v vr7, vr7, vr15, vr2 //dst+1 + + //flat8in or !flat8in + vsllwil.h.b vr3, vr3, 0 + vbitsel.v vr24, vr10, vr24, vr3 //dst-3 + vbitsel.v vr25, vr17, vr25, vr3 //dst-2 + vbitsel.v vr26, vr4, vr26, vr3 //dst-1 + vbitsel.v vr27, vr5, vr27, vr3 //dst+0 + vbitsel.v vr28, vr7, vr28, vr3 //dst+1 + vbitsel.v vr29, vr16, vr29, vr3 //dst+2 + +.ifc \DIR, h + //dst-6,dst-2,dst-5,dst-1 + vssrani.bu.h vr25, vr21, 0 + vssrani.bu.h vr26, vr22, 0 + vpermi.w vr25, vr25, 0xd8 + vpermi.w vr26, vr26, 0xd8 + vilvl.b vr6, vr26, vr25 //65656565 21212121 + + //dst-4,dst+0,dst-3,dst+1 + vssrani.bu.h vr27, vr23, 0 + vssrani.bu.h vr28, vr24, 0 + vpermi.w vr27, vr27, 0xd8 + vpermi.w vr28, vr28, 0xd8 + vilvl.b vr26, vr28, vr27 //43434343 01010101 + + vilvl.h vr21, vr26, vr6 //6543 -- -- -- + vilvh.h vr22, vr26, vr6 //2101 -- -- -- + vilvl.w vr20, vr22, vr21 //65432101 -- + vilvh.w vr22, vr22, vr21 //65432101 -- + vreplvei.d vr21, vr20, 1 + vreplvei.d vr23, vr22, 1 + + //dst+2,dst+4,dst+3,dst+5 + vssrani.bu.h vr31, vr29, 0 + vssrani.bu.h vr11, vr30, 0 + vpermi.w vr31, vr31, 0xd8 + vpermi.w vr11, vr11, 0xd8 + vilvl.b vr11, vr11, vr31 //23232323 45454545 + vshuf4i.w vr11, vr11, 0xd8 + vshuf4i.h vr11, vr11, 0xd8 //2345 -- -- -- + + vextrins.w vr20, vr11, 0x20 + vextrins.w vr21, vr11, 0x21 + vextrins.w vr22, vr11, 0x22 + vextrins.w vr23, vr11, 0x23 + + addi.d t5, a0, -6 + vld vr6, t5, 0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6 + vldx vr7, t5, a1 + add.d t5, t5, a1 + vldx vr8, t5, a1 + add.d t5, t5, a1 + vldx vr9, t5, a1 + + //expand fm to 128 + vreplvei.b vr10, vr0, 0 + vreplvei.b vr11, vr0, 1 + vreplvei.b vr12, vr0, 2 + vreplvei.b vr13, vr0, 3 + + vbitsel.v vr20, vr6, vr20, vr10 + vbitsel.v vr21, vr7, vr21, vr11 + vbitsel.v vr22, vr8, vr22, vr12 + vbitsel.v vr23, vr9, vr23, vr13 + + addi.d t5, a0, -6 + vstelm.d vr20, t5, 0, 0 + vstelm.w vr20, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr21, t5, 0, 0 + vstelm.w vr21, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr22, t5, 0, 0 + vstelm.w vr22, t5, 8, 2 + add.d t5, t5, a1 + vstelm.d vr23, t5, 0, 0 + vstelm.w vr23, t5, 8, 2 +.else + //reload + fld.d f7, sp, 0 + fld.d f8, sp, 8 + fld.d f9, sp, 16 + fld.d f10, sp, 24 + fld.d f12, sp, 32 + fld.d f13, sp, 40 + fld.d f14, sp, 48 + fld.d f15, sp, 56 + fld.d f16, sp, 64 + fld.d f17, sp, 72 + fld.d f18, sp, 80 + fld.d f19, sp, 88 + + vssrarni.bu.h vr21, vr21, 0 + vssrarni.bu.h vr22, vr22, 0 + vssrarni.bu.h vr23, vr23, 0 + vssrarni.bu.h vr24, vr24, 0 + vssrarni.bu.h vr25, vr25, 0 + vssrarni.bu.h vr26, vr26, 0 + vssrarni.bu.h vr27, vr27, 0 + vssrarni.bu.h vr28, vr28, 0 + vssrarni.bu.h vr29, vr29, 0 + vssrarni.bu.h vr30, vr30, 0 + vssrarni.bu.h vr31, vr31, 0 + vssrarni.bu.h vr11, vr11, 0 + + vbitsel.v vr7, vr7, vr21, vr0 //p5 + vbitsel.v vr8, vr8, vr22, vr0 //p4 + vbitsel.v vr9, vr9, vr23, vr0 //p3 + vbitsel.v vr10, vr10, vr24, vr0 //p2 + vbitsel.v vr12, vr12, vr25, vr0 //p1 + vbitsel.v vr13, vr13, vr26, vr0 //p0 + vbitsel.v vr14, vr14, vr27, vr0 //q0 + vbitsel.v vr15, vr15, vr28, vr0 //q1 + vbitsel.v vr16, vr16, vr29, vr0 //q2 + vbitsel.v vr17, vr17, vr30, vr0 //q3 + vbitsel.v vr18, vr18, vr31, vr0 //q4 + vbitsel.v vr19, vr19, vr11, vr0 //q5 + + fst.s f14, a0, 0 + fstx.s f15, a0, a1 + alsl.d t5, a1, a0, 1 + fst.s f16, t5, 0 + fstx.s f17, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f18, t5, 0 + fstx.s f19, t5, a1 + + slli.w t5, a1, 2 + alsl.d t5, a1, t5, 1 + sub.d t5, a0, t5 + fst.s f7, t5, 0 + 
fstx.s f8, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f9, t5, 0 + fstx.s f10, t5, a1 + alsl.d t5, a1, t5, 1 + fst.s f12, t5, 0 + fstx.s f13, t5, a1 +.endif +.END_FILTER_\DIR\()\TYPE\()_W16: +.ifc \DIR, v + addi.d sp, sp, 96 +.endif +.endm + +.macro PUSH_REG + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 +.endm +.macro POP_REG + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +.endm + +.macro LPF_FUNC DIR, TYPE +function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx + PUSH_REG + vld vr0, a2, 0 //vmask + vpickve2gr.wu t0, vr0, 0 + vpickve2gr.wu t1, vr0, 1 + vpickve2gr.wu t2, vr0, 2 + li.w t3, 1 //y + or t0, t0, t1 +.ifc \TYPE, y + or t0, t0, t2 //vm +.endif + addi.w t8, t3, -1 + andn t8, t0, t8 + beqz t0, .\DIR\()\TYPE\()_END +.\DIR\()\TYPE\()_LOOP: + and t4, t0, t3 //vm & y + beqz t4, .\DIR\()\TYPE\()_LOOP_NEXT + vldrepl.b vr1, a3, 0 //l[0][0] +.ifc \DIR, h + addi.d t5, a3, -4 +.else + slli.d t5, a4, 2 + sub.d t5, a3, t5 +.endif + vldrepl.b vr2, t5, 0 //l[-1][0] + vseqi.b vr3, vr1, 0 + vbitsel.v vr1, vr1, vr2, vr3 //L + vpickve2gr.b t5, vr1, 0 + beqz t5, .\DIR\()\TYPE\()_LOOP_NEXT + vsrai.b vr2, vr1, 4 //H + add.d t6, a5, t5 + vldrepl.b vr3, t6, 0 //E + addi.d t6, t6, 64 + vldrepl.b vr4, t6, 0 //I +.ifc \TYPE, y + and t5, t2, t3 + bnez t5, .FILTER_\DIR\()\TYPE\()_16 +.endif + and t5, t1, t3 +.ifc \TYPE, y + bnez t5, .FILTER_\DIR\()\TYPE\()_8 +.else + bnez t5, .FILTER_\DIR\()\TYPE\()_6 +.endif + FILTER_W4 \DIR, \TYPE + b .\DIR\()\TYPE\()_LOOP_NEXT +.ifc \TYPE, uv +.FILTER_\DIR\()\TYPE\()_6: + FILTER_W6 \DIR, \TYPE +.endif +.ifc \TYPE, y +.FILTER_\DIR\()\TYPE\()_8: + FILTER_W8 \DIR, \TYPE + b .\DIR\()\TYPE\()_LOOP_NEXT +.FILTER_\DIR\()\TYPE\()_16: + FILTER_W16 \DIR, \TYPE +.endif +.\DIR\()\TYPE\()_LOOP_NEXT: + slli.w t3, t3, 1 +.ifc \DIR, h + alsl.d a0, a1, a0, 2 + slli.w t8, a4, 2 + add.d a3, a3, t8 +.else + addi.d a0, a0, 4 + addi.d a3, a3, 4 +.endif + addi.w t8, t3, -1 + andn t8, t0, t8 + bnez t8, .\DIR\()\TYPE\()_LOOP +.\DIR\()\TYPE\()_END: + POP_REG +endfunc +.endm + +LPF_FUNC h, y +LPF_FUNC v, y +LPF_FUNC h, uv +LPF_FUNC v, uv diff --git a/third_party/dav1d/src/loongarch/loopfilter.h b/third_party/dav1d/src/loongarch/loopfilter.h new file mode 100644 index 0000000000..844faf0c30 --- /dev/null +++ b/third_party/dav1d/src/loongarch/loopfilter.h @@ -0,0 +1,52 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H +#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H + +#include "src/cpu.h" +#include "src/loopfilter.h" + +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx)); + +static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + +#if BITDEPTH == 8 + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx); +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */ diff --git a/third_party/dav1d/src/loongarch/looprestoration.S b/third_party/dav1d/src/loongarch/looprestoration.S new file mode 100644 index 0000000000..ab512d133c --- /dev/null +++ b/third_party/dav1d/src/loongarch/looprestoration.S @@ -0,0 +1,1407 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/loongson_asm.S" + +#define REST_UNIT_STRIDE (400) + +.macro MADD_HU_BU in0, in1, out0, out1 + vsllwil.hu.bu vr12, \in0, 0 + vexth.hu.bu vr13, \in0 + vmadd.h \out0, vr12, \in1 + vmadd.h \out1, vr13, \in1 +.endm + +const wiener_shuf +.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +endconst + +/* +void wiener_filter_h_lsx(int32_t *hor_ptr, + uint8_t *tmp_ptr, + const int16_t filterh[8], + const int w, const int h) +*/ +function wiener_filter_h_8bpc_lsx + addi.d sp, sp, -40 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + li.w t7, 1<<14 // clip_limit + + la.local t1, wiener_shuf + vld vr4, t1, 0 + vld vr14, a2, 0 // filter[0][k] + vreplvei.h vr21, vr14, 0 + vreplvei.h vr22, vr14, 1 + vreplvei.h vr23, vr14, 2 + vreplvei.h vr24, vr14, 3 + vreplvei.h vr25, vr14, 4 + vreplvei.h vr26, vr14, 5 + vreplvei.h vr27, vr14, 6 + vreplgr2vr.w vr0, t7 + +.WIENER_FILTER_H_H: + addi.w a4, a4, -1 // h + addi.w t0, a3, 0 // w + addi.d t1, a1, 0 // tmp_ptr + addi.d t2, a0, 0 // hor_ptr + +.WIENER_FILTER_H_W: + addi.w t0, t0, -16 + vld vr5, t1, 0 + vld vr13, t1, 16 + + vsubi.bu vr14, vr4, 2 + vsubi.bu vr15, vr4, 1 + vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16 + vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17 + vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18 + vaddi.bu vr14, vr4, 1 + vaddi.bu vr15, vr4, 2 + vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19 + vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20 + vaddi.bu vr14, vr4, 3 + vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21 + + vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10 + vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18 + vsllwil.wu.hu vr17, vr15, 0 // 3 4 5 6 + vexth.wu.hu vr18, vr15 // 7 8 9 10 + vsllwil.wu.hu vr19, vr16, 0 // 11 12 13 14 + vexth.wu.hu vr20, vr16 // 15 16 17 18 + vslli.w vr17, vr17, 7 + vslli.w vr18, vr18, 7 + vslli.w vr19, vr19, 7 + vslli.w vr20, vr20, 7 + vxor.v vr15, vr15, vr15 + vxor.v vr14, vr14, vr14 + + MADD_HU_BU vr5, vr21, vr14, vr15 + MADD_HU_BU vr6, vr22, vr14, vr15 + MADD_HU_BU vr7, vr23, vr14, vr15 + MADD_HU_BU vr8, vr24, vr14, vr15 + MADD_HU_BU vr9, vr25, vr14, vr15 + MADD_HU_BU vr10, vr26, vr14, vr15 + MADD_HU_BU vr11, vr27, vr14, vr15 + + vsllwil.w.h vr5, vr14, 0 // 0 1 2 3 + vexth.w.h vr6, vr14 // 4 5 6 7 + vsllwil.w.h vr7, vr15, 0 // 8 9 10 11 + vexth.w.h vr8, vr15 // 12 13 14 15 + vadd.w vr17, vr17, vr5 + vadd.w vr18, vr18, vr6 + vadd.w vr19, vr19, vr7 + vadd.w vr20, vr20, vr8 + vadd.w vr17, vr17, vr0 + vadd.w vr18, vr18, vr0 + vadd.w vr19, vr19, vr0 + vadd.w vr20, vr20, vr0 + + vsrli.w vr1, vr0, 1 + vsubi.wu vr1, vr1, 1 + vxor.v vr3, vr3, vr3 + vsrari.w vr17, vr17, 3 + vsrari.w vr18, vr18, 3 + vsrari.w vr19, vr19, 3 + vsrari.w vr20, vr20, 3 + vclip.w vr17, vr17, vr3, vr1 + vclip.w vr18, vr18, vr3, vr1 + vclip.w vr19, vr19, vr3, vr1 + vclip.w vr20, vr20, vr3, vr1 + + vst vr17, t2, 0 + vst vr18, t2, 16 + vst vr19, t2, 32 + vst vr20, t2, 48 + addi.d t1, t1, 16 + addi.d t2, t2, 64 + blt zero, t0, .WIENER_FILTER_H_W + + addi.d a1, a1, REST_UNIT_STRIDE + addi.d a0, a0, (REST_UNIT_STRIDE << 2) + bnez a4, .WIENER_FILTER_H_H + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + addi.d sp, sp, 40 +endfunc + +.macro APPLY_FILTER in0, in1, in2 + alsl.d t7, \in0, \in1, 2 + vld vr10, t7, 0 + vld vr11, t7, 16 + vld vr12, t7, 32 + vld vr13, t7, 48 + vmadd.w vr14, vr10, \in2 + vmadd.w vr15, vr11, \in2 + vmadd.w vr16, vr12, \in2 + vmadd.w vr17, vr13, \in2 +.endm + 
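+/*
+ * Reference sketch (not part of the upstream sources): a plain C rendering of
+ * what APPLY_FILTER and the wiener_filter_v core below appear to compute for
+ * 8 bpc, using the argument names from the prototype comment further below
+ * and the REST_UNIT_STRIDE defined above. It assumes <stdint.h>/<stddef.h>;
+ * the function name and loop framing are illustrative only, and only the
+ * first seven filter taps are applied, as in the assembly.
+ *
+ *   static void wiener_filter_v_ref(uint8_t *p, ptrdiff_t stride,
+ *                                   const int32_t *hor,
+ *                                   const int16_t filterv[8],
+ *                                   int w, int h)
+ *   {
+ *       for (int j = 0; j < h; j++)
+ *           for (int i = 0; i < w; i++) {
+ *               int32_t v = -(1 << 18);
+ *               for (int k = 0; k < 7; k++)
+ *                   v += (int32_t)filterv[k] *
+ *                        hor[(j + k) * REST_UNIT_STRIDE + i];
+ *               v = (v + (1 << 10)) >> 11;   // rounding shift by 11 bits
+ *               p[j * stride + i] = v < 0 ? 0 : v > 255 ? 255 : v;
+ *           }
+ *   }
+ *
+ * The vectorized core evaluates 16 columns per iteration and folds the shift
+ * and clamp into the saturating narrowing instructions (vssrarni/vssrlni).
+ */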
+.macro wiener_filter_v_8bpc_core_lsx + vreplgr2vr.w vr14, t6 + vreplgr2vr.w vr15, t6 + vreplgr2vr.w vr16, t6 + vreplgr2vr.w vr17, t6 + + addi.w t7, t2, 0 // j + index k + mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE + add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i + + APPLY_FILTER t7, a2, vr2 + APPLY_FILTER t8, t7, vr3 + APPLY_FILTER t8, t7, vr4 + APPLY_FILTER t8, t7, vr5 + APPLY_FILTER t8, t7, vr6 + APPLY_FILTER t8, t7, vr7 + APPLY_FILTER t8, t7, vr8 + vssrarni.hu.w vr15, vr14, 11 + vssrarni.hu.w vr17, vr16, 11 + vssrlni.bu.h vr17, vr15, 0 +.endm + +/* +void wiener_filter_v_lsx(uint8_t *p, + const ptrdiff_t p_stride, + const int32_t *hor, + const int16_t filterv[8], + const int w, const int h) +*/ +function wiener_filter_v_8bpc_lsx + li.w t6, -(1 << 18) + + li.w t8, REST_UNIT_STRIDE + ld.h t0, a3, 0 + ld.h t1, a3, 2 + vreplgr2vr.w vr2, t0 + vreplgr2vr.w vr3, t1 + ld.h t0, a3, 4 + ld.h t1, a3, 6 + vreplgr2vr.w vr4, t0 + vreplgr2vr.w vr5, t1 + ld.h t0, a3, 8 + ld.h t1, a3, 10 + vreplgr2vr.w vr6, t0 + vreplgr2vr.w vr7, t1 + ld.h t0, a3, 12 + vreplgr2vr.w vr8, t0 + + andi t1, a4, 0xf + sub.w t0, a4, t1 // w-w%16 + or t2, zero, zero // j + or t4, zero, zero + beqz t0, .WIENER_FILTER_V_W_LT16 + +.WIENER_FILTER_V_H: + andi t1, a4, 0xf + add.d t3, zero, a0 // p + or t4, zero, zero // i + +.WIENER_FILTER_V_W: + + wiener_filter_v_8bpc_core_lsx + + mul.w t5, t2, a1 // j * stride + add.w t5, t5, t4 // j * stride + i + add.d t3, a0, t5 + addi.w t4, t4, 16 + vst vr17, t3, 0 + bne t0, t4, .WIENER_FILTER_V_W + + beqz t1, .WIENER_FILTER_V_W_EQ16 + + wiener_filter_v_8bpc_core_lsx + + addi.d t3, t3, 16 + andi t1, a4, 0xf + +.WIENER_FILTER_V_ST_REM: + vstelm.b vr17, t3, 0, 0 + vbsrl.v vr17, vr17, 1 + addi.d t3, t3, 1 + addi.w t1, t1, -1 + bnez t1, .WIENER_FILTER_V_ST_REM +.WIENER_FILTER_V_W_EQ16: + addi.w t2, t2, 1 + blt t2, a5, .WIENER_FILTER_V_H + b .WIENER_FILTER_V_END + +.WIENER_FILTER_V_W_LT16: + andi t1, a4, 0xf + add.d t3, zero, a0 + + wiener_filter_v_8bpc_core_lsx + + mul.w t5, t2, a1 // j * stride + add.d t3, a0, t5 + +.WIENER_FILTER_V_ST_REM_1: + vstelm.b vr17, t3, 0, 0 + vbsrl.v vr17, vr17, 1 + addi.d t3, t3, 1 + addi.w t1, t1, -1 + bnez t1, .WIENER_FILTER_V_ST_REM_1 + + addi.w t2, t2, 1 + blt t2, a5, .WIENER_FILTER_V_W_LT16 + +.WIENER_FILTER_V_END: +endfunc + +/* +void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src, + const int w, const int h) +*/ +function boxsum3_h_8bpc_lsx + addi.d a2, a2, REST_UNIT_STRIDE + li.w t0, 1 + addi.w a3, a3, -2 + addi.w a4, a4, -4 + +.LBS3_H_H: + alsl.d t1, t0, a1, 1 // sum_v *sum_v = sum + x + alsl.d t2, t0, a0, 2 // sumsq_v *sumsq_v = sumsq + x + add.d t3, t0, a2 // s + addi.w t5, a3, 0 +.LBS3_H_W: + vld vr0, t3, 0 + vld vr1, t3, REST_UNIT_STRIDE + vld vr2, t3, (REST_UNIT_STRIDE<<1) + + vilvl.b vr3, vr1, vr0 + vhaddw.hu.bu vr4, vr3, vr3 + vilvh.b vr5, vr1, vr0 + vhaddw.hu.bu vr6, vr5, vr5 + vsllwil.hu.bu vr7, vr2, 0 + vexth.hu.bu vr8, vr2 + // sum_v + vadd.h vr4, vr4, vr7 + vadd.h vr6, vr6, vr8 + vst vr4, t1, REST_UNIT_STRIDE<<1 + vst vr6, t1, (REST_UNIT_STRIDE<<1)+16 + addi.d t1, t1, 32 + // sumsq + vmulwev.h.bu vr9, vr3, vr3 + vmulwod.h.bu vr10, vr3, vr3 + vmulwev.h.bu vr11, vr5, vr5 + vmulwod.h.bu vr12, vr5, vr5 + vmul.h vr7, vr7, vr7 + vmul.h vr8, vr8, vr8 + vaddwev.w.hu vr13, vr10, vr9 + vaddwod.w.hu vr14, vr10, vr9 + vilvl.w vr3, vr14, vr13 + vilvh.w vr4, vr14, vr13 + vaddwev.w.hu vr13, vr12, vr11 + vaddwod.w.hu vr14, vr12, vr11 + vilvl.w vr15, vr14, vr13 + vilvh.w vr16, vr14, vr13 + vsllwil.wu.hu vr9, vr7, 0 + vexth.wu.hu vr10, vr7 + 
vsllwil.wu.hu vr11, vr8, 0 + vexth.wu.hu vr12, vr8 + vadd.w vr9, vr9, vr3 + vadd.w vr10, vr10, vr4 + vadd.w vr11, vr11, vr15 + vadd.w vr12, vr12, vr16 + vst vr9, t2, REST_UNIT_STRIDE<<2 + vst vr10, t2, (REST_UNIT_STRIDE<<2)+16 + vst vr11, t2, (REST_UNIT_STRIDE<<2)+32 + vst vr12, t2, (REST_UNIT_STRIDE<<2)+48 + addi.d t2, t2, 64 + + addi.w t5, t5, -16 + addi.d t3, t3, 16 + blt zero, t5, .LBS3_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a2, a2, REST_UNIT_STRIDE + addi.d a4, a4, -1 + blt zero, a4, .LBS3_H_H + +.LBS3_H_END: +endfunc + +/* +void boxsum3_v(int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum3_v_8bpc_lsx + addi.d a0, a0, (REST_UNIT_STRIDE<<2) + addi.d a1, a1, (REST_UNIT_STRIDE<<1) + addi.w a3, a3, -4 + addi.w a2, a2, -4 + +.LBS3_V_H: + sub.w t3, a2, zero + addi.d t0, a0, 4 + addi.d t1, a1, 2 + addi.d t5, a0, 8 + addi.d t6, a1, 4 + + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9 + vld vr3, t0, 0 // a2 0 1 2 3 + vld vr4, t0, 4 // b2 1 2 3 4 + vld vr5, t0, 8 // c2 2 3 4 5 + vld vr6, t0, 16 // 3 4 5 6 + vld vr7, t0, 20 // 4 5 6 7 + vld vr8, t0, 24 // 5 6 7 8 + vadd.h vr9, vr0, vr1 + vadd.h vr9, vr9, vr2 + vadd.w vr10, vr3, vr4 + vadd.w vr10, vr10, vr5 + vadd.w vr11, vr6, vr7 + vadd.w vr11, vr11, vr8 + vpickve2gr.h t7, vr2, 6 + vpickve2gr.w t8, vr8, 2 + vst vr9, t6, 0 + vst vr10, t5, 0 + vst vr11, t5, 16 + + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 16 + addi.d t3, t3, -8 + ble t3, zero, .LBS3_V_H0 + +.LBS3_V_W8: + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9 + vld vr3, t0, 0 // a2 0 1 2 3 + vld vr4, t0, 4 // b2 1 2 3 4 + vld vr5, t0, 8 // c2 2 3 4 5 + vld vr6, t0, 16 // 3 4 5 6 + vld vr7, t0, 20 // 4 5 6 7 + vld vr8, t0, 24 // 5 6 7 8 + vinsgr2vr.h vr0, t7, 0 + vinsgr2vr.w vr3, t8, 0 + vpickve2gr.h t7, vr2, 6 + vpickve2gr.w t8, vr8, 2 + vadd.h vr9, vr0, vr1 + vadd.w vr10, vr3, vr4 + vadd.w vr11, vr6, vr7 + vadd.h vr9, vr9, vr2 + vadd.w vr10, vr10, vr5 + vadd.w vr11, vr11, vr8 + vst vr9, t6, 0 + vst vr10, t5, 0 + vst vr11, t5, 16 + addi.d t3, t3, -8 + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 16 + blt zero, t3, .LBS3_V_W8 + +.LBS3_V_H0: + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.w a3, a3, -1 + bnez a3, .LBS3_V_H + +.LBS3_V_END: +endfunc + +/* +boxsum3_selfguided_filter(int32_t *sumsq, coef *sum, + const int w, const int h, + const unsigned s) +*/ +function boxsum3_sgf_h_8bpc_lsx + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a0, a0, 12 // AA + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a1, a1, 6 // BB + la.local t8, dav1d_sgr_x_by_x + li.w t6, 455 + vreplgr2vr.w vr20, t6 + li.w t6, 255 + vreplgr2vr.w vr22, t6 + vaddi.wu vr21, vr22, 1 // 256 + vreplgr2vr.w vr6, a4 + vldi vr19, 0x809 + addi.w a2, a2, 2 // w + 2 + addi.w a3, a3, 2 // h + 2 + +.LBS3SGF_H_H: + addi.w t2, a2, 0 + addi.d t0, a0, -4 + addi.d t1, a1, -2 + +.LBS3SGF_H_W: + addi.w t2, t2, -8 + vld vr0, t0, 0 // AA[i] + vld vr1, t0, 16 + vld vr2, t1, 0 // BB[i] + + vmul.w vr4, vr0, vr19 // a * n + vmul.w vr5, vr1, vr19 // a * n + vsllwil.w.h vr9, vr2, 0 + vexth.w.h vr10, vr2 + vmsub.w vr4, vr9, vr9 // p + vmsub.w vr5, vr10, vr10 // p + vmaxi.w vr4, vr4, 0 + vmaxi.w vr5, vr5, 0 // p + vmul.w vr4, vr4, vr6 // p * s + vmul.w vr5, vr5, vr6 // p * s + vsrlri.w vr4, vr4, 20 + vsrlri.w vr5, vr5, 20 // z + vmin.w vr4, vr4, vr22 + vmin.w 
vr5, vr5, vr22 + + vpickve2gr.w t6, vr4, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 0 + vpickve2gr.w t6, vr4, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 1 + vpickve2gr.w t6, vr4, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 2 + vpickve2gr.w t6, vr4, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 3 + + vpickve2gr.w t6, vr5, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 0 + vpickve2gr.w t6, vr5, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 1 + vpickve2gr.w t6, vr5, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 2 + vpickve2gr.w t6, vr5, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 3 // x + + vmul.w vr9, vr7, vr9 // x * BB[i] + vmul.w vr10, vr8, vr10 + vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x + vmul.w vr10, vr10, vr20 + vsrlri.w vr9, vr9, 12 + vsrlri.w vr10, vr10, 12 + vsub.w vr7, vr21, vr7 + vsub.w vr8, vr21, vr8 + vpickev.h vr8, vr8, vr7 + + vst vr9, t0, 0 + vst vr10, t0, 16 + vst vr8, t1, 0 + addi.d t0, t0, 32 + addi.d t1, t1, 16 + blt zero, t2, .LBS3SGF_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.w a3, a3, -1 + bnez a3, .LBS3SGF_H_H +endfunc + +/* +boxsum3_selfguided_filter(coef *dst, pixel *src, + int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum3_sgf_v_8bpc_lsx + addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src + addi.d a2, a2, REST_UNIT_STRIDE<<2 + addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12 + addi.d a3, a3, REST_UNIT_STRIDE<<2 + addi.d a3, a3, 6 +.LBS3SGF_V_H: + // A int32_t *sumsq + addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride + addi.d t1, a2, 0 // sumsq + addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride + addi.d t6, a1, 0 + addi.w t7, a4, 0 + addi.d t8, a0, 0 + // B coef *sum + addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride + addi.d t4, a3, 0 + addi.d t5, a3, REST_UNIT_STRIDE<<1 + +.LBS3SGF_V_W: + vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE] + vld vr1, t0, 16 + vld vr2, t1, -4 // P[i-1] + vld vr3, t1, 12 + vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE] + vld vr5, t2, 16 + vld vr6, t1, 0 // p[i] + vld vr7, t1, 16 + vld vr8, t1, 4 // p[i+1] + vld vr9, t1, 20 + + vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE] + vld vr11, t0, 12 + vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE] + vld vr13, t2, 12 + vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE] + vld vr15, t0, 20 + vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE] + vld vr17, t2, 20 + + vadd.w vr0, vr2, vr0 + vadd.w vr4, vr6, vr4 + vadd.w vr0, vr0, vr8 + vadd.w vr20, vr0, vr4 + vslli.w vr20, vr20, 2 // 0 1 2 3 + vadd.w vr0, vr1, vr3 + vadd.w vr4, vr5, vr7 + vadd.w vr0, vr0, vr9 + vadd.w vr21, vr0, vr4 + vslli.w vr21, vr21, 2 // 4 5 6 7 + vadd.w vr12, vr10, vr12 + vadd.w vr16, vr14, vr16 + vadd.w vr22, vr12, vr16 + vslli.w vr23, vr22, 1 + vadd.w vr22, vr23, vr22 + vadd.w vr11, vr11, vr13 + vadd.w vr15, vr15, vr17 + vadd.w vr0, vr11, vr15 + vslli.w vr23, vr0, 1 + vadd.w vr23, vr23, vr0 + vadd.w vr20, vr20, vr22 // b + vadd.w vr21, vr21, vr23 + + // B coef *sum + vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE] + vld vr1, t4, -2 // p[i - 1] + vld vr2, t4, 0 // p[i] + vld vr3, t4, 2 // p[i + 1] + vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE] + vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE] + vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE] + vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE] + vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE] + vaddwev.w.h vr9, vr0, vr1 + vaddwod.w.h vr10, vr0, vr1 + vaddwev.w.h vr11, vr2, vr3 + vaddwod.w.h vr12, vr2, vr3 + vadd.w vr9, vr11, vr9 + vadd.w vr10, vr12, vr10 + vilvl.w vr11, vr10, vr9 // 0 1 2 3 + vilvh.w vr12, vr10, vr9 // 4 5 6 7 + vsllwil.w.h vr0, vr4, 
0 + vexth.w.h vr1, vr4 + vadd.w vr0, vr11, vr0 + vadd.w vr1, vr12, vr1 + vslli.w vr0, vr0, 2 + vslli.w vr1, vr1, 2 + vaddwev.w.h vr9, vr5, vr6 + vaddwod.w.h vr10, vr5, vr6 + vaddwev.w.h vr11, vr7, vr8 + vaddwod.w.h vr12, vr7, vr8 + vadd.w vr9, vr11, vr9 + vadd.w vr10, vr12, vr10 + vilvl.w vr13, vr10, vr9 + vilvh.w vr14, vr10, vr9 + vslli.w vr15, vr13, 1 + vslli.w vr16, vr14, 1 + vadd.w vr15, vr13, vr15 // a + vadd.w vr16, vr14, vr16 + vadd.w vr22, vr0, vr15 + vadd.w vr23, vr1, vr16 + vld vr0, t6, 0 // src + vsllwil.hu.bu vr0, vr0, 0 + vsllwil.wu.hu vr1, vr0, 0 + vexth.wu.hu vr2, vr0 + vmadd.w vr20, vr22, vr1 + vmadd.w vr21, vr23, vr2 + vssrlrni.h.w vr21, vr20, 9 + vst vr21, t8, 0 + addi.d t8, t8, 16 + + addi.d t0, t0, 32 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t3, t3, 16 + addi.d t4, t4, 16 + addi.d t5, t5, 16 + addi.d t6, t6, 8 + addi.w t7, t7, -8 + blt zero, t7, .LBS3SGF_V_W + + addi.w a5, a5, -1 + addi.d a0, a0, 384*2 + addi.d a1, a1, REST_UNIT_STRIDE + addi.d a3, a3, REST_UNIT_STRIDE<<1 + addi.d a2, a2, REST_UNIT_STRIDE<<2 + bnez a5, .LBS3SGF_V_H +endfunc + +#define FILTER_OUT_STRIDE (384) + +/* +sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride, + const int16_t *dst, const int w1; + const int w, const int h); +*/ +function sgr_3x3_finish_8bpc_lsx + vreplgr2vr.w vr3, a3 // w1 + andi t4, a4, 0x7 + sub.w t5, a4, t4 + + beq zero, t5, .LSGR3X3_REM + +.LSGR3X3_H: + addi.d t0, a0, 0 + addi.d t1, a2, 0 + addi.w t2, t5, 0 + andi t4, a4, 0x7 +.LSGR3X3_W: + vld vr0, t0, 0 + vld vr1, t1, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + vstelm.d vr7, t0, 0, 0 + addi.d t0, t0, 8 + addi.d t1, t1, 16 + addi.d t2, t2, -8 + bne zero, t2, .LSGR3X3_W + + beq t4, zero, .LSGR3X3_NOREM + + vld vr0, t0, 0 + vld vr1, t1, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + +.LSGR3X3_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGR3X3_ST + +.LSGR3X3_NOREM: + addi.w a5, a5, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + bnez a5, .LSGR3X3_H + b .LSGR3X3_END + +.LSGR3X3_REM: + andi t4, a4, 0x7 + addi.d t0, a0, 0 + vld vr0, t0, 0 + vld vr1, a2, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + +.LSGR3X3_REM_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGR3X3_REM_ST + addi.w a5, a5, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + bnez a5, .LSGR3X3_REM + +.LSGR3X3_END: +endfunc + +/* +void boxsum5(int32_t *sumsq, coef *sum, + const pixel *const src, + const int w, const int h) +*/ 
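+/* For reference, an illustrative C sketch of the scalar work vectorized
+ * below (not the exact dav1d C code; the start-row offsets and the h - 4
+ * adjustment handled in the prologue are glossed over): for every output
+ * position, five vertically adjacent source pixels are accumulated into
+ * sum[] and their squares into sumsq[], 16 pixels per LSX iteration.
+ *
+ *   for (int y = 0; y < h; y++)
+ *       for (int x = 0; x < w; x++) {
+ *           const uint8_t *s = src + y * REST_UNIT_STRIDE + x;
+ *           const int a = s[0];
+ *           const int b = s[1 * REST_UNIT_STRIDE];
+ *           const int c = s[2 * REST_UNIT_STRIDE];
+ *           const int d = s[3 * REST_UNIT_STRIDE];
+ *           const int e = s[4 * REST_UNIT_STRIDE];
+ *           sum  [y * REST_UNIT_STRIDE + x] = a + b + c + d + e;
+ *           sumsq[y * REST_UNIT_STRIDE + x] = a*a + b*b + c*c + d*d + e*e;
+ *       }
+ */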
+function boxsum5_h_8bpc_lsx + addi.w a4, a4, -4 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + li.w t6, 1 +.LBOXSUM5_H_H: + addi.w t3, a3, 0 + addi.d t2, a2, 0 + addi.d t0, a0, 0 + addi.d t1, a1, 0 + +.LBOXSUM5_H_W: + vld vr0, t2, 0 // a + vld vr1, t2, REST_UNIT_STRIDE // b + vld vr2, t2, REST_UNIT_STRIDE<<1 // c + vld vr3, t2, REST_UNIT_STRIDE*3 // d + vld vr4, t2, REST_UNIT_STRIDE<<2 // e + + vilvl.b vr5, vr1, vr0 + vilvh.b vr6, vr1, vr0 + vilvl.b vr7, vr3, vr2 + vilvh.b vr8, vr3, vr2 + //sum_v + vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7 + vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b + vhaddw.hu.bu vr11, vr7, vr7 + vhaddw.hu.bu vr12, vr8, vr8 + vadd.h vr9, vr9, vr11 + vadd.h vr10, vr10, vr12 // a + b + c + d + vsllwil.hu.bu vr11, vr4, 0 + vexth.hu.bu vr12, vr4 + vadd.h vr9, vr9, vr11 + vadd.h vr10, vr10, vr12 + vst vr9, t1, 0 + vst vr10, t1, 16 + addi.d t1, t1, 32 + + // sumsq + vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7 + vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15 + vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7 + vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15 + vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7 + vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15 + vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7 + vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15 + vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6 + vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7 + vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14 + vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b + vaddwev.w.hu vr19, vr15, vr17 // 0 2 4 6 + vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7 + vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14 + vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d + vadd.w vr5, vr5, vr19 + vadd.w vr6, vr6, vr20 + vadd.w vr7, vr7, vr21 + vadd.w vr8, vr8, vr22 + vilvl.w vr19, vr6, vr5 + vilvh.w vr20, vr6, vr5 + vilvl.w vr21, vr8, vr7 + vilvh.w vr22, vr8, vr7 + vmul.h vr11, vr11, vr11 + vmul.h vr12, vr12, vr12 + vsllwil.wu.hu vr0, vr11, 0 + vexth.wu.hu vr1, vr11 + vsllwil.wu.hu vr2, vr12, 0 + vexth.wu.hu vr3, vr12 + vadd.w vr19, vr19, vr0 + vadd.w vr20, vr20, vr1 + vadd.w vr21, vr21, vr2 + vadd.w vr22, vr22, vr3 + vst vr19, t0, 0 + vst vr20, t0, 16 + vst vr21, t0, 32 + vst vr22, t0, 48 + addi.d t0, t0, 64 + addi.d t2, t2, 16 + addi.w t3, t3, -16 + blt zero, t3, .LBOXSUM5_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a2, a2, REST_UNIT_STRIDE + addi.d a4, a4, -1 + bnez a4, .LBOXSUM5_H_H +endfunc + +/* +void boxsum5_h(int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum5_v_8bpc_lsx + addi.d a0, a0, (REST_UNIT_STRIDE<<2) + addi.d a1, a1, (REST_UNIT_STRIDE<<1) + addi.w a3, a3, -4 + addi.w a2, a2, -4 + +.LBOXSUM5_V_H: + addi.w t3, a2, 0 + addi.d t0, a0, 0 + addi.d t1, a1, 0 + addi.d t2, a0, 8 + addi.d t3, a1, 4 + addi.d t4, a2, 0 + + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 + vld vr3, t1, 6 // d 3 + vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11 + vadd.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vpickve2gr.w t5, vr4, 2 + vadd.h vr5, vr5, vr6 + vadd.h vr5, vr5, vr4 + vst vr5, t3, 0 + + vld vr0, t0, 0 // 0 1 2 3 a + vld vr1, t0, 4 // 1 2 3 4 b + vld vr2, t0, 8 // 2 3 4 5 c + vld vr3, t0, 12 // 3 4 5 6 d + vld vr4, t0, 16 // 4 5 6 7 e a + vld vr5, t0, 20 // 5 6 7 8 b + vld vr6, t0, 24 // 6 7 8 9 c + vld vr7, t0, 28 // 7 8 9 10 d + vld vr8, t0, 32 // 8 9 10 11 e + + vadd.w vr9, vr0, vr1 + vadd.w vr10, vr2, vr3 + vadd.w vr9, vr9, vr10 + 
vadd.w vr9, vr9, vr4 + vadd.w vr10, vr4, vr5 + vadd.w vr11, vr6, vr7 + vadd.w vr10, vr10, vr8 + vadd.w vr10, vr10, vr11 + vst vr9, t2, 0 + vst vr10, t2, 16 + + addi.d t3, t3, 16 + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t2, t2, 32 + addi.w t4, t4, -8 + ble t4, zero, .LBOXSUM5_V_H1 + +.LBOXSUM5_V_W: + vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7 + vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8 + vld vr2, t1, 4 // c 2 + vld vr3, t1, 6 // d 3 + vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11 + vinsgr2vr.w vr0, t5, 0 + vpickve2gr.w t5, vr4, 2 + vextrins.h vr1, vr0, 0x01 + vadd.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vadd.h vr5, vr5, vr6 + vadd.h vr5, vr5, vr4 + vst vr5, t3, 0 + + vaddi.hu vr0, vr8, 0 // 8 9 10 11 a + vld vr1, t0, 4 // 9 10 11 12 b + vld vr2, t0, 8 // 10 11 12 13 c + vld vr3, t0, 12 // 14 15 16 17 d + vld vr4, t0, 16 // 15 16 17 18 e a + vld vr5, t0, 20 // 16 17 18 19 b + vld vr6, t0, 24 // 17 18 19 20 c + vld vr7, t0, 28 // 18 19 20 21 d + vld vr8, t0, 32 // 19 20 21 22 e + vextrins.w vr1, vr0, 0x01 + vadd.w vr9, vr0, vr1 + vadd.w vr10, vr2, vr3 + vadd.w vr9, vr9, vr10 + vadd.w vr9, vr9, vr4 + vadd.w vr10, vr4, vr5 + vadd.w vr11, vr6, vr7 + vadd.w vr10, vr10, vr8 + vadd.w vr10, vr10, vr11 + vst vr9, t2, 0 + vst vr10, t2, 16 + + addi.d t3, t3, 16 + addi.d t1, t1, 16 + addi.d t0, t0, 32 + addi.d t2, t2, 32 + addi.w t4, t4, -8 + blt zero, t4, .LBOXSUM5_V_W + +.LBOXSUM5_V_H1: + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.w a3, a3, -1 + bnez a3, .LBOXSUM5_V_H +endfunc + +/* +selfguided_filter(int32_t *sumsq, coef *sum, + const int w, const int h, + const unsigned s) +*/ +function boxsum5_sgf_h_8bpc_lsx + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a0, a0, 12 // AA + addi.d a1, a1, REST_UNIT_STRIDE<<1 + addi.d a1, a1, 6 // BB + la.local t8, dav1d_sgr_x_by_x + li.w t6, 164 + vreplgr2vr.w vr20, t6 + li.w t6, 255 + vreplgr2vr.w vr22, t6 + vaddi.wu vr21, vr22, 1 // 256 + vreplgr2vr.w vr6, a4 + vldi vr19, 0x819 + addi.w a2, a2, 2 // w + 2 + addi.w a3, a3, 2 // h + 2 + +.LBS5SGF_H_H: + addi.w t2, a2, 0 + addi.d t0, a0, -4 + addi.d t1, a1, -2 + +.LBS5SGF_H_W: + vld vr0, t0, 0 // AA[i] + vld vr1, t0, 16 + vld vr2, t1, 0 // BB[i] + + vmul.w vr4, vr0, vr19 // a * n + vmul.w vr5, vr1, vr19 // a * n + vsllwil.w.h vr9, vr2, 0 + vexth.w.h vr10, vr2 + vmsub.w vr4, vr9, vr9 // p + vmsub.w vr5, vr10, vr10 // p + vmaxi.w vr4, vr4, 0 + vmaxi.w vr5, vr5, 0 // p + vmul.w vr4, vr4, vr6 // p * s + vmul.w vr5, vr5, vr6 // p * s + vsrlri.w vr4, vr4, 20 + vsrlri.w vr5, vr5, 20 // z + vmin.w vr4, vr4, vr22 + vmin.w vr5, vr5, vr22 + + // load table data + vpickve2gr.w t6, vr4, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 0 + vpickve2gr.w t6, vr4, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 1 + vpickve2gr.w t6, vr4, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 2 + vpickve2gr.w t6, vr4, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr7, t7, 3 + + vpickve2gr.w t6, vr5, 0 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 0 + vpickve2gr.w t6, vr5, 1 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 1 + vpickve2gr.w t6, vr5, 2 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 2 + vpickve2gr.w t6, vr5, 3 + ldx.bu t7, t8, t6 + vinsgr2vr.w vr8, t7, 3 // x + + vmul.w vr9, vr7, vr9 // x * BB[i] + vmul.w vr10, vr8, vr10 + vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x + vmul.w vr10, vr10, vr20 + vsrlri.w vr9, vr9, 12 + vsrlri.w vr10, vr10, 12 + vsub.w vr7, vr21, vr7 + vsub.w vr8, vr21, vr8 + vpickev.h vr8, vr8, vr7 + vst vr9, t0, 0 + vst vr10, t0, 16 + vst vr8, t1, 0 + addi.d t0, t0, 32 + addi.d t1, t1, 16 + addi.w t2, t2, -8 + blt 
zero, t2, .LBS5SGF_H_W + + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a0, a0, REST_UNIT_STRIDE<<2 + addi.d a1, a1, REST_UNIT_STRIDE<<2 + addi.w a3, a3, -2 + blt zero, a3, .LBS5SGF_H_H +endfunc + +/* +selfguided_filter(coef *dst, pixel *src, + int32_t *sumsq, coef *sum, + const int w, const int h) +*/ +function boxsum5_sgf_v_8bpc_lsx + addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src + addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A + addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 + addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B + addi.w a5, a5, -1 + vldi vr10, 0x806 + vldi vr11, 0x805 + vldi vr22, 0x406 + +.LBS5SGF_V_H: + addi.d t0, a0, 0 + addi.d t1, a1, 0 + addi.d t2, a2, 0 + addi.d t3, a3, 0 + addi.w t4, a4, 0 + + addi.d t5, a0, 384*2 + addi.d t6, a1, REST_UNIT_STRIDE + addi.d t7, a2, REST_UNIT_STRIDE<<2 + addi.d t8, a3, REST_UNIT_STRIDE<<1 // B +.LBS5SGF_V_W: + // a + vld vr0, t3, -REST_UNIT_STRIDE*2 + vld vr1, t3, REST_UNIT_STRIDE*2 + vld vr2, t3, (-REST_UNIT_STRIDE-1)*2 + vld vr3, t3, (REST_UNIT_STRIDE-1)*2 + vld vr4, t3, (1-REST_UNIT_STRIDE)*2 + vld vr5, t3, (1+REST_UNIT_STRIDE)*2 + vaddwev.w.h vr6, vr0, vr1 + vaddwod.w.h vr7, vr0, vr1 + vmul.w vr6, vr6, vr10 + vmul.w vr7, vr7, vr10 + vaddwev.w.h vr8, vr2, vr3 + vaddwod.w.h vr9, vr2, vr3 + vaddwev.w.h vr12, vr4, vr5 + vaddwod.w.h vr13, vr4, vr5 + vadd.w vr8, vr8, vr12 + vadd.w vr9, vr9, vr13 + vmadd.w vr6, vr8, vr11 + vmadd.w vr7, vr9, vr11 + vilvl.w vr18, vr7, vr6 + vilvh.w vr19, vr7, vr6 + // b + vld vr0, t2, -REST_UNIT_STRIDE*4 + vld vr1, t2, -REST_UNIT_STRIDE*4+16 + vld vr2, t2, REST_UNIT_STRIDE*4 + vld vr3, t2, REST_UNIT_STRIDE*4+16 + vld vr4, t2, (-REST_UNIT_STRIDE-1)*4 + vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16 + vld vr8, t2, (REST_UNIT_STRIDE-1)*4 + vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16 + vld vr12, t2, (1-REST_UNIT_STRIDE)*4 + vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16 + vld vr14, t2, (1+REST_UNIT_STRIDE)*4 + vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16 + vadd.w vr0, vr0, vr2 // 0 1 2 3 + vadd.w vr1, vr1, vr3 // 4 5 6 7 + vmul.w vr20, vr0, vr10 + vmul.w vr21, vr1, vr10 + vadd.w vr4, vr4, vr8 // 0 1 2 3 + vadd.w vr5, vr5, vr9 // 4 5 6 7 + vadd.w vr12, vr12, vr14 + vadd.w vr13, vr13, vr15 + vadd.w vr12, vr12, vr4 + vadd.w vr13, vr13, vr5 + vmadd.w vr20, vr12, vr11 + vmadd.w vr21, vr13, vr11 + vld vr2, t1, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.wu.hu vr3, vr2, 0 + vexth.wu.hu vr4, vr2 + vmadd.w vr20, vr18, vr3 + vmadd.w vr21, vr19, vr4 + vssrlrni.h.w vr21, vr20, 9 + vst vr21, t0, 0 + + addi.d t1, t1, 8 + addi.d t2, t2, 32 + addi.d t3, t3, 16 + + // a + vld vr0, t8, 0 + vld vr1, t8, -2 + vld vr2, t8, 2 + vmulwev.w.h vr3, vr0, vr22 + vmulwod.w.h vr4, vr0, vr22 + vaddwev.w.h vr5, vr1, vr2 + vaddwod.w.h vr6, vr1, vr2 + vmadd.w vr3, vr5, vr11 + vmadd.w vr4, vr6, vr11 + vilvl.w vr19, vr4, vr3 + vilvh.w vr20, vr4, vr3 + // b + vld vr0, t7, 0 + vld vr1, t7, -4 + vld vr2, t7, 4 + vld vr5, t7, 16 + vld vr6, t7, 12 + vld vr7, t7, 20 + vmul.w vr8, vr0, vr10 + vmul.w vr9, vr5, vr10 + vadd.w vr12, vr1, vr2 + vadd.w vr13, vr6, vr7 + vmadd.w vr8, vr12, vr11 + vmadd.w vr9, vr13, vr11 + vld vr2, t6, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.wu.hu vr3, vr2, 0 + vexth.wu.hu vr4, vr2 + vmadd.w vr8, vr19, vr3 + vmadd.w vr9, vr20, vr4 + vssrlrni.h.w vr9, vr8, 8 + vst vr9, t0, 384*2 + + addi.d t0, t0, 16 + addi.d t8, t8, 16 + addi.d t7, t7, 32 + addi.d t6, t6, 8 + addi.w t4, t4, -8 + blt zero, t4, .LBS5SGF_V_W + + addi.w a5, a5, -2 + addi.d a0, a0, 384*4 // dst + addi.d a1, a1, REST_UNIT_STRIDE<<1 // src + addi.d a2, a2, REST_UNIT_STRIDE<<2 // + addi.d a2, a2, 
REST_UNIT_STRIDE<<2 + addi.d a3, a3, REST_UNIT_STRIDE<<2 // + blt zero, a5, .LBS5SGF_V_H + bnez a5, .LBS5SGF_END +.LBS5SGF_V_W1: + // a + vld vr0, a3, -REST_UNIT_STRIDE*2 + vld vr1, a3, REST_UNIT_STRIDE*2 + vld vr2, a3, (-REST_UNIT_STRIDE-1)*2 + vld vr3, a3, (REST_UNIT_STRIDE-1)*2 + vld vr4, a3, (1-REST_UNIT_STRIDE)*2 + vld vr5, a3, (1+REST_UNIT_STRIDE)*2 + vaddwev.w.h vr6, vr0, vr1 + vaddwod.w.h vr7, vr0, vr1 + vmul.w vr6, vr6, vr10 + vmul.w vr7, vr7, vr10 + vaddwev.w.h vr8, vr2, vr3 + vaddwod.w.h vr9, vr2, vr3 + vaddwev.w.h vr12, vr4, vr5 + vaddwod.w.h vr13, vr4, vr5 + vadd.w vr8, vr8, vr12 + vadd.w vr9, vr9, vr13 + vmadd.w vr6, vr8, vr11 + vmadd.w vr7, vr9, vr11 + vilvl.w vr18, vr7, vr6 + vilvh.w vr19, vr7, vr6 + // b + vld vr0, a2, -REST_UNIT_STRIDE*4 + vld vr1, a2, -REST_UNIT_STRIDE*4+16 + vld vr2, a2, REST_UNIT_STRIDE*4 + vld vr3, a2, REST_UNIT_STRIDE*4+16 + vld vr4, a2, (-REST_UNIT_STRIDE-1)*4 + vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16 + vld vr8, a2, (REST_UNIT_STRIDE-1)*4 + vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16 + vld vr12, a2, (1-REST_UNIT_STRIDE)*4 + vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16 + vld vr14, a2, (1+REST_UNIT_STRIDE)*4 + vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16 + vadd.w vr0, vr0, vr2 // 0 1 2 3 + vadd.w vr1, vr1, vr3 // 4 5 6 7 + vmul.w vr20, vr0, vr10 + vmul.w vr21, vr1, vr10 + vadd.w vr4, vr4, vr8 // 0 1 2 3 + vadd.w vr5, vr5, vr9 // 4 5 6 7 + vadd.w vr12, vr12, vr14 + vadd.w vr13, vr13, vr15 + vadd.w vr12, vr12, vr4 + vadd.w vr13, vr13, vr5 + vmadd.w vr20, vr12, vr11 + vmadd.w vr21, vr13, vr11 + vld vr2, a1, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.wu.hu vr3, vr2, 0 + vexth.wu.hu vr4, vr2 + vmadd.w vr20, vr18, vr3 + vmadd.w vr21, vr19, vr4 + vssrlrni.h.w vr21, vr20, 9 + vst vr21, a0, 0 + addi.d a3, a3, 16 + addi.d a2, a2, 32 + addi.d a1, a1, 8 + addi.d a0, a0, 16 + addi.w a4, a4, -8 + blt zero, a4, .LBS5SGF_V_W1 +.LBS5SGF_END: +endfunc + +/* +void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride, + const int16_t *dst0, const int16_t *dst1, + const int w0, const int w1, + const int w, const int h); +*/ +function sgr_mix_finish_8bpc_lsx + vreplgr2vr.w vr3, a4 // w0 + vreplgr2vr.w vr13, a5 // w1 + andi t4, a6, 0x7 + sub.w t5, a6, t4 + + beq zero, t5, .LSGRMIX_REM + +.LSGRMIX_H: + addi.d t0, a0, 0 + addi.d t1, a2, 0 // dst0 + addi.d t3, a3, 0 // dst1 + addi.w t2, t5, 0 + andi t4, a6, 0x7 +.LSGRMIX_W: + vld vr0, t0, 0 + vld vr1, t1, 0 + vld vr10, t3, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3 + vexth.wu.hu vr5, vr2 // u 4 5 6 7 + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst0 + vexth.w.h vr9, vr1 // dst0 + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + + vsllwil.w.h vr11, vr10, 0 // dst1 + vexth.w.h vr12, vr10 // dst1 + vsub.w vr11, vr11, vr4 + vsub.w vr12, vr12, vr5 + vmadd.w vr6, vr11, vr13 + vmadd.w vr7, vr12, vr13 + + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + vstelm.d vr7, t0, 0, 0 + addi.d t0, t0, 8 + addi.d t1, t1, 16 + addi.d t3, t3, 16 + addi.d t2, t2, -8 + bne zero, t2, .LSGRMIX_W + + beq t4, zero, .LSGRMIX_W8 + + vld vr0, t0, 0 + vld vr1, t1, 0 + vld vr10, t3, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + + vsllwil.w.h vr11, vr10, 0 // 
dst1 + vexth.w.h vr12, vr10 // dst1 + vsub.w vr11, vr11, vr4 + vsub.w vr12, vr12, vr5 + vmadd.w vr6, vr11, vr13 + vmadd.w vr7, vr12, vr13 + + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + +.LSGRMIX_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGRMIX_ST + +.LSGRMIX_W8: + addi.w a7, a7, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + addi.d a3, a3, (FILTER_OUT_STRIDE<<1) + bnez a7, .LSGRMIX_H + b .LSGR_MIX_END + +.LSGRMIX_REM: + andi t4, a6, 0x7 + vld vr0, a0, 0 + vld vr1, a2, 0 + vld vr10, a3, 0 + vsllwil.hu.bu vr2, vr0, 4 // u 8 h + vsllwil.wu.hu vr4, vr2, 0 // p + vexth.wu.hu vr5, vr2 // p + vslli.w vr6, vr4, 7 + vslli.w vr7, vr5, 7 + vsllwil.w.h vr8, vr1, 0 // dst + vexth.w.h vr9, vr1 // dst + vsub.w vr8, vr8, vr4 + vsub.w vr9, vr9, vr5 + vmadd.w vr6, vr8, vr3 // v 0 - 3 + vmadd.w vr7, vr9, vr3 // v 4 - 7 + + vsllwil.w.h vr11, vr10, 0 // dst1 + vexth.w.h vr12, vr10 // dst1 + vsub.w vr11, vr11, vr4 + vsub.w vr12, vr12, vr5 + vmadd.w vr6, vr11, vr13 + vmadd.w vr7, vr12, vr13 + + vssrarni.hu.w vr7, vr6, 11 + vssrlni.bu.h vr7, vr7, 0 + addi.d t0, a0, 0 +.LSGRMIX_REM_ST: + vstelm.b vr7, t0, 0, 0 + addi.d t0, t0, 1 + vbsrl.v vr7, vr7, 1 + addi.w t4, t4, -1 + bnez t4, .LSGRMIX_REM_ST + + addi.w a7, a7, -1 + add.d a0, a0, a1 + addi.d a2, a2, (FILTER_OUT_STRIDE<<1) + addi.d a3, a3, (FILTER_OUT_STRIDE<<1) + bnez a7, .LSGRMIX_REM + +.LSGR_MIX_END: +endfunc diff --git a/third_party/dav1d/src/loongarch/looprestoration.h b/third_party/dav1d/src/loongarch/looprestoration.h new file mode 100644 index 0000000000..ac0cb065c8 --- /dev/null +++ b/third_party/dav1d/src/loongarch/looprestoration.h @@ -0,0 +1,78 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H +#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H + +#include "common/intops.h" +#include "src/cpu.h" +#include "src/looprestoration.h" + +void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc) +{ + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + +#if BITDEPTH == 8 + c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx; + + c->sgr[0] = dav1d_sgr_filter_5x5_lsx; + c->sgr[1] = dav1d_sgr_filter_3x3_lsx; + c->sgr[2] = dav1d_sgr_filter_mix_lsx; +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */ diff --git a/third_party/dav1d/src/loongarch/looprestoration_tmpl.c b/third_party/dav1d/src/loongarch/looprestoration_tmpl.c new file mode 100644 index 0000000000..66d0d638f6 --- /dev/null +++ b/third_party/dav1d/src/loongarch/looprestoration_tmpl.c @@ -0,0 +1,274 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/looprestoration.h" + +#if BITDEPTH == 8 + +#define REST_UNIT_STRIDE (400) + +void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr, + uint8_t *tmp_ptr, + const int16_t filterh[8], + const int w, const int h); + +void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p, + const ptrdiff_t p_stride, + const int32_t *hor, + const int16_t filterv[8], + const int w, const int h); + +// This function refers to the function in the ppc/looprestoration_init_tmpl.c. +static inline void padding(uint8_t *dst, const uint8_t *p, + const ptrdiff_t stride, const uint8_t (*left)[4], + const uint8_t *lpf, int unit_w, const int stripe_h, + const enum LrEdgeFlags edges) +{ + const int have_left = !!(edges & LR_HAVE_LEFT); + const int have_right = !!(edges & LR_HAVE_RIGHT); + + // Copy more pixels if we don't have to pad them + unit_w += 3 * have_left + 3 * have_right; + uint8_t *dst_l = dst + 3 * !have_left; + p -= 3 * have_left; + lpf -= 3 * have_left; + + if (edges & LR_HAVE_TOP) { + // Copy previous loop filtered rows + const uint8_t *const above_1 = lpf; + const uint8_t *const above_2 = above_1 + PXSTRIDE(stride); + pixel_copy(dst_l, above_1, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w); + } else { + // Pad with first row + pixel_copy(dst_l, p, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w); + if (have_left) { + pixel_copy(dst_l, &left[0][1], 3); + pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3); + } + } + + uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; + if (edges & LR_HAVE_BOTTOM) { + // Copy next loop filtered rows + const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride); + const uint8_t *const below_2 = below_1 + PXSTRIDE(stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w); + } else { + // Pad with last row + const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w); + if (have_left) { + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + } + } + + // Inner UNIT_WxSTRIPE_H + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left); + dst_tl += REST_UNIT_STRIDE; + p += PXSTRIDE(stride); + } + + if (!have_right) { + uint8_t *pad = dst_l + unit_w; + uint8_t *row_last = &dst_l[unit_w - 1]; + // Pad 3x(STRIPE_H+6) with last column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(pad, *row_last, 3); + pad += REST_UNIT_STRIDE; + row_last += REST_UNIT_STRIDE; + } + } + + if (!have_left) { + // Pad 3x(STRIPE_H+6) with first column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(dst, *dst_l, 3); + dst += REST_UNIT_STRIDE; + dst_l += REST_UNIT_STRIDE; + } + } else { + dst += 3 * REST_UNIT_STRIDE; + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst, &left[j][1], 3); + dst += REST_UNIT_STRIDE; + } + } +} + +// This function refers 
to the function in the ppc/looprestoration_init_tmpl.c. + +// FIXME Could split into luma and chroma specific functions, +// (since first and last tops are always 0 for chroma) +// FIXME Could implement a version that requires less temporary memory +// (should be possible to implement with only 6 rows of temp storage) +void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + const int16_t (*const filter)[8] = params->filter; + + // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels + // of padding above and below + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,); + + BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6); + BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h); +} + +void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src, + const int w, const int h); +void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h); + +void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h, const int w1); +void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp, + int32_t *sumsq, int16_t *sum, + const int w, const int h); +void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride, + int16_t *dst, int w1, + const int w, const int h); + + +static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src, + const int w, const int h) +{ + BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6); + BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6); +} + +void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + coef dst[64 * 384]; + + ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, ); + ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, ); + + boxsum3_lsx(sumsq, sum, tmp, w, h); + BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1); + BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h); + BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h); +} + +void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum, + const uint8_t *const src, + const int w, const int h); + +void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h); + +void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum, + const int w, const int h, + const unsigned s); + +void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src, + int32_t *sumsq, int16_t *sum, + const int w, const int h); + +void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride, + const int16_t *dst0, const int16_t *dst1, + const int w0, const int w1, + const int w, const int h); + +static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src, + const int w, const int h) +{ + BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6); + BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6); +} + +void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const 
left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + coef dst[64 * 384]; + + ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, ); + ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, ); + + boxsum5_lsx(sumsq, sum, tmp, w, h); + BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0); + BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h); + BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h); +} + +void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], + const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, p_stride, left, lpf, w, h, edges); + coef dst0[64 * 384]; + coef dst1[64 * 384]; + + ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, ); + ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, ); + + boxsum5_lsx(sumsq0, sum0, tmp, w, h); + BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0); + BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h); + + boxsum3_lsx(sumsq0, sum0, tmp, w, h); + BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1); + BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h); + + BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0, + params->sgr.w1, w, h); +} +#endif diff --git a/third_party/dav1d/src/loongarch/mc.S b/third_party/dav1d/src/loongarch/mc.S new file mode 100644 index 0000000000..97887de4a7 --- /dev/null +++ b/third_party/dav1d/src/loongarch/mc.S @@ -0,0 +1,4758 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/loongarch/loongson_asm.S" + +/* +static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *const abcd, int mx, int my + HIGHBD_DECL_SUFFIX) +*/ +.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3 + vbsrl.v vr2, \in0, \in1 + vbsrl.v vr20, \in0, \in2 + addi.w t4, \in3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr1, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr29, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + vilvl.d vr2, vr20, vr2 + vilvl.d vr1, vr29, vr1 + vmulwev.h.bu.b vr3, vr2, vr1 + vmulwod.h.bu.b vr20, vr2, vr1 + vilvl.d vr2, vr20, vr3 + vhaddw.w.h vr2, vr2, vr2 + vhaddw.d.w vr2, vr2, vr2 + vhaddw.q.d vr2, vr2, vr2 + vilvh.d vr3, vr20, vr3 + vhaddw.w.h vr3, vr3, vr3 + vhaddw.d.w vr3, vr3, vr3 + vhaddw.q.d vr3, vr3, vr3 + vextrins.w \out0, vr2, \out1 + vextrins.w \out2, vr3, \out3 +.endm + +.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1 + add.w \in0, \in0, \in1 + addi.w t6, \in0, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f1, t5, t6 + vsllwil.h.b vr1, vr1, 0 + vmulwev.w.h vr3, \in2, vr1 + vmaddwod.w.h vr3, \in2, vr1 + vhaddw.d.w vr3, vr3, vr3 + vhaddw.q.d vr3, vr3, vr3 + vextrins.w \out0, vr3, \out1 +.endm + +const warp_sh +.rept 2 +.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +.endr +.rept 2 +.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +.endr +endconst + +.macro warp_lsx t, shift +function warp_affine_8x8\t\()_8bpc_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + la.local t4, warp_sh + ld.h t0, a4, 0 // abcd[0] + ld.h t1, a4, 2 // abcd[1] + + alsl.w t2, a3, a3, 1 + addi.w t3, a5, 0 + la.local t5, dav1d_mc_warp_filter + sub.d a2, a2, t2 + addi.d a2, a2, -3 + vld vr0, a2, 0 + vld vr30, t4, 0 + vld vr31, t4, 32 + + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 + + add.w a5, t1, a5 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00 + FILTER_WARP_RND_P_LSX 
vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30 + + vsrarni.h.w vr12, vr4, 3 + vsrarni.h.w vr13, vr5, 3 + vsrarni.h.w vr14, vr6, 3 + vsrarni.h.w vr15, vr7, 3 + vsrarni.h.w vr16, vr8, 3 + vsrarni.h.w vr17, vr9, 3 + vsrarni.h.w vr18, vr10, 3 + vsrarni.h.w vr19, vr11, 3 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + FILTER_WARP_RND_P_LSX vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20 + FILTER_WARP_RND_P_LSX vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20 + FILTER_WARP_RND_P_LSX vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20 + FILTER_WARP_RND_P_LSX vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20 + + vsrarni.h.w vr21, vr4, 3 + vsrarni.h.w vr22, vr5, 3 + vsrarni.h.w vr23, vr6, 3 + vsrarni.h.w vr24, vr7, 3 
+ vsrarni.h.w vr25, vr8, 3 + vsrarni.h.w vr26, vr9, 3 + vsrarni.h.w vr27, vr10, 3 + vsrarni.h.w vr28, vr11, 3 + + addi.w t2, a6, 0 // my + ld.h t7, a4, 4 // abcd[2] + ld.h t8, a4, 6 // abcd[3] + +.ifnb \t + slli.d a1, a1, 1 +.endif + + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, 
vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + vaddi.bu vr31, vr31, 2 + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + vextrins.h vr30, vr31, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 + alsl.d a0, a1, a0, 1 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vst vr5, a0, 0 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fst.d f5, a0, 0 +.endif + + vshuf.b vr12, vr21, vr12, vr30 + vshuf.b vr13, vr22, vr13, vr30 + vshuf.b vr14, vr23, vr14, vr30 + vshuf.b vr15, vr24, vr15, vr30 + vshuf.b vr16, vr25, vr16, vr30 + vshuf.b vr17, vr26, vr17, vr30 + vshuf.b vr18, vr27, vr18, vr30 + vshuf.b vr19, vr28, vr19, vr30 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LSX t2, zero, vr12, vr4, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr13, vr4, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr14, vr4, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr15, vr4, 0x30 + FILTER_WARP_CLIP_LSX t2, t7, vr16, vr5, 0x00 + FILTER_WARP_CLIP_LSX t2, t7, vr17, vr5, 0x10 + FILTER_WARP_CLIP_LSX t2, t7, vr18, vr5, 0x20 + FILTER_WARP_CLIP_LSX t2, t7, vr19, vr5, 0x30 +.ifnb \t + vssrarni.h.w vr5, vr4, \shift + vstx vr5, a0, a1 +.else + vssrarni.hu.w vr5, vr4, \shift + vssrlni.bu.h vr5, vr5, 0 + fstx.d f5, a0, a1 +.endif + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 
+ fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc +.endm + +warp_lsx , 11 +warp_lsx t, 7 + +.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3 + xvshuf.b xr2, \in0, \in0, \in2 + + addi.w t4, \in1, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr3, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr4, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr5, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + addi.w t4, t3, 512 + srai.w t4, t4, 10 + addi.w t4, t4, 64 + slli.w t4, t4, 3 + vldx vr6, t5, t4 + add.w t3, t3, t0 // tmx += abcd[0] + + xvinsve0.d xr3, xr5, 1 + xvinsve0.d xr3, xr4, 2 + xvinsve0.d xr3, xr6, 3 + + xvmulwev.h.bu.b xr4, xr2, xr3 + xvmulwod.h.bu.b xr5, xr2, xr3 + xvilvl.d xr2, xr5, xr4 + xvilvh.d xr3, xr5, xr4 + xvhaddw.w.h xr2, xr2, xr2 + xvhaddw.w.h xr3, xr3, xr3 + xvhaddw.d.w xr2, xr2, xr2 + xvhaddw.d.w xr3, xr3, xr3 + xvhaddw.q.d xr2, xr2, xr2 + xvhaddw.q.d xr3, xr3, xr3 + + xvextrins.w \out0, xr2, \out1 + xvextrins.w \out2, xr3, \out3 +.endm + +.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1 + add.w \in0, \in0, \in1 + addi.w t6, \in0, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f1, t5, t6 + + add.w t2, t2, t7 + addi.w t6, t2, 512 + srai.w t6, t6, 10 + addi.w t6, t6, 64 + slli.w t6, t6, 3 + fldx.d f2, t5, t6 + + vilvl.d vr0, vr2, vr1 + vext2xv.h.b xr0, xr0 + xvmulwev.w.h xr3, \in2, xr0 + xvmaddwod.w.h xr3, \in2, xr0 + xvhaddw.d.w xr3, xr3, xr3 + xvhaddw.q.d xr3, xr3, xr3 + xvextrins.w \out0, xr3, \out1 +.endm + +const shuf0 +.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 +endconst + +.macro warp_lasx t, shift +function warp_affine_8x8\t\()_8bpc_lasx + addi.d sp, sp, -16 + ld.h t0, a4, 0 // abcd[0] + ld.h t1, a4, 2 // abcd[1] + fst.d f24, sp, 0 + fst.d f25, sp, 8 + + alsl.w t2, a3, a3, 1 + addi.w t3, a5, 0 + la.local t4, warp_sh + la.local t5, dav1d_mc_warp_filter + sub.d a2, a2, t2 + addi.d a2, a2, -3 + vld vr0, a2, 0 + xvld xr24, t4, 0 + xvld xr25, t4, 32 + la.local t2, shuf0 + xvld xr1, t2, 0 + xvpermi.q xr0, xr0, 0x00 + xvaddi.bu xr9, xr1, 4 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, 
xr14, 0x10, xr15, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30 + + xvsrarni.h.w xr12, xr7, 3 + xvsrarni.h.w xr13, xr8, 3 + xvsrarni.h.w xr14, xr10, 3 + xvsrarni.h.w xr15, xr11, 3 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10 + + add.w a5, a5, t1 + or t3, a5, a5 + add.d a2, a2, a3 + vld vr0, a2, 0 + xvpermi.q xr0, xr0, 0x00 + FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20 + FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20 + + xvsrarni.h.w xr16, xr7, 3 + xvsrarni.h.w xr17, xr8, 3 + xvsrarni.h.w xr18, xr10, 3 + xvsrarni.h.w xr19, xr11, 3 + + addi.w t2, a6, 0 // my + ld.h t7, a4, 4 // abcd[2] + ld.h t8, a4, 6 // abcd[3] + +.ifnb \t + slli.d a1, a1, 1 +.endif + + // y = 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, \shift + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h 
xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + vstelm.d vr21, a0, 0, 1 +.endif + + xvaddi.bu xr25, xr25, 2 + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + xvextrins.h xr24, xr25, 0x70 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 + + xvshuf.b xr12, xr16, xr12, xr24 + xvshuf.b xr13, xr17, xr13, xr24 + xvshuf.b xr14, xr18, xr14, xr24 + xvshuf.b xr15, xr19, xr15, xr24 + + add.w a6, a6, t8 + addi.w t2, a6, 0 + FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 + FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 + FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 + FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 + +.ifnb \t + xvssrarni.h.w xr21, xr20, \shift + alsl.d a0, a1, a0, 1 + xvpermi.q xr22, xr21, 0x01 + vilvl.h vr23, vr22, vr21 + vilvh.h vr21, vr22, vr21 + vst vr23, a0, 0 + vstx vr21, a0, a1 +.else + xvssrarni.hu.w xr21, xr20, 11 + xvssrlni.bu.h xr22, xr21, 0 + xvpermi.q xr23, xr22, 0x01 + vilvl.b vr21, vr23, vr22 + add.d a0, a0, a1 + fst.d f21, a0, 0 + add.d a0, a0, a1 + 
vstelm.d vr21, a0, 0, 1 +.endif + fld.d f24, sp, 0 + fld.d f25, sp, 8 + addi.d sp, sp, 16 +endfunc +.endm + +warp_lasx , 11 +warp_lasx t, 7 + +/* +static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, + const int w, int h, + const int weight HIGHBD_DECL_SUFFIX) +*/ + +#define bpc8_sh 5 // sh = intermediate_bits + 1 +#define bpcw8_sh 8 // sh = intermediate_bits + 4 + +#define bpc_sh bpc8_sh +#define bpcw_sh bpcw8_sh + +function avg_8bpc_lsx + addi.d t8, a0, 0 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .AVG_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE + add.d t1, t1, t2 // Get absolute address + jirl $r0, t1, 0 + + .align 3 +.AVG_LSX_JRTABLE: + .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE + .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE + +.AVG_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vadd.h vr2, vr0, vr1 + vssrarni.bu.h vr3, vr2, bpc_sh + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .AVG_W4_LSX + b .AVG_END_LSX + +.AVG_W8_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vssrarni.bu.h vr5, vr4, bpc_sh + addi.w a5, a5, -2 + addi.d a2, a2, 32 + vstelm.d vr5, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr5, a0, 0, 1 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .AVG_W8_LSX + b .AVG_END_LSX + +.AVG_W16_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vssrarni.bu.h vr5, vr4, bpc_sh + addi.w a5, a5, -1 + addi.d a2, a2, 32 + vst vr5, a0, 0 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .AVG_W16_LSX + b .AVG_END_LSX + +.AVG_W32_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr4, a2, 32 + vld vr6, a2, 48 + vld vr1, a3, 0 + vld vr3, a3, 16 + vld vr5, a3, 32 + vld vr7, a3, 48 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vadd.h vr4, vr4, vr5 + vadd.h vr6, vr6, vr7 + vssrarni.bu.h vr2, vr0, bpc_sh + vssrarni.bu.h vr6, vr4, bpc_sh + addi.w a5, a5, -1 + addi.d a2, a2, 64 + vst vr2, a0, 0 + vst vr6, a0, 16 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .AVG_W32_LSX + b .AVG_END_LSX + +.AVG_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vssrarni.bu.h vr2, vr0, bpc_sh + addi.d a2, a2, 32 + addi.d a3, a3, 32 + vst vr2, a0, 0 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .AVG_W64_LSX + b .AVG_END_LSX + +.AVG_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr2, vr3 + vssrarni.bu.h vr2, vr0, bpc_sh + addi.d a2, a2, 32 + addi.d a3, a3, 32 + vst vr2, a0, 0 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .AVG_W128_LSX +.AVG_END_LSX: +endfunc + +function avg_8bpc_lasx + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .AVG_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.AVG_LASX_JRTABLE: + .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE 
+ .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE + .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE + +.AVG_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vadd.h vr0, vr0, vr1 + vssrarni.bu.h vr1, vr0, bpc_sh + vstelm.w vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .AVG_W4_LASX + b .AVG_END_LASX +.AVG_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvadd.h xr2, xr0, xr1 + xvssrarni.bu.h xr1, xr2, bpc_sh + xvstelm.d xr1, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr1, a0, 0, 2 + addi.w a5, a5, -2 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a1, a0 + blt zero, a5, .AVG_W8_LASX + b .AVG_END_LASX +.AVG_W16_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvadd.h xr4, xr0, xr1 + xvadd.h xr5, xr2, xr3 + xvssrarni.bu.h xr5, xr4, bpc_sh + xvpermi.d xr2, xr5, 0xd8 + xvpermi.d xr3, xr5, 0x8d + vst vr2, a0, 0 + vstx vr3, a0, a1 + addi.w a5, a5, -2 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + alsl.d a0, a1, a0, 1 + blt zero, a5, .AVG_W16_LASX + b .AVG_END_LASX +.AVG_W32_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvadd.h xr4, xr0, xr1 + xvadd.h xr5, xr2, xr3 + xvssrarni.bu.h xr5, xr4, bpc_sh + xvpermi.d xr6, xr5, 0xd8 + xvst xr6, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .AVG_W32_LASX + b .AVG_END_LASX +.AVG_W64_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr4, a2, 64 + xvld xr6, a2, 96 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvld xr5, a3, 64 + xvld xr7, a3, 96 + xvadd.h xr0, xr0, xr1 + xvadd.h xr2, xr2, xr3 + xvadd.h xr4, xr4, xr5 + xvadd.h xr6, xr6, xr7 + xvssrarni.bu.h xr2, xr0, bpc_sh + xvssrarni.bu.h xr6, xr4, bpc_sh + xvpermi.d xr1, xr2, 0xd8 + xvpermi.d xr3, xr6, 0xd8 + xvst xr1, a0, 0 + xvst xr3, a0, 32 + addi.w a5, a5, -1 + addi.d a2, a2, 128 + addi.d a3, a3, 128 + add.d a0, a0, a1 + blt zero, a5, .AVG_W64_LASX + b .AVG_END_LASX +.AVG_W128_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr4, a2, 64 + xvld xr6, a2, 96 + xvld xr8, a2, 128 + xvld xr10, a2, 160 + xvld xr12, a2, 192 + xvld xr14, a2, 224 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvld xr5, a3, 64 + xvld xr7, a3, 96 + xvld xr9, a3, 128 + xvld xr11, a3, 160 + xvld xr13, a3, 192 + xvld xr15, a3, 224 + xvadd.h xr0, xr0, xr1 + xvadd.h xr2, xr2, xr3 + xvadd.h xr4, xr4, xr5 + xvadd.h xr6, xr6, xr7 + xvadd.h xr8, xr8, xr9 + xvadd.h xr10, xr10, xr11 + xvadd.h xr12, xr12, xr13 + xvadd.h xr14, xr14, xr15 + xvssrarni.bu.h xr2, xr0, bpc_sh + xvssrarni.bu.h xr6, xr4, bpc_sh + xvssrarni.bu.h xr10, xr8, bpc_sh + xvssrarni.bu.h xr14, xr12, bpc_sh + xvpermi.d xr1, xr2, 0xd8 + xvpermi.d xr3, xr6, 0xd8 + xvpermi.d xr5, xr10, 0xd8 + xvpermi.d xr7, xr14, 0xd8 + xvst xr1, a0, 0 + xvst xr3, a0, 32 + xvst xr5, a0, 64 + xvst xr7, a0, 96 + addi.w a5, a5, -1 + addi.d a2, a2, 256 + addi.d a3, a3, 256 + add.d a0, a0, a1 + blt zero, a5, .AVG_W128_LASX +.AVG_END_LASX: +endfunc + +function w_avg_8bpc_lsx + addi.d t8, a0, 0 + li.w t2, 16 + sub.w t2, t2, a6 // 16 - weight + vreplgr2vr.h vr21, a6 + vreplgr2vr.h vr22, t2 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .W_AVG_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.W_AVG_LSX_JRTABLE: + .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE + .hword 
.W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE + .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE + +.W_AVG_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vmulwev.w.h vr2, vr0, vr21 + vmulwod.w.h vr3, vr0, vr21 + vmaddwev.w.h vr2, vr1, vr22 + vmaddwod.w.h vr3, vr1, vr22 + vssrarni.hu.w vr3, vr2, bpcw_sh + vssrlni.bu.h vr1, vr3, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a1, a0 + blt zero, a5, .W_AVG_W4_LSX + b .W_AVG_END_LSX +.W_AVG_W8_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + vmulwev.w.h vr2, vr0, vr21 + vmulwod.w.h vr3, vr0, vr21 + vmaddwev.w.h vr2, vr1, vr22 + vmaddwod.w.h vr3, vr1, vr22 + vssrarni.hu.w vr3, vr2, bpcw_sh + vssrlni.bu.h vr1, vr3, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.d f0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W8_LSX + b .W_AVG_END_LSX +.W_AVG_W16_LSX: + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W16_LSX + b .W_AVG_END_LSX +.W_AVG_W32_LSX: +.rept 2 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W32_LSX + b .W_AVG_END_LSX + +.W_AVG_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W64_LSX + b .W_AVG_END_LSX + +.W_AVG_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr2, a2, 16 + vld vr1, a3, 0 + vld vr3, a3, 16 + vmulwev.w.h vr4, vr0, vr21 + vmulwod.w.h vr5, vr0, vr21 + vmulwev.w.h vr6, vr2, vr21 + vmulwod.w.h vr7, vr2, vr21 + vmaddwev.w.h vr4, vr1, vr22 + vmaddwod.w.h vr5, vr1, vr22 + vmaddwev.w.h vr6, vr3, vr22 + vmaddwod.w.h vr7, vr3, vr22 + vssrarni.hu.w vr6, vr4, bpcw_sh + vssrarni.hu.w vr7, vr5, bpcw_sh + vssrlrni.bu.h vr7, vr6, 0 + vshuf4i.w vr8, vr7, 0x4E + vilvl.b vr0, vr8, vr7 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a0, a0, 16 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d 
a0, t8, zero + blt zero, a5, .W_AVG_W128_LSX +.W_AVG_END_LSX: +endfunc + +function w_avg_8bpc_lasx + addi.d t8, a0, 0 + li.w t2, 16 + sub.w t2, t2, a6 // 16 - weight + xvreplgr2vr.h xr21, a6 + xvreplgr2vr.h xr22, t2 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .W_AVG_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.W_AVG_LASX_JRTABLE: + .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE + .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE + +.W_AVG_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + xvpermi.d xr2, xr0, 0xD8 + xvpermi.d xr3, xr1, 0xD8 + xvilvl.h xr4, xr3, xr2 + xvmulwev.w.h xr0, xr4, xr21 + xvmaddwod.w.h xr0, xr4, xr22 + xvssrarni.hu.w xr1, xr0, bpcw_sh + xvssrlni.bu.h xr0, xr1, 0 + fst.s f0, a0, 0 + add.d a0, a0, a1 + xvstelm.w xr0, a0, 0, 4 + addi.w a5, a5, -2 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + add.d a0, a1, a0 + blt zero, a5, .W_AVG_W4_LASX + b .W_AVG_END_LASX + +.W_AVG_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvmulwev.w.h xr2, xr0, xr21 + xvmulwod.w.h xr3, xr0, xr21 + xvmaddwev.w.h xr2, xr1, xr22 + xvmaddwod.w.h xr3, xr1, xr22 + xvssrarni.hu.w xr3, xr2, bpcw_sh + xvssrlni.bu.h xr1, xr3, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvstelm.d xr0, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr0, a0, 0, 2 + addi.w a5, a5, -2 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W8_LASX + b .W_AVG_END_LASX + +.W_AVG_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + xvmulwev.w.h xr2, xr0, xr21 + xvmulwod.w.h xr3, xr0, xr21 + xvmaddwev.w.h xr2, xr1, xr22 + xvmaddwod.w.h xr3, xr1, xr22 + xvssrarni.hu.w xr3, xr2, bpcw_sh + xvssrlni.bu.h xr1, xr3, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvpermi.d xr1, xr0, 0xD8 + vst vr1, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W16_LASX + b .W_AVG_END_LSX + +.W_AVG_W32_LASX: + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.w a5, a5, -1 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + add.d a0, a0, a1 + blt zero, a5, .W_AVG_W32_LASX + b .W_AVG_END_LASX + +.W_AVG_W64_LASX: +.rept 2 + xvld xr0, a2, 0 + xvld xr2, a2, 32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a0, a0, 32 +.endr + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W64_LASX + b .W_AVG_END_LASX + +.W_AVG_W128_LASX: +.rept 4 + xvld xr0, a2, 0 + xvld xr2, a2, 
32 + xvld xr1, a3, 0 + xvld xr3, a3, 32 + xvmulwev.w.h xr4, xr0, xr21 + xvmulwod.w.h xr5, xr0, xr21 + xvmulwev.w.h xr6, xr2, xr21 + xvmulwod.w.h xr7, xr2, xr21 + xvmaddwev.w.h xr4, xr1, xr22 + xvmaddwod.w.h xr5, xr1, xr22 + xvmaddwev.w.h xr6, xr3, xr22 + xvmaddwod.w.h xr7, xr3, xr22 + xvssrarni.hu.w xr6, xr4, bpcw_sh + xvssrarni.hu.w xr7, xr5, bpcw_sh + xvssrlni.bu.h xr7, xr6, 0 + xvshuf4i.w xr8, xr7, 0x4E + xvilvl.b xr9, xr8, xr7 + xvpermi.d xr0, xr9, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a0, a0, 32 +.endr + + addi.w a5, a5, -1 + add.d t8, t8, a1 + add.d a0, t8, zero + blt zero, a5, .W_AVG_W128_LASX +.W_AVG_END_LASX: +endfunc + +#undef bpc_sh +#undef bpcw_sh + +#define mask_sh 10 +/* +static void mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + const uint8_t *mask HIGHBD_DECL_SUFFIX) +*/ +function mask_8bpc_lsx + vldi vr21, 0x440 // 64 + vxor.v vr19, vr19, vr19 + addi.d t8, a0, 0 + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .MASK_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.MASK_LSX_JRTABLE: + .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE + .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE + +.MASK_W4_LSX: + vld vr0, a2, 0 + vld vr1, a3, 0 + fld.d f22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vsub.h vr3, vr21, vr2 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vssrarni.hu.w vr5, vr4, mask_sh + vssrlrni.bu.h vr1, vr5, 0 + vpickod.w vr4, vr2, vr1 + vilvl.b vr0, vr4, vr1 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + addi.d a6, a6, 8 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W4_LSX + b .MASK_END_LSX +.MASK_W8_LSX: + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + fst.d f0, a0, 0 + add.d a0, a0, a1 + vstelm.d vr0, a0, 0, 1 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W8_LSX + b .MASK_END_LSX + +.MASK_W16_LSX: + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, 
.MASK_W16_LSX + b .MASK_END_LSX +.MASK_W32_LSX: +.rept 2 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W32_LSX + b .MASK_END_LSX +.MASK_W64_LSX: +.rept 4 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W64_LSX + b .MASK_END_LSX +.MASK_W128_LSX: +.rept 8 + vld vr0, a2, 0 + vld vr10, a2, 16 + vld vr1, a3, 0 + vld vr11, a3, 16 + vld vr22, a6, 0 + vilvl.b vr2, vr19, vr22 + vilvh.b vr12, vr19, vr22 + vsub.h vr3, vr21, vr2 + vsub.h vr13, vr21, vr12 + vmulwev.w.h vr4, vr0, vr2 + vmulwod.w.h vr5, vr0, vr2 + vmulwev.w.h vr14, vr10, vr12 + vmulwod.w.h vr15, vr10, vr12 + vmaddwev.w.h vr4, vr1, vr3 + vmaddwod.w.h vr5, vr1, vr3 + vmaddwev.w.h vr14, vr11, vr13 + vmaddwod.w.h vr15, vr11, vr13 + vssrarni.hu.w vr14, vr4, mask_sh + vssrarni.hu.w vr15, vr5, mask_sh + vssrlrni.bu.h vr15, vr14, 0 + vshuf4i.w vr6, vr15, 0x4E + vilvl.b vr0, vr6, vr15 + vst vr0, a0, 0 + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + addi.d a0, a0, 16 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W128_LSX +.MASK_END_LSX: +endfunc + +function mask_8bpc_lasx + xvldi xr21, 0x440 // 64 + xvxor.v xr19, xr19, xr19 + addi.d t8, a0, 0 + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .MASK_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t2, t0, 0 + add.d t1, t1, t2 + jirl $r0, t1, 0 + + .align 3 +.MASK_LASX_JRTABLE: + .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE + .hword .MASK_W4_LASX - .MASK_LASX_JRTABLE + +.MASK_W4_LASX: + vld vr0, a2, 0 + vld vr1, a3, 0 + fld.d f22, a6, 0 + + vilvl.h vr4, vr1, vr0 + vilvh.h vr14, vr1, vr0 + vilvl.b vr2, vr19, vr22 + vsub.h vr3, vr21, vr2 + xvpermi.q xr14, xr4, 0x20 + vilvl.h vr5, vr3, vr2 + vilvh.h vr15, vr3, vr2 + xvpermi.q xr15, xr5, 0x20 + xvmulwev.w.h xr0, xr14, xr15 + xvmaddwod.w.h xr0, xr14, xr15 + xvssrarni.hu.w xr1, xr0, mask_sh + xvssrlni.bu.h xr2, xr1, 0 + fst.s f2, a0, 0 + add.d a0, a0, a1 + xvstelm.w xr2, a0, 0, 4 + + addi.d a2, 
a2, 16 + addi.d a3, a3, 16 + addi.d a6, a6, 8 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W4_LASX + b .MASK_END_LASX + +.MASK_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + vld vr22, a6, 0 + + vext2xv.hu.bu xr2, xr22 + xvsub.h xr3, xr21, xr2 + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvssrarni.hu.w xr5, xr4, mask_sh + xvssrlni.bu.h xr1, xr5, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + fst.d f0, a0, 0 + add.d a0, a0, a1 + xvstelm.d xr0, a0, 0, 2 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .MASK_W8_LASX + b .MASK_END_LASX + +.MASK_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + vld vr22, a6, 0 + + vext2xv.hu.bu xr2, xr22 + xvsub.h xr3, xr21, xr2 + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvssrarni.hu.w xr5, xr4, mask_sh + xvssrlni.bu.h xr1, xr5, 0 + xvpickod.w xr4, xr2, xr1 + xvilvl.b xr0, xr4, xr1 + xvpermi.d xr1, xr0, 0xD8 + vst vr1, a0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 16 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W16_LASX + b .MASK_END_LASX +.MASK_W32_LASX: + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + add.d a0, a0, a1 + addi.w a5, a5, -1 + blt zero, a5, .MASK_W32_LASX + b .MASK_END_LASX + +.MASK_W64_LASX: +.rept 2 + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + addi.d a0, a0, 32 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W64_LASX + b .MASK_END_LASX + +.MASK_W128_LASX: +.rept 4 + xvld xr0, a2, 0 + xvld xr10, a2, 32 + xvld xr1, a3, 0 + xvld xr11, a3, 32 + xvld xr22, a6, 0 + vext2xv.hu.bu xr2, xr22 + xvpermi.q xr4, xr22, 0x01 + vext2xv.hu.bu xr12, xr4 + xvsub.h xr3, xr21, xr2 + xvsub.h xr13, xr21, xr12 + + xvmulwev.w.h xr4, xr0, xr2 + xvmulwod.w.h xr5, xr0, xr2 + xvmulwev.w.h xr14, xr10, xr12 + xvmulwod.w.h xr15, xr10, xr12 + xvmaddwev.w.h xr4, xr1, xr3 + xvmaddwod.w.h xr5, xr1, xr3 + xvmaddwev.w.h xr14, xr11, xr13 + xvmaddwod.w.h xr15, xr11, xr13 + xvssrarni.hu.w xr14, xr4, mask_sh + 
xvssrarni.hu.w xr15, xr5, mask_sh + xvssrlni.bu.h xr15, xr14, 0 + xvshuf4i.w xr6, xr15, 0x4E + xvilvl.b xr1, xr6, xr15 + xvpermi.d xr0, xr1, 0xD8 + xvst xr0, a0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 32 + addi.d a0, a0, 32 +.endr + add.d t8, t8, a1 + add.d a0, t8, zero + addi.w a5, a5, -1 + blt zero, a5, .MASK_W128_LASX +.MASK_END_LASX: +endfunc + +/* +static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, + const int16_t *tmp1, const int16_t *tmp2, const int w, int h, + uint8_t *mask, const int sign, + const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) +*/ +function w_mask_420_8bpc_lsx + addi.d sp, sp, -24 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + vldi vr20, 0x440 + vreplgr2vr.h vr21, a7 + vldi vr22, 0x426 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .WMASK420_LSX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t8, t0, 0 + add.d t1, t1, t8 + jirl $r0, t1, 0 + + .align 3 +.WMASK420_LSX_JRTABLE: + .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE + .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE + +.WMASK420_W4_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a3, 0 + vld vr3, a3, 16 + addi.w a5, a5, -4 + + vabsd.h vr4, vr0, vr2 + vabsd.h vr5, vr1, vr3 + vaddi.hu vr4, vr4, 8 + vaddi.hu vr5, vr5, 8 + vsrli.h vr4, vr4, 8 + vsrli.h vr5, vr5, 8 + vadd.h vr4, vr4, vr22 + vadd.h vr5, vr5, vr22 + vmin.hu vr6, vr4, vr20 + vmin.hu vr7, vr5, vr20 + vsub.h vr8, vr20, vr6 + vsub.h vr9, vr20, vr7 + vmulwev.w.h vr4, vr6, vr0 + vmulwod.w.h vr5, vr6, vr0 + vmulwev.w.h vr10, vr7, vr1 + vmulwod.w.h vr11, vr7, vr1 + vmaddwev.w.h vr4, vr8, vr2 + vmaddwod.w.h vr5, vr8, vr2 + vmaddwev.w.h vr10, vr9, vr3 + vmaddwod.w.h vr11, vr9, vr3 + vilvl.w vr0, vr5, vr4 + vilvh.w vr1, vr5, vr4 + vilvl.w vr2, vr11, vr10 + vilvh.w vr3, vr11, vr10 + vssrarni.hu.w vr1, vr0, 10 + vssrarni.hu.w vr3, vr2, 10 + vssrlni.bu.h vr3, vr1, 0 + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 2 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 3 + add.d a0, a0, a1 + vpickev.h vr0, vr7, vr6 + vpickod.h vr1, vr7, vr6 + vadd.h vr0, vr0, vr1 + vshuf4i.h vr0, vr0, 0xd8 + vhaddw.w.h vr2, vr0, vr0 + vpickev.h vr2, vr2, vr2 + vsub.h vr2, vr2, vr21 + vaddi.hu vr2, vr2, 2 + vssrani.bu.h vr2, vr2, 2 + vstelm.w vr2, a6, 0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W4_LSX + b .END_W420 + +.WMASK420_W8_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a3, 0 + vld vr3, a3, 16 + addi.w a5, a5, -2 + + vabsd.h vr4, vr0, vr2 + vabsd.h vr5, vr1, vr3 + vaddi.hu vr4, vr4, 8 + vaddi.hu vr5, vr5, 8 + vsrli.h vr4, vr4, 8 + vsrli.h vr5, vr5, 8 + vadd.h vr4, vr4, vr22 + vadd.h vr5, vr5, vr22 + vmin.hu vr6, vr4, vr20 + vmin.hu vr7, vr5, vr20 + vsub.h vr8, vr20, vr6 + vsub.h vr9, vr20, vr7 + vmulwev.w.h vr4, vr6, vr0 + vmulwod.w.h vr5, vr6, vr0 + vmulwev.w.h vr10, vr7, vr1 + vmulwod.w.h vr11, vr7, vr1 + vmaddwev.w.h vr4, vr8, vr2 + vmaddwod.w.h vr5, vr8, vr2 + vmaddwev.w.h vr10, vr9, vr3 + vmaddwod.w.h vr11, vr9, vr3 + vssrarni.hu.w vr10, vr4, 10 + vssrarni.hu.w vr11, vr5, 10 + vssrlni.bu.h vr11, vr10, 0 + vshuf4i.w vr0, vr11, 0x4E + vilvl.b vr3, vr0, vr11 + vstelm.d vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr3, a0, 0, 1 + add.d a0, a0, a1 + vpickev.h vr0, vr7, vr6 + vpickod.h vr1, vr7, vr6 + vadd.h vr0, 
vr0, vr1 + vilvh.d vr2, vr0, vr0 + vadd.h vr2, vr2, vr0 + vsub.h vr2, vr2, vr21 + vaddi.hu vr2, vr2, 2 + vssrani.bu.h vr2, vr2, 2 + vstelm.w vr2, a6, 0, 0 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W8_LSX + b .END_W420 + +.WMASK420_W16_LSX: + vld vr0, a2, 0 + vld vr1, a2, 16 + alsl.d a2, a4, a2, 1 + vld vr2, a2, 0 + vld vr3, a2, 16 + vld vr4, a3, 0 + vld vr5, a3, 16 + alsl.d a3, a4, a3, 1 + vld vr6, a3, 0 + vld vr7, a3, 16 + + vabsd.h vr8, vr0, vr4 + vabsd.h vr9, vr1, vr5 + vabsd.h vr10, vr2, vr6 + vabsd.h vr11, vr3, vr7 + vaddi.hu vr8, vr8, 8 + vaddi.hu vr9, vr9, 8 + vaddi.hu vr10, vr10, 8 + vaddi.hu vr11, vr11, 8 + vsrli.h vr8, vr8, 8 + vsrli.h vr9, vr9, 8 + vsrli.h vr10, vr10, 8 + vsrli.h vr11, vr11, 8 + vadd.h vr8, vr8, vr22 + vadd.h vr9, vr9, vr22 + vadd.h vr10, vr10, vr22 + vadd.h vr11, vr11, vr22 + vmin.hu vr12, vr8, vr20 + vmin.hu vr13, vr9, vr20 + vmin.hu vr14, vr10, vr20 + vmin.hu vr15, vr11, vr20 + vsub.h vr16, vr20, vr12 + vsub.h vr17, vr20, vr13 + vsub.h vr18, vr20, vr14 + vsub.h vr19, vr20, vr15 + vmulwev.w.h vr8, vr12, vr0 + vmulwod.w.h vr9, vr12, vr0 + vmulwev.w.h vr10, vr13, vr1 + vmulwod.w.h vr11, vr13, vr1 + vmulwev.w.h vr23, vr14, vr2 + vmulwod.w.h vr24, vr14, vr2 + vmulwev.w.h vr25, vr15, vr3 + vmulwod.w.h vr26, vr15, vr3 + vmaddwev.w.h vr8, vr16, vr4 + vmaddwod.w.h vr9, vr16, vr4 + vmaddwev.w.h vr10, vr17, vr5 + vmaddwod.w.h vr11, vr17, vr5 + vmaddwev.w.h vr23, vr18, vr6 + vmaddwod.w.h vr24, vr18, vr6 + vmaddwev.w.h vr25, vr19, vr7 + vmaddwod.w.h vr26, vr19, vr7 + vssrarni.hu.w vr10, vr8, 10 + vssrarni.hu.w vr11, vr9, 10 + vssrarni.hu.w vr25, vr23, 10 + vssrarni.hu.w vr26, vr24, 10 + vssrlni.bu.h vr11, vr10, 0 + vssrlni.bu.h vr26, vr25, 0 + vshuf4i.w vr0, vr11, 0x4E + vshuf4i.w vr1, vr26, 0x4E + vilvl.b vr3, vr0, vr11 + vilvl.b vr7, vr1, vr26 + vst vr3, a0, 0 + vstx vr7, a0, a1 + vpickev.h vr0, vr13, vr12 + vpickod.h vr1, vr13, vr12 + vpickev.h vr2, vr15, vr14 + vpickod.h vr3, vr15, vr14 + vadd.h vr4, vr0, vr1 + vadd.h vr5, vr2, vr3 + vadd.h vr4, vr4, vr5 + vsub.h vr4, vr4, vr21 + vssrarni.bu.h vr4, vr4, 2 + vstelm.d vr4, a6, 0, 0 + + alsl.d a2, a4, a2, 1 + alsl.d a3, a4, a3, 1 + alsl.d a0, a1, a0, 1 + addi.d a6, a6, 8 + addi.w a5, a5, -2 + blt zero, a5, .WMASK420_W16_LSX + b .END_W420 + +.WMASK420_W32_LSX: +.WMASK420_W64_LSX: +.WMASK420_W128_LSX: + +.LOOP_W32_420_LSX: + add.d t1, a2, zero + add.d t2, a3, zero + add.d t3, a0, zero + add.d t4, a6, zero + alsl.d t5, a4, t1, 1 + alsl.d t6, a4, t2, 1 + or t7, a4, a4 + +.W32_420_LSX: + vld vr0, t1, 0 + vld vr1, t1, 16 + vld vr2, t2, 0 + vld vr3, t2, 16 + vld vr4, t5, 0 + vld vr5, t5, 16 + vld vr6, t6, 0 + vld vr7, t6, 16 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 32 + addi.w t7, t7, -16 + vabsd.h vr8, vr0, vr2 + vabsd.h vr9, vr1, vr3 + vabsd.h vr10, vr4, vr6 + vabsd.h vr11, vr5, vr7 + vaddi.hu vr8, vr8, 8 + vaddi.hu vr9, vr9, 8 + vaddi.hu vr10, vr10, 8 + vaddi.hu vr11, vr11, 8 + vsrli.h vr8, vr8, 8 + vsrli.h vr9, vr9, 8 + vsrli.h vr10, vr10, 8 + vsrli.h vr11, vr11, 8 + vadd.h vr8, vr8, vr22 + vadd.h vr9, vr9, vr22 + vadd.h vr10, vr10, vr22 + vadd.h vr11, vr11, vr22 + vmin.hu vr12, vr8, vr20 + vmin.hu vr13, vr9, vr20 + vmin.hu vr14, vr10, vr20 + vmin.hu vr15, vr11, vr20 + vsub.h vr16, vr20, vr12 + vsub.h vr17, vr20, vr13 + vsub.h vr18, vr20, vr14 + vsub.h vr19, vr20, vr15 + vmulwev.w.h vr8, vr12, vr0 + vmulwod.w.h vr9, vr12, vr0 + vmulwev.w.h vr10, vr13, vr1 + vmulwod.w.h vr11, vr13, vr1 + vmulwev.w.h vr23, vr14, vr4 + vmulwod.w.h vr24, vr14, vr4 
+ vmulwev.w.h vr25, vr15, vr5 + vmulwod.w.h vr26, vr15, vr5 + vmaddwev.w.h vr8, vr16, vr2 + vmaddwod.w.h vr9, vr16, vr2 + vmaddwev.w.h vr10, vr17, vr3 + vmaddwod.w.h vr11, vr17, vr3 + vmaddwev.w.h vr23, vr18, vr6 + vmaddwod.w.h vr24, vr18, vr6 + vmaddwev.w.h vr25, vr19, vr7 + vmaddwod.w.h vr26, vr19, vr7 + vssrarni.hu.w vr10, vr8, 10 + vssrarni.hu.w vr11, vr9, 10 + vssrarni.hu.w vr25, vr23, 10 + vssrarni.hu.w vr26, vr24, 10 + vssrlni.bu.h vr11, vr10, 0 + vssrlni.bu.h vr26, vr25, 0 + vshuf4i.w vr8, vr11, 0x4E + vshuf4i.w vr9, vr26, 0x4E + vilvl.b vr3, vr8, vr11 + vilvl.b vr7, vr9, vr26 + vst vr3, t3, 0 + vstx vr7, a1, t3 + addi.d t3, t3, 16 + vpickev.h vr8, vr13, vr12 + vpickod.h vr9, vr13, vr12 + vpickev.h vr10, vr15, vr14 + vpickod.h vr11, vr15, vr14 + vadd.h vr8, vr8, vr9 + vadd.h vr10, vr10, vr11 + vadd.h vr12, vr8, vr10 + vsub.h vr12, vr12, vr21 + vssrarni.bu.h vr12, vr12, 2 + vstelm.d vr12, t4, 0, 0 + addi.d t4, t4, 8 + bne t7, zero, .W32_420_LSX + + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + alsl.d a0, a1, a0, 1 + srai.w t8, a4, 1 + add.d a6, a6, t8 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_W32_420_LSX + +.END_W420: + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + addi.d sp, sp, 24 +endfunc + +function w_mask_420_8bpc_lasx + xvldi xr20, 0x440 + xvreplgr2vr.h xr21, a7 + xvldi xr22, 0x426 + + clz.w t0, a4 + li.w t1, 24 + sub.w t0, t0, t1 + la.local t1, .WMASK420_LASX_JRTABLE + alsl.d t0, t0, t1, 1 + ld.h t8, t0, 0 + add.d t1, t1, t8 + jirl $r0, t1, 0 + + .align 3 +.WMASK420_LASX_JRTABLE: + .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE + .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE + +.WMASK420_W4_LASX: + xvld xr0, a2, 0 + xvld xr1, a3, 0 + addi.w a5, a5, -4 + + xvabsd.h xr2, xr0, xr1 + xvaddi.hu xr2, xr2, 8 + xvsrli.h xr2, xr2, 8 + xvadd.h xr2, xr2, xr22 + xvmin.hu xr3, xr2, xr20 + xvsub.h xr4, xr20, xr3 + xvmulwev.w.h xr5, xr3, xr0 + xvmulwod.w.h xr6, xr3, xr0 + xvmaddwev.w.h xr5, xr4, xr1 + xvmaddwod.w.h xr6, xr4, xr1 + xvilvl.w xr7, xr6, xr5 + xvilvh.w xr8, xr6, xr5 + xvssrarni.hu.w xr8, xr7, 10 + xvssrlni.bu.h xr9, xr8, 0 + vstelm.w vr9, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr9, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.w xr9, a0, 0, 4 + add.d a0, a0, a1 + xvstelm.w xr9, a0, 0, 5 + add.d a0, a0, a1 + + xvhaddw.w.h xr3, xr3, xr3 + xvpermi.d xr4, xr3, 0xb1 + xvadd.h xr3, xr3, xr4 + xvpickev.h xr3, xr3, xr3 + xvsub.h xr3, xr3, xr21 + xvssrarni.bu.h xr3, xr3, 2 + vstelm.h vr3, a6, 0, 0 + xvstelm.h xr3, a6, 2, 8 + + addi.d a2, a2, 32 + addi.d a3, a3, 32 + addi.d a6, a6, 4 + blt zero, a5, .WMASK420_W4_LASX + b .END_W420_LASX + +.WMASK420_W8_LASX: + xvld xr0, a2, 0 + xvld xr1, a2, 32 + xvld xr2, a3, 0 + xvld xr3, a3, 32 + addi.w a5, a5, -4 + + xvabsd.h xr4, xr0, xr2 + xvabsd.h xr5, xr1, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr6, xr4, xr20 + xvmin.hu xr7, xr5, xr20 + xvsub.h xr8, xr20, xr6 + xvsub.h xr9, xr20, xr7 + xvmulwev.w.h xr10, xr6, xr0 + xvmulwod.w.h xr11, xr6, xr0 + xvmulwev.w.h xr12, xr7, xr1 + xvmulwod.w.h xr13, xr7, xr1 + xvmaddwev.w.h xr10, xr8, xr2 + xvmaddwod.w.h xr11, xr8, xr2 + xvmaddwev.w.h xr12, xr9, xr3 + xvmaddwod.w.h xr13, xr9, xr3 + xvssrarni.hu.w xr12, xr10, 10 + xvssrarni.hu.w xr13, xr11, 10 + xvssrlni.bu.h xr13, 
xr12, 0 + xvshuf4i.w xr1, xr13, 0x4E + xvilvl.b xr17, xr1, xr13 + vstelm.d vr17, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 2 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.d xr17, a0, 0, 3 + add.d a0, a0, a1 + + xvhaddw.w.h xr6, xr6, xr6 + xvhaddw.w.h xr7, xr7, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.q xr9, xr8, 0x01 + vadd.h vr8, vr8, vr9 + vsub.h vr8, vr8, vr21 + vssrarni.bu.h vr8, vr8, 2 + vstelm.d vr8, a6, 0, 0 + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 8 + blt zero, a5, .WMASK420_W8_LASX + b .END_W420_LASX + +.WMASK420_W16_LASX: + xvld xr0, a2, 0 + xvld xr1, a2, 32 + xvld xr2, a3, 0 + xvld xr3, a3, 32 + addi.w a5, a5, -2 + + xvabsd.h xr4, xr0, xr2 + xvabsd.h xr5, xr1, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr4, xr4, xr20 + xvmin.hu xr5, xr5, xr20 + xvsub.h xr6, xr20, xr4 + xvsub.h xr7, xr20, xr5 + xvmulwev.w.h xr8, xr4, xr0 + xvmulwod.w.h xr9, xr4, xr0 + xvmulwev.w.h xr10, xr5, xr1 + xvmulwod.w.h xr11, xr5, xr1 + xvmaddwev.w.h xr8, xr6, xr2 + xvmaddwod.w.h xr9, xr6, xr2 + xvmaddwev.w.h xr10, xr7, xr3 + xvmaddwod.w.h xr11, xr7, xr3 + xvssrarni.hu.w xr10, xr8, 10 + xvssrarni.hu.w xr11, xr9, 10 + xvssrlni.bu.h xr11, xr10, 0 + xvshuf4i.w xr8, xr11, 0x4E + xvilvl.b xr15, xr8, xr11 + xvpermi.d xr16, xr15, 0xd8 + vst vr16, a0, 0 + add.d a0, a0, a1 + xvpermi.q xr16, xr16, 0x01 + vst vr16, a0, 0 + add.d a0, a0, a1 + + xvhaddw.w.h xr4, xr4, xr4 + xvhaddw.w.h xr5, xr5, xr5 + xvadd.h xr4, xr5, xr4 + xvpickev.h xr6, xr4, xr4 + xvpermi.d xr7, xr6, 0x08 + vsub.h vr7, vr7, vr21 + vssrarni.bu.h vr7, vr7, 2 + vstelm.d vr7, a6, 0, 0 + + addi.d a2, a2, 64 + addi.d a3, a3, 64 + addi.d a6, a6, 8 + blt zero, a5, .WMASK420_W16_LASX + b .END_W420_LASX + +.WMASK420_W32_LASX: +.WMASK420_W64_LASX: +.WMASK420_W128_LASX: + +.LOOP_W32_420_LASX: + add.d t1, a2, zero + add.d t2, a3, zero + add.d t3, a0, zero + add.d t4, a6, zero + alsl.d t5, a4, t1, 1 + alsl.d t6, a4, t2, 1 + or t7, a4, a4 +.W32_420_LASX: + xvld xr0, t1, 0 + xvld xr1, t2, 0 + xvld xr2, t5, 0 + xvld xr3, t6, 0 + addi.d t1, t1, 32 + addi.d t2, t2, 32 + addi.d t5, t5, 32 + addi.d t6, t6, 32 + addi.w t7, t7, -16 + xvabsd.h xr4, xr0, xr1 + xvabsd.h xr5, xr2, xr3 + xvaddi.hu xr4, xr4, 8 + xvaddi.hu xr5, xr5, 8 + xvsrli.h xr4, xr4, 8 + xvsrli.h xr5, xr5, 8 + xvadd.h xr4, xr4, xr22 + xvadd.h xr5, xr5, xr22 + xvmin.hu xr6, xr4, xr20 + xvmin.hu xr7, xr5, xr20 + xvsub.h xr8, xr20, xr6 + xvsub.h xr9, xr20, xr7 + xvmulwev.w.h xr10, xr6, xr0 + xvmulwod.w.h xr11, xr6, xr0 + xvmulwev.w.h xr12, xr7, xr2 + xvmulwod.w.h xr13, xr7, xr2 + xvmaddwev.w.h xr10, xr8, xr1 + xvmaddwod.w.h xr11, xr8, xr1 + xvmaddwev.w.h xr12, xr9, xr3 + xvmaddwod.w.h xr13, xr9, xr3 + xvssrarni.hu.w xr12, xr10, 10 + xvssrarni.hu.w xr13, xr11, 10 + xvssrlni.bu.h xr13, xr12, 0 + xvshuf4i.w xr10, xr13, 0x4E + xvilvl.b xr17, xr10, xr13 + xvpermi.d xr18, xr17, 0x08 + xvpermi.d xr19, xr17, 0x0d + vst vr18, t3, 0 + vstx vr19, t3, a1 + addi.d t3, t3, 16 + + xvhaddw.w.h xr6, xr6, xr6 + xvhaddw.w.h xr7, xr7, xr7 + xvadd.h xr6, xr7, xr6 + xvpickev.h xr7, xr6, xr6 + xvpermi.d xr8, xr7, 0x08 + vsub.h vr9, vr8, vr21 + vssrarni.bu.h vr9, vr9, 2 + vstelm.d vr9, t4, 0, 0 + addi.d t4, t4, 8 + bne t7, zero, .W32_420_LASX + + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + alsl.d a0, a1, a0, 1 + srai.w t8, a4, 1 + add.d a6, a6, t8 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_W32_420_LASX + +.END_W420_LASX: +endfunc + +#undef bpc_sh +#undef bpcw_sh + 
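+/* The w_mask_420 kernels above follow the C reference w_mask_c() quoted
+ * earlier, specialised for 8 bpc with the blend mask stored at 2x2
+ * resolution; the constants used above are mask rounding 8, mask shift 8,
+ * blend rounding 512 and blend shift 10. A scalar sketch of one 2x2 block,
+ * for illustration only (helper names here are hypothetical, not upstream
+ * API):
+ *
+ *   static int blend_px(int t1, int t2, int *m) {
+ *       int d = t1 - t2;
+ *       *m = 38 + (((d < 0 ? -d : d) + 8) >> 8);        // blend weight
+ *       if (*m > 64) *m = 64;
+ *       int px = (t1 * *m + t2 * (64 - *m) + 512) >> 10;
+ *       return px < 0 ? 0 : px > 255 ? 255 : px;        // clip to pixel
+ *   }
+ *   // per 2x2 block: blend the four pixels with blend_px(), then
+ *   //   mask[x] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;
+ */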
+.macro vhaddw.d.h in0 + vhaddw.w.h \in0, \in0, \in0 + vhaddw.d.w \in0, \in0, \in0 +.endm +.macro vhaddw.q.w in0 + vhaddw.d.w \in0, \in0, \in0 + vhaddw.q.d \in0, \in0, \in0 +.endm +.macro PUT_H_8W in0 + vbsrl.v vr2, \in0, 1 + vbsrl.v vr3, \in0, 2 + vbsrl.v vr4, \in0, 3 + vbsrl.v vr5, \in0, 4 + vbsrl.v vr6, \in0, 5 + vbsrl.v vr7, \in0, 6 + vbsrl.v vr10, \in0, 7 + vilvl.d vr2, vr2, \in0 + vilvl.d vr3, vr4, vr3 + vilvl.d vr4, vr6, vr5 + vilvl.d vr5, vr10, vr7 + vdp2.h.bu.b \in0, vr2, vr8 + vdp2.h.bu.b vr2, vr3, vr8 + vdp2.h.bu.b vr3, vr4, vr8 + vdp2.h.bu.b vr4, vr5, vr8 + vhaddw.d.h \in0 + vhaddw.d.h vr2 + vhaddw.d.h vr3 + vhaddw.d.h vr4 + vpickev.w \in0, vr2, \in0 + vpickev.w vr2, vr4, vr3 + vpickev.h \in0, vr2, \in0 + vadd.h \in0, \in0, vr9 +.endm +.macro FILTER_8TAP_4W in0 + vbsrl.v vr10, \in0, 1 + vbsrl.v vr11, \in0, 2 + vbsrl.v vr12, \in0, 3 + vilvl.d vr10, vr10, \in0 + vilvl.d vr11, vr12, vr11 + vdp2.h.bu.b vr7, vr10, vr8 + vdp2.h.bu.b vr10, vr11, vr8 + vhaddw.d.h vr7 + vhaddw.d.h vr10 + vpickev.w \in0, vr10, vr7 +.endm +.macro FILTER_8TAP_8W in0 + vbsrl.v vr10, \in0, 1 + vbsrl.v vr11, \in0, 2 + vbsrl.v vr12, \in0, 3 + vbsrl.v vr13, \in0, 4 + vbsrl.v vr14, \in0, 5 + vbsrl.v vr15, \in0, 6 + vbsrl.v vr16, \in0, 7 + vilvl.d vr10, vr10, \in0 + vilvl.d vr11, vr12, vr11 + vilvl.d vr12, vr14, vr13 + vilvl.d vr13, vr16, vr15 + vdp2.h.bu.b vr14, vr10, vr8 + vdp2.h.bu.b vr15, vr11, vr8 + vdp2.h.bu.b vr16, vr12, vr8 + vdp2.h.bu.b vr17, vr13, vr8 + vhaddw.d.h vr14 + vhaddw.d.h vr15 + vhaddw.d.h vr16 + vhaddw.d.h vr17 + vpickev.w vr13, vr15, vr14 + vpickev.w vr14, vr17, vr16 + vpickev.h \in0, vr14, vr13 //x0 ... x7 + vsrari.h \in0, \in0, 2 +.endm +.macro FILTER_8TAP_8W_CLIP_STORE + vdp2.w.h vr12, vr0, vr9 + vdp2.w.h vr13, vr1, vr9 + vdp2.w.h vr14, vr2, vr9 + vdp2.w.h vr15, vr3, vr9 + vdp2.w.h vr16, vr4, vr9 + vdp2.w.h vr17, vr5, vr9 + vdp2.w.h vr18, vr6, vr9 + vdp2.w.h vr19, vr7, vr9 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vhaddw.q.w vr15 + vhaddw.q.w vr16 + vhaddw.q.w vr17 + vhaddw.q.w vr18 + vhaddw.q.w vr19 + vpackev.w vr12, vr13, vr12 + vpackev.w vr13, vr15, vr14 + vpackev.d vr12, vr13, vr12 + vpackev.w vr14, vr17, vr16 + vpackev.w vr15, vr19, vr18 + vpackev.d vr13, vr15, vr14 + vssrarni.hu.w vr13, vr12, 10 + vssrani.bu.h vr13, vr13, 0 + vstelm.d vr13, a0, 0, 0 + add.d a0, a0, a1 +.endm +.macro VEXTRINS_Hx8 in0 + vextrins.h vr0, \in0, 0x70 + vextrins.h vr1, \in0, 0x71 + vextrins.h vr2, \in0, 0x72 + vextrins.h vr3, \in0, 0x73 + vextrins.h vr4, \in0, 0x74 + vextrins.h vr5, \in0, 0x75 + vextrins.h vr6, \in0, 0x76 + vextrins.h vr7, \in0, 0x77 +.endm +.macro VBSRL_Vx8 + vbsrl.v vr0, vr0, 2 + vbsrl.v vr1, vr1, 2 + vbsrl.v vr2, vr2, 2 + vbsrl.v vr3, vr3, 2 + vbsrl.v vr4, vr4, 2 + vbsrl.v vr5, vr5, 2 + vbsrl.v vr6, vr6, 2 + vbsrl.v vr7, vr7, 2 +.endm + +.macro PUT_8TAP_8BPC_LSX lable + li.w t0, 4 + la.local t6, dav1d_mc_subpel_filters + slli.d t2, a3, 1 //src_stride*2 + add.d t3, t2, a3 //src_stride*3 + slli.d t4, t2, 1 //src_stride*4 + + bnez a6, .l_\lable\()put_h //mx + bnez a7, .l_\lable\()put_v //my + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_hv0_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_hv0_jtable: + .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_8w - 
.l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable + .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable + +.l_\lable\()put_hv0_2w: + vldrepl.h vr0, a2, 0 + add.d a2, a2, a3 + vldrepl.h vr1, a2, 0 + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr1, a0, 0, 0 + add.d a2, a2, a3 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_2w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_4w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_4w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_8w: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_8w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_16w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vst vr0, a0, 0 + vstx vr1, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_16w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_32w: + vld vr0, a2, 0 + vld vr1, a2, 16 + add.d a2, a2, a3 + vld vr2, a2, 0 + vld vr3, a2, 16 + vst vr0, a0, 0 + vst vr1, a0, 16 + add.d a0, a0, a1 + vst vr2, a0, 0 + vst vr3, a0, 16 + add.d a2, a2, a3 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_32w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_64w: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + add.d a2, a2, a3 + vld vr4, a2, 0 + vld vr5, a2, 16 + vld vr6, a2, 32 + vld vr7, a2, 48 + add.d a2, a2, a3 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a0, 32 + vst vr3, a0, 48 + add.d a0, a0, a1 + vst vr4, a0, 0 + vst vr5, a0, 16 + vst vr6, a0, 32 + vst vr7, a0, 48 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_64w + b .l_\lable\()end_put_8tap +.l_\lable\()put_hv0_128w: + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a2, 32 + vld vr3, a2, 48 + vld vr4, a2, 64 + vld vr5, a2, 80 + vld vr6, a2, 96 + vld vr7, a2, 112 + add.d a2, a2, a3 + vld vr8, a2, 0 + vld vr9, a2, 16 + vld vr10, a2, 32 + vld vr11, a2, 48 + vld vr12, a2, 64 + vld vr13, a2, 80 + vld vr14, a2, 96 + vld vr15, a2, 112 + add.d a2, a2, a3 + vst vr0, a0, 0 + vst vr1, a0, 16 + vst vr2, a0, 32 + vst vr3, a0, 48 + vst vr4, a0, 64 + vst vr5, a0, 80 + vst vr6, a0, 96 + vst vr7, a0, 112 + add.d a0, a0, a1 + vst vr8, a0, 0 + vst vr9, a0, 16 + vst vr10, a0, 32 + vst vr11, a0, 48 + vst vr12, a0, 64 + vst vr13, a0, 80 + vst vr14, a0, 96 + vst vr15, a0, 112 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv0_128w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h: + bnez a7, .l_\lable\()put_hv //if(fh) && if (fv) + ld.d t5, sp, 0 //filter_type + andi t1, t5, 3 + blt t0, a4, .l_\lable\()put_h_idx_fh + andi t1, t5, 1 + addi.w t1, t1, 3 + +.l_\lable\()put_h_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + vldrepl.d vr8, t1, 0 + addi.d a2, a2, -3 + li.w t1, 34 + vreplgr2vr.h vr9, t1 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_h_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_h_jtable: + .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_16w - 
.l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable + .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable + +.l_\lable\()put_h_2w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr2, vr0, 1 + vilvl.d vr0, vr2, vr0 + vdp2.h.bu.b vr2, vr0, vr8 + vhaddw.w.h vr0, vr2, vr2 + vhaddw.d.w vr0, vr0, vr0 + vbsrl.v vr2, vr1, 1 + vilvl.d vr1, vr2, vr1 + vdp2.h.bu.b vr2, vr1, vr8 + vhaddw.w.h vr1, vr2, vr2 + vhaddw.d.w vr1, vr1, vr1 + vpickev.w vr0, vr1, vr0 + vpickev.h vr0, vr0, vr0 + vadd.h vr0, vr0, vr9 + vssrani.bu.h vr0, vr0, 6 + + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_h_2w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_4w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr2, vr0, 1 + vbsrl.v vr3, vr0, 2 + vbsrl.v vr4, vr0, 3 + vilvl.d vr0, vr2, vr0 //x0 x1 + vilvl.d vr2, vr4, vr3 //x2 x3 + vdp2.h.bu.b vr3, vr0, vr8 + vdp2.h.bu.b vr4, vr2, vr8 + vhaddw.w.h vr0, vr3, vr3 + vhaddw.d.w vr0, vr0, vr0 + vhaddw.w.h vr2, vr4, vr4 + vhaddw.d.w vr2, vr2, vr2 + vpickev.w vr5, vr2, vr0 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + vbsrl.v vr4, vr1, 3 + vilvl.d vr0, vr2, vr1 //x0 x1 + vilvl.d vr2, vr4, vr3 //x2 x3 + vdp2.h.bu.b vr3, vr0, vr8 + vdp2.h.bu.b vr4, vr2, vr8 + vhaddw.w.h vr0, vr3, vr3 + vhaddw.d.w vr0, vr0, vr0 + vhaddw.w.h vr2, vr4, vr4 + vhaddw.d.w vr2, vr2, vr2 + vpickev.w vr6, vr2, vr0 + vpickev.h vr0, vr6, vr5 + vadd.h vr0, vr0, vr9 + vssrani.bu.h vr0, vr0, 6 + + vstelm.w vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_h_4w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_8w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + PUT_H_8W vr0 + PUT_H_8W vr1 + vssrani.bu.h vr1, vr0, 6 + vstelm.d vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr1, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_h_8w + b .l_\lable\()end_put_8tap + +.l_\lable\()put_h_16w: +.l_\lable\()put_h_32w: +.l_\lable\()put_h_64w: +.l_\lable\()put_h_128w: + addi.d t0, a2, 0 //src + addi.w t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_h_16w_loop: + vld vr0, a2, 0 + vldx vr1, a2, a3 + add.d a2, a2, t2 + PUT_H_8W vr0 + PUT_H_8W vr1 + vssrani.bu.h vr1, vr0, 6 + vstelm.d vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr1, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_h_16w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.w a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_h_16w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v: + ld.d t1, sp, 0 //filter_type + srli.w t1, t1, 2 + blt t0, a5, .l_\lable\()put_v_idx_fv + andi t1, t1, 1 + addi.w t1, t1, 3 + +.l_\lable\()put_v_idx_fv: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a7, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fv's offset + vldrepl.d vr8, t1, 0 + sub.d a2, a2, t3 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_v_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_v_jtable: + .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_8w - 
.l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable + .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable + +.l_\lable\()put_v_2w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t2 + add.d a2, a2, t3 + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + fldx.s f5, a2, t2 + fldx.s f6, a2, t3 + add.d a2, a2, t4 + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 + vilvl.w vr0, vr1, vr0 + +.l_\lable\()put_v_2w_loop: + fld.s f7, a2, 0 //h0 + fldx.s f10, a2, a3 //h1 + add.d a2, a2, t2 + + vextrins.b vr0, vr7, 0x70 + vextrins.b vr0, vr7, 0xf1 + vbsrl.v vr1, vr0, 1 + vextrins.b vr1, vr10, 0x70 + vextrins.b vr1, vr10, 0xf1 + vdp2.h.bu.b vr10, vr0, vr8 + vdp2.h.bu.b vr11, vr1, vr8 + vbsrl.v vr0, vr1, 1 + vhaddw.d.h vr10 + vhaddw.d.h vr11 + vpickev.w vr10, vr11, vr10 + vssrarni.hu.w vr10, vr10, 6 + vssrani.bu.h vr10, vr10, 0 + + vstelm.h vr10, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr10, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_2w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v_4w: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t2 + add.d a2, a2, t3 + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + fldx.s f5, a2, t2 + fldx.s f6, a2, t3 + add.d a2, a2, t4 + + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr0, vr1, vr0 + vilvl.h vr1, vr3, vr2 + vilvl.w vr2, vr1, vr0 + vilvh.w vr3, vr1, vr0 + +.l_\lable\()put_v_4w_loop: + fld.s f7, a2, 0 + fldx.s f10, a2, a3 + add.d a2, a2, t2 + + vextrins.b vr2, vr7, 0x70 + vextrins.b vr2, vr7, 0xf1 //x0x1(h0) + vbsrl.v vr4, vr2, 1 + vextrins.b vr4, vr10, 0x70 + vextrins.b vr4, vr10, 0xf1 //x0x1(h1) + vdp2.h.bu.b vr11, vr2, vr8 + vdp2.h.bu.b vr12, vr4, vr8 + vbsrl.v vr2, vr4, 1 + + vextrins.b vr3, vr7, 0x72 + vextrins.b vr3, vr7, 0xf3 //x2x3(h0) + vbsrl.v vr4, vr3, 1 + vextrins.b vr4, vr10, 0x72 + vextrins.b vr4, vr10, 0xf3 //x2x3(h1) + vdp2.h.bu.b vr13, vr3, vr8 + vdp2.h.bu.b vr14, vr4, vr8 + vbsrl.v vr3, vr4, 1 + + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + + vpickev.w vr11, vr13, vr11 + vpickev.w vr12, vr14, vr12 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + vstelm.w vr11, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr11, a0, 0, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_4w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_v_8w: +.l_\lable\()put_v_16w: +.l_\lable\()put_v_32w: +.l_\lable\()put_v_64w: +.l_\lable\()put_v_128w: + addi.d t0, a2, 0 //src + addi.d t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_v_8w_loop0: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t2 + add.d a2, a2, t3 + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + fldx.d f5, a2, t2 + fldx.d f6, a2, t3 + add.d a2, a2, t4 + + vilvl.b vr0, vr1, vr0 + vilvl.b vr1, vr3, vr2 + vilvl.b vr2, vr5, vr4 + vilvl.b vr3, vr7, vr6 + vilvl.h vr4, vr1, vr0 + vilvh.h vr5, vr1, vr0 + vilvl.h vr6, vr3, vr2 + vilvh.h vr7, vr3, vr2 + vilvl.w vr0, vr6, vr4 // x0x1 + vilvh.w vr1, vr6, vr4 // x2x3 + vilvl.w vr2, vr7, vr5 // x4x5 + vilvh.w vr3, vr7, vr5 // x6x7 +.l_\lable\()put_v_8w_loop: + fld.d f7, a2, 0 + fldx.d f10, a2, a3 + add.d a2, a2, t2 + //h0 + vextrins.b vr0, vr7, 0x70 + vextrins.b vr0, vr7, 0xf1 + vextrins.b vr1, vr7, 0x72 + vextrins.b vr1, vr7, 0xf3 + vextrins.b vr2, vr7, 0x74 + vextrins.b vr2, vr7, 0xf5 + vextrins.b vr3, vr7, 0x76 + vextrins.b vr3, vr7, 0xf7 + vdp2.h.bu.b vr11, vr0, vr8 + vdp2.h.bu.b vr12, vr1, vr8 + 
vdp2.h.bu.b vr13, vr2, vr8 + vdp2.h.bu.b vr14, vr3, vr8 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vpickev.w vr11, vr12, vr11 + vpickev.w vr12, vr14, vr13 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + fst.d f11, a0, 0 + add.d a0, a0, a1 + //h1 + vbsrl.v vr0, vr0, 1 + vbsrl.v vr1, vr1, 1 + vbsrl.v vr2, vr2, 1 + vbsrl.v vr3, vr3, 1 + vextrins.b vr0, vr10, 0x70 + vextrins.b vr0, vr10, 0xf1 + vextrins.b vr1, vr10, 0x72 + vextrins.b vr1, vr10, 0xf3 + vextrins.b vr2, vr10, 0x74 + vextrins.b vr2, vr10, 0xf5 + vextrins.b vr3, vr10, 0x76 + vextrins.b vr3, vr10, 0xf7 + vdp2.h.bu.b vr11, vr0, vr8 + vdp2.h.bu.b vr12, vr1, vr8 + vdp2.h.bu.b vr13, vr2, vr8 + vdp2.h.bu.b vr14, vr3, vr8 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vpickev.w vr11, vr12, vr11 + vpickev.w vr12, vr14, vr13 + vpickev.h vr11, vr12, vr11 + vssrarni.bu.h vr11, vr11, 6 + fst.d f11, a0, 0 + add.d a0, a0, a1 + vbsrl.v vr0, vr0, 1 + vbsrl.v vr1, vr1, 1 + vbsrl.v vr2, vr2, 1 + vbsrl.v vr3, vr3, 1 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_v_8w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_v_8w_loop0 + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv: + ld.d t5, sp, 0 //filter_type + andi t1, t5, 3 + blt t0, a4, .l_\lable\()put_hv_idx_fh + andi t1, t5, 1 + addi.w t1, t1, 3 +.l_\lable\()put_hv_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + vldrepl.d vr8, t1, 0 + ld.d t1, sp, 0 //filter_type + srli.w t1, t1, 2 + blt t0, a5, .l_\lable\()put_hv_idx_fv + andi t1, t1, 1 + addi.w t1, t1, 3 +.l_\lable\()put_hv_idx_fv: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a7, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fv's offset + vldrepl.d vr9, t1, 0 + vexth.h.b vr9, vr9 + + sub.d a2, a2, t3 + addi.d a2, a2, -3 + + clz.w t1, a4 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()put_hv_jtable + alsl.d t1, t1, t5, 3 + ld.d t6, t1, 0 + add.d t5, t5, t6 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()put_hv_jtable: + .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable + .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable + +.l_\lable\()put_hv_2w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + + vbsrl.v vr10, vr0, 1 + vbsrl.v vr11, vr1, 1 + vbsrl.v vr12, vr2, 1 + vbsrl.v vr13, vr3, 1 + vbsrl.v vr14, vr4, 1 + vbsrl.v vr15, vr5, 1 + vbsrl.v vr16, vr6, 1 + vilvl.d vr0, vr10, vr0 + vilvl.d vr1, vr11, vr1 + vilvl.d vr2, vr12, vr2 + vilvl.d vr3, vr13, vr3 + vilvl.d vr4, vr14, vr4 + vilvl.d vr5, vr15, vr5 + vilvl.d vr6, vr16, vr6 + vdp2.h.bu.b vr10, vr0, vr8 + vdp2.h.bu.b vr11, vr1, vr8 + vdp2.h.bu.b vr12, vr2, vr8 + vdp2.h.bu.b vr13, vr3, vr8 + vdp2.h.bu.b vr14, vr4, vr8 + vdp2.h.bu.b vr15, vr5, vr8 + vdp2.h.bu.b vr16, vr6, vr8 + vhaddw.d.h vr10 + vhaddw.d.h vr11 + vhaddw.d.h vr12 + vhaddw.d.h vr13 + vhaddw.d.h vr14 + vhaddw.d.h vr15 + vhaddw.d.h vr16 + + vpackev.w vr10, vr11, vr10 + vpackev.w vr12, vr13, vr12 + vpackod.d vr11, vr12, 
vr10 + vpackev.d vr10, vr12, vr10 + + vpackev.w vr12, vr15, vr14 + vpackev.w vr16, vr17, vr16 + vpackod.d vr13, vr16, vr12 + vpackev.d vr12, vr16, vr12 + + vpickev.h vr10, vr12, vr10 //0 1 2 3 4 5 6 * (h0) + vpickev.h vr11, vr13, vr11 //8 9 10 11 12 13 14 * (h1) + vsrari.h vr10, vr10, 2 + vsrari.h vr11, vr11, 2 +.l_\lable\()put_hv_2w_loop: + vld vr7, a2, 0 + vldx vr12, a2, a3 + add.d a2, a2, t2 + + vbsrl.v vr1, vr7, 1 + vbsrl.v vr2, vr12, 1 + vilvl.d vr0, vr1, vr7 + vilvl.d vr1, vr2, vr12 + vdp2.h.bu.b vr2, vr0, vr8 + vdp2.h.bu.b vr3, vr1, vr8 + vhaddw.d.h vr2 + vhaddw.d.h vr3 + vpickev.w vr2, vr3, vr2 + vpickev.h vr2, vr2, vr2 + vsrari.h vr2, vr2, 2 + vextrins.h vr10, vr2, 0x70 //0 1 2 3 4 5 6 7 + vextrins.h vr11, vr2, 0x71 + vbsrl.v vr12, vr10, 2 + vbsrl.v vr13, vr11, 2 + vextrins.h vr12, vr2, 0x72 //1 2 3 4 5 6 7 8 + vextrins.h vr13, vr2, 0x73 + vdp2.w.h vr0, vr10, vr9 + vdp2.w.h vr1, vr11, vr9 + vdp2.w.h vr2, vr12, vr9 + vdp2.w.h vr3, vr13, vr9 + vhaddw.q.w vr0 + vhaddw.q.w vr1 + vhaddw.q.w vr2 + vhaddw.q.w vr3 + vpackev.w vr0, vr1, vr0 + vpackev.w vr1, vr3, vr2 + vpackev.d vr0, vr1, vr0 + vssrarni.hu.w vr0, vr0, 10 + vssrani.bu.h vr0, vr0, 0 + vbsrl.v vr10, vr12, 2 + vbsrl.v vr11, vr13, 2 + vstelm.h vr0, a0, 0, 0 + add.d a0, a0, a1 + vstelm.h vr0, a0, 0, 1 + add.d a0, a0, a1 + addi.d a5, a5, -2 + bnez a5, .l_\lable\()put_hv_2w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv_4w: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + FILTER_8TAP_4W vr0 //x0 x1 x2 x3 + FILTER_8TAP_4W vr1 + FILTER_8TAP_4W vr2 + FILTER_8TAP_4W vr3 + FILTER_8TAP_4W vr4 + FILTER_8TAP_4W vr5 + FILTER_8TAP_4W vr6 + vpackev.h vr0, vr1, vr0 + vpackev.h vr1, vr3, vr2 + vpackev.h vr2, vr5, vr4 + vpackev.h vr3, vr7, vr6 + vilvl.w vr4, vr1, vr0 + vilvh.w vr5, vr1, vr0 + vilvl.w vr6, vr3, vr2 + vilvh.w vr7, vr3, vr2 + vilvl.d vr0, vr6, vr4 //0 1 2 3 4 5 6 * + vilvh.d vr1, vr6, vr4 + vilvl.d vr2, vr7, vr5 + vilvh.d vr3, vr7, vr5 + vsrari.h vr0, vr0, 2 + vsrari.h vr1, vr1, 2 + vsrari.h vr2, vr2, 2 + vsrari.h vr3, vr3, 2 +.l_\lable\()put_hv_4w_loop: + vld vr4, a2, 0 + vldx vr5, a2, a3 + add.d a2, a2, t2 + FILTER_8TAP_4W vr4 + FILTER_8TAP_4W vr5 + vpickev.h vr4, vr5, vr4 + vsrari.h vr4, vr4, 2 + vextrins.h vr0, vr4, 0x70 + vextrins.h vr1, vr4, 0x71 + vextrins.h vr2, vr4, 0x72 + vextrins.h vr3, vr4, 0x73 + vbsrl.v vr5, vr0, 2 + vbsrl.v vr6, vr1, 2 + vbsrl.v vr7, vr2, 2 + vbsrl.v vr10, vr3, 2 + vextrins.h vr5, vr4, 0x74 + vextrins.h vr6, vr4, 0x75 + vextrins.h vr7, vr4, 0x76 + vextrins.h vr10, vr4, 0x77 + vdp2.w.h vr11, vr0, vr9 + vdp2.w.h vr12, vr1, vr9 + vdp2.w.h vr13, vr2, vr9 + vdp2.w.h vr14, vr3, vr9 + vhaddw.q.w vr11 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vpackev.w vr0, vr12, vr11 + vpackev.w vr1, vr14, vr13 + vpackev.d vr0, vr1, vr0 + vdp2.w.h vr11, vr5, vr9 + vdp2.w.h vr12, vr6, vr9 + vdp2.w.h vr13, vr7, vr9 + vdp2.w.h vr14, vr10, vr9 + vhaddw.q.w vr11 + vhaddw.q.w vr12 + vhaddw.q.w vr13 + vhaddw.q.w vr14 + vpackev.w vr1, vr12, vr11 + vpackev.w vr2, vr14, vr13 + vpackev.d vr1, vr2, vr1 + vssrarni.hu.w vr1, vr0, 10 + vssrani.bu.h vr1, vr1, 0 + vstelm.w vr1, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + add.d a0, a0, a1 + vbsrl.v vr0, vr5, 2 + vbsrl.v vr1, vr6, 2 + vbsrl.v vr2, vr7, 2 + vbsrl.v vr3, vr10, 2 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv_4w_loop + b .l_\lable\()end_put_8tap + +.l_\lable\()put_hv_8w: +.l_\lable\()put_hv_16w: +.l_\lable\()put_hv_32w: 
+.l_\lable\()put_hv_64w: +.l_\lable\()put_hv_128w: + addi.d t0, a2, 0 //src + addi.d t5, a5, 0 //h + addi.d t8, a0, 0 //dst +.l_\lable\()put_hv_8w_loop0: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t2 + add.d a2, a2, t3 + vld vr3, a2, 0 + vldx vr4, a2, a3 + vldx vr5, a2, t2 + vldx vr6, a2, t3 + add.d a2, a2, t4 + FILTER_8TAP_8W vr0 + FILTER_8TAP_8W vr1 + FILTER_8TAP_8W vr2 + FILTER_8TAP_8W vr3 + FILTER_8TAP_8W vr4 + FILTER_8TAP_8W vr5 + FILTER_8TAP_8W vr6 + LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\ + vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17 +.l_\lable\()put_hv_8w_loop: + vld vr20, a2, 0 + vldx vr21, a2, a3 + add.d a2, a2, t2 + FILTER_8TAP_8W vr20 + FILTER_8TAP_8W vr21 + VEXTRINS_Hx8 vr20 + FILTER_8TAP_8W_CLIP_STORE + VBSRL_Vx8 + VEXTRINS_Hx8 vr21 + FILTER_8TAP_8W_CLIP_STORE + VBSRL_Vx8 + addi.w a5, a5, -2 + bnez a5, .l_\lable\()put_hv_8w_loop + addi.d a2, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a5, t5, 0 + addi.w a4, a4, -8 + bnez a4, .l_\lable\()put_hv_8w_loop0 +.l_\lable\()end_put_8tap: +.endm + +function put_8tap_regular_8bpc_lsx + addi.d sp, sp, -16 + st.d zero, sp, 0 + PUT_8TAP_8BPC_LSX 0 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_regular_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 1 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 1 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_regular_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 2 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 2 + addi.d sp, sp, 16 +endfunc + +function put_8tap_regular_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 4 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 4 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 5 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 5 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_smooth_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 6 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 6 + addi.d sp, sp, 16 +endfunc + +function put_8tap_regular_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 8 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 8 + addi.d sp, sp, 16 +endfunc + +function put_8tap_smooth_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 9 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 9 + addi.d sp, sp, 16 +endfunc + +function put_8tap_sharp_8bpc_lsx + addi.d sp, sp, -16 + li.w t0, 10 + st.d t0, sp, 0 + PUT_8TAP_8BPC_LSX 10 + addi.d sp, sp, 16 +endfunc + +const shufb1 +.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8 +endconst + +.macro SHUFB in0, in1, tmp, out + xvbsrl.v \tmp, \in0, 2 + xvpermi.q \tmp, \in0, 0x20 + xvshuf.b \out, \tmp, \tmp, \in1 +.endm + +.macro HADDWDH in0 + xvhaddw.w.h \in0, \in0, \in0 + xvhaddw.d.w \in0, \in0, \in0 +.endm + +.macro HADDWQW in0 + xvhaddw.d.w \in0, \in0, \in0 + xvhaddw.q.d \in0, \in0, \in0 +.endm + +.macro PREP_W16_H in0 + xvbsrl.v xr4, \in0, 4 + xvbsrl.v xr5, \in0, 8 + xvpermi.q xr9, \in0, 0x31 + xvpackev.d xr5, xr9, xr5 + xvbsrl.v xr6, xr5, 4 + SHUFB \in0, xr23, xr9, \in0 + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + xvdp2.h.bu.b xr10, \in0, xr22 + xvdp2.h.bu.b xr11, xr4, xr22 + xvdp2.h.bu.b xr12, xr5, xr22 + xvdp2.h.bu.b xr13, xr6, xr22 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h \in0, xr10, 2 +.endm + +.macro PREP_8TAP_8BPC_LASX lable + li.w t0, 4 + la.local t6, dav1d_mc_subpel_filters + 
la.local t7, shufb1 + xvld xr23, t7, 0 + slli.d t2, a2, 1 //src_stride*2 + add.d t3, t2, a2 //src_stride*3 + slli.d t4, t2, 1 + + bnez a5, .l_\lable\()h //mx + bnez a6, .l_\lable\()v + + clz.w t1, a3 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()prep_hv0_jtable + alsl.d t1, t1, t5, 1 + ld.h t8, t1, 0 + add.d t5, t5, t8 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()prep_hv0_jtable: + .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_64w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_32w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_16w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_8w - .l_\lable\()prep_hv0_jtable + .hword .l_\lable\()hv0_4w - .l_\lable\()prep_hv0_jtable + +.l_\lable\()hv0_4w: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + fldx.s f3, a1, t3 + add.d a1, a1, t4 + xvpackev.w xr0, xr1, xr0 + xvpackev.w xr1, xr3, xr2 + xvpermi.q xr0, xr1, 0x02 + xvsllwil.hu.bu xr0, xr0, 4 + xvst xr0, a0, 0 + addi.d a0, a0, 32 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_4w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_8w: + fld.d f0, a1, 0 + fldx.d f1, a1, a2 + fldx.d f2, a1, t2 + fldx.d f3, a1, t3 + add.d a1, a1, t4 + xvpermi.q xr0, xr1, 0x02 + xvpermi.q xr2, xr3, 0x02 + xvsllwil.hu.bu xr0, xr0, 4 + xvsllwil.hu.bu xr2, xr2, 4 + xvst xr0, a0, 0 + xvst xr2, a0, 32 + addi.d a0, a0, 64 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_8w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_16w: + vld vr0, a1, 0 + vldx vr1, a1, a2 + vldx vr2, a1, t2 + vldx vr3, a1, t3 + add.d a1, a1, t4 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + xvslli.h xr0, xr0, 4 + xvslli.h xr1, xr1, 4 + xvslli.h xr2, xr2, 4 + xvslli.h xr3, xr3, 4 + xvst xr0, a0, 0 + xvst xr1, a0, 32 + xvst xr2, a0, 64 + xvst xr3, a0, 96 + addi.d a0, a0, 128 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_16w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_32w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvpermi.d xr4, xr0, 0xD8 + xvpermi.d xr5, xr1, 0xD8 + xvpermi.d xr6, xr2, 0xD8 + xvpermi.d xr7, xr3, 0xD8 + xvpermi.d xr10, xr0, 0x32 + xvpermi.d xr11, xr1, 0x32 + xvpermi.d xr12, xr2, 0x32 + xvpermi.d xr13, xr3, 0x32 + xvsllwil.hu.bu xr0, xr4, 4 + xvsllwil.hu.bu xr1, xr5, 4 + xvsllwil.hu.bu xr2, xr6, 4 + xvsllwil.hu.bu xr3, xr7, 4 + xvsllwil.hu.bu xr4, xr10, 4 + xvsllwil.hu.bu xr5, xr11, 4 + xvsllwil.hu.bu xr6, xr12, 4 + xvsllwil.hu.bu xr7, xr13, 4 + xvst xr0, a0, 0 + xvst xr4, a0, 32 + xvst xr1, a0, 64 + xvst xr5, a0, 96 + xvst xr2, a0, 128 + xvst xr6, a0, 160 + xvst xr3, a0, 192 + xvst xr7, a0, 224 + addi.d a0, a0, 256 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_32w + b .l_\lable\()end_pre_8tap +.l_\lable\()hv0_64w: +.l_\lable\()hv0_128w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 5 + slli.w t7, t7, 6 + addi.d t8, a0, 0 +.l_\lable\()hv0_32_loop: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvpermi.d xr4, xr0, 0xD8 + xvpermi.d xr5, xr1, 0xD8 + xvpermi.d xr6, xr2, 0xD8 + xvpermi.d xr7, xr3, 0xD8 + xvpermi.d xr10, xr0, 0x32 + xvpermi.d xr11, xr1, 0x32 + xvpermi.d xr12, xr2, 0x32 + xvpermi.d xr13, xr3, 0x32 + xvsllwil.hu.bu xr0, xr4, 4 + xvsllwil.hu.bu xr1, xr5, 4 + xvsllwil.hu.bu xr2, xr6, 4 + xvsllwil.hu.bu xr3, xr7, 4 + xvsllwil.hu.bu xr4, xr10, 4 + xvsllwil.hu.bu xr5, xr11, 4 + xvsllwil.hu.bu xr6, xr12, 4 + xvsllwil.hu.bu xr7, xr13, 4 + xvst xr0, a0, 0 + xvst xr4, a0, 32 + add.d t1, a0, t7 + 
xvst xr1, t1, 0 + xvst xr5, t1, 32 + add.d t1, t1, t7 + xvst xr2, t1, 0 + xvst xr6, t1, 32 + add.d t1, t1, t7 + xvst xr3, t1, 0 + xvst xr7, t1, 32 + add.d a0, t1, t7 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv0_32_loop + addi.d a1, t0, 32 + addi.d t0, t0, 32 + addi.d a0, t8, 64 + addi.d t8, t8, 64 + addi.d a4, t5, 0 + addi.d a3, a3, -32 + bnez a3, .l_\lable\()hv0_32_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()h: + bnez a6, .l_\lable\()hv //if(fh) && if (fv) + + andi t1, a7, 3 + blt t0, a3, .l_\lable\()h_idx_fh + andi t1, a7, 1 + addi.w t1, t1, 3 +.l_\lable\()h_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a5, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + xvldrepl.d xr22, t1, 0 + + addi.d a1, a1, -3 + clz.w t1, a3 + li.w t5, 24 + sub.w t1, t1, t5 + la.local t5, .l_\lable\()prep_h_jtable + alsl.d t1, t1, t5, 1 + ld.h t8, t1, 0 + add.d t5, t5, t8 + jirl $r0, t5, 0 + + .align 3 +.l_\lable\()prep_h_jtable: + .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_64w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_32w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_16w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_8w - .l_\lable\()prep_h_jtable + .hword .l_\lable\()h_4w - .l_\lable\()prep_h_jtable + +.l_\lable\()h_4w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr12, xr1, xr22 + xvdp2.h.bu.b xr14, xr2, xr22 + xvdp2.h.bu.b xr16, xr3, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr14 //h2 + HADDWDH xr16 //h3 + + xvpickev.w xr10, xr12, xr10 + xvpickev.w xr14, xr16, xr14 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr14, xr14, 0xd8 + xvpickev.h xr10, xr14, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvst xr10, a0, 0 + addi.d a0, a0, 32 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()h_4w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_8w: + xvld xr0, a1, 0 + xvldx xr2, a1, a2 + xvldx xr4, a1, t2 + xvldx xr6, a1, t3 + add.d a1, a1, t4 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr10 + SHUFB xr1, xr23, xr9, xr11 + SHUFB xr2, xr23, xr9, xr12 + SHUFB xr3, xr23, xr9, xr13 + SHUFB xr4, xr23, xr9, xr14 + SHUFB xr5, xr23, xr9, xr15 + SHUFB xr6, xr23, xr9, xr16 + SHUFB xr7, xr23, xr9, xr17 + + xvdp2.h.bu.b xr0, xr10, xr22 + xvdp2.h.bu.b xr1, xr11, xr22 + xvdp2.h.bu.b xr2, xr12, xr22 + xvdp2.h.bu.b xr3, xr13, xr22 + xvdp2.h.bu.b xr4, xr14, xr22 + xvdp2.h.bu.b xr5, xr15, xr22 + xvdp2.h.bu.b xr6, xr16, xr22 + xvdp2.h.bu.b xr7, xr17, xr22 + + HADDWDH xr0 + HADDWDH xr1 + HADDWDH xr2 + HADDWDH xr3 + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + + xvpickev.w xr0, xr1, xr0 + xvpickev.w xr2, xr3, xr2 + xvpermi.d xr0, xr0, 0xd8 + xvpermi.d xr2, xr2, 0xd8 + xvpickev.h xr0, xr2, xr0 + xvpermi.d xr0, xr0, 0xd8 + xvsrari.h xr0, xr0, 2 + + xvpickev.w xr4, xr5, xr4 + xvpickev.w xr6, xr7, xr6 + xvpermi.d xr4, xr4, 0xd8 + xvpermi.d xr6, xr6, 0xd8 + xvpickev.h xr4, xr6, xr4 + xvpermi.d xr4, xr4, 0xd8 + xvsrari.h xr4, xr4, 2 + + xvst xr0, a0, 0 + xvst xr4, a0, 32 + addi.d a0, a0, 64 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()h_8w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_16w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + 
PREP_W16_H xr0 + PREP_W16_H xr1 + PREP_W16_H xr2 + PREP_W16_H xr3 + + xvst xr0, a0, 0 + xvst xr1, a0, 32 + xvst xr2, a0, 64 + xvst xr3, a0, 96 + + addi.d a0, a0, 128 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()h_16w + b .l_\lable\()end_pre_8tap + +.l_\lable\()h_32w: +.l_\lable\()h_64w: +.l_\lable\()h_128w: + addi.d t0, a1, 0 //src + addi.d t5, a4, 0 //h + srli.w t7, a3, 4 //w + slli.w t7, t7, 5 //store offset + addi.d t8, a0, 0 //dst +.l_\lable\()h_16_loop: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + + PREP_W16_H xr0 + PREP_W16_H xr1 + PREP_W16_H xr2 + PREP_W16_H xr3 + + xvst xr0, a0, 0 + xvstx xr1, a0, t7 + slli.w t1, t7, 1 + xvstx xr2, a0, t1 + add.w t1, t1, t7 + xvstx xr3, a0, t1 + slli.w t1, t7, 2 + add.d a0, a0, t1 + addi.d a4, a4, -4 + bnez a4, .l_\lable\()h_16_loop + + addi.d a1, t0, 16 + addi.d t0, t0, 16 + addi.d a0, t8, 32 + addi.d t8, t8, 32 + addi.d a4, t5, 0 + addi.d a3, a3, -16 + bnez a3, .l_\lable\()h_16_loop + b .l_\lable\()end_pre_8tap +.l_\lable\()hv: + andi t1, a7, 3 + blt t0, a3, .l_\lable\()hv_idx_fh + andi t1, a7, 1 + addi.w t1, t1, 3 +.l_\lable\()hv_idx_fh: + addi.w t5, zero, 120 + mul.w t1, t1, t5 + addi.w t5, a5, -1 + slli.w t5, t5, 3 + add.w t1, t1, t5 + add.d t1, t6, t1 //fh's offset + xvldrepl.d xr22, t1, 0 + srli.w a7, a7, 2 + blt t0, a4, .l_\lable\()hv_idx_fv + andi a7, a7, 1 + addi.w a7, a7, 3 +.l_\lable\()hv_idx_fv: + addi.w t5, zero, 120 + mul.w a7, a7, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w a7, a7, t5 + add.d a7, t6, a7 //fv's offset + xvldrepl.d xr8, a7, 0 + xvsllwil.h.b xr8, xr8, 0 + + sub.d a1, a1, t3 + addi.d a1, a1, -3 + beq a3, t0, .l_\lable\()hv_4w + b .l_\lable\()hv_8w +.l_\lable\()hv_4w: + xvld xr0, a1, 0 + xvldx xr1, a1, a2 + xvldx xr2, a1, t2 + xvldx xr3, a1, t3 + add.d a1, a1, t4 + xvld xr4, a1, 0 + xvldx xr5, a1, a2 + xvldx xr6, a1, t2 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr11, xr1, xr22 + xvdp2.h.bu.b xr12, xr2, xr22 + xvdp2.h.bu.b xr13, xr3, xr22 + + xvdp2.h.bu.b xr14, xr4, xr22 + xvdp2.h.bu.b xr15, xr5, xr22 + xvdp2.h.bu.b xr16, xr6, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr11 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr12 //h2 + HADDWDH xr13 //h3 + + xvpackev.w xr10, xr11, xr10 + xvpackev.w xr12, xr13, xr12 + xvpackev.d xr11, xr12, xr10 + xvpackod.d xr10, xr12, xr10 + xvpickev.h xr11, xr10, xr11 + xvsrari.h xr11, xr11, 2 + + HADDWDH xr14 //h4 + HADDWDH xr15 //h5 + HADDWDH xr16 //h6 + + xvpackev.w xr14, xr15, xr14 + xvpackev.w xr16, xr17, xr16 + xvpackev.d xr17, xr16, xr14 + xvpackod.d xr14, xr16, xr14 + xvpickev.h xr13, xr14, xr17 + xvsrari.h xr13, xr13, 2 + + xvpackev.d xr18, xr13, xr11 //0 4 8 12 16 20 24 * 2 6 10 14 18 22 26 * + xvpackod.d xr19, xr13, xr11 //1 5 9 13 17 21 25 * 3 7 11 15 19 23 27 * +.l_\lable\()hv_w4_loop: + xvldx xr0, a1, t3 + add.d a1, a1, t4 + xvld xr1, a1, 0 + xvldx xr2, a1, a2 + xvldx xr3, a1, t2 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr12, xr1, xr22 + xvdp2.h.bu.b xr14, xr2, xr22 + xvdp2.h.bu.b xr16, xr3, xr22 + + HADDWDH xr10 //h0 mid0 mid1 mid2 mid3 + HADDWDH xr12 //h1 mid4 mid5 mid6 mid7 + HADDWDH xr14 //h2 + HADDWDH xr16 //h3 + + xvpackev.w xr10, xr12, xr10 + xvpackev.w xr14, xr16, xr14 + xvpackev.d xr12, 
xr14, xr10 + xvpackod.d xr10, xr14, xr10 + xvpickev.h xr12, xr10, xr12 + xvsrari.h xr12, xr12, 2 + + xvextrins.h xr18, xr12, 0x70 //0 4 8 12 16 20 24 0(x0) 2 6 10 14 18 22 26 2(x2) + xvextrins.h xr19, xr12, 0x74 //1 5 9 13 17 21 25 0(x1) 3 7 11 15 19 23 27 2(x3) + + xvdp2.w.h xr0, xr18, xr8 + xvdp2.w.h xr2, xr19, xr8 + HADDWQW xr0 + HADDWQW xr2 + xvpackev.w xr0, xr2, xr0 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x71 + xvextrins.h xr19, xr12, 0x75 + xvdp2.w.h xr2, xr18, xr8 + xvdp2.w.h xr4, xr19, xr8 + HADDWQW xr2 + HADDWQW xr4 + xvpackev.w xr2, xr4, xr2 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x72 + xvextrins.h xr19, xr12, 0x76 + xvdp2.w.h xr4, xr18, xr8 + xvdp2.w.h xr9, xr19, xr8 + HADDWQW xr4 + HADDWQW xr9 + xvpackev.w xr4, xr9, xr4 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + xvextrins.h xr18, xr12, 0x73 + xvextrins.h xr19, xr12, 0x77 + xvdp2.w.h xr9, xr18, xr8 + xvdp2.w.h xr11, xr19, xr8 + HADDWQW xr9 + HADDWQW xr11 + xvpackev.w xr9, xr11, xr9 + + xvpackev.d xr0, xr2, xr0 + xvpackev.d xr4, xr9, xr4 + xvsrari.w xr0, xr0, 6 + xvsrari.w xr4, xr4, 6 + xvpermi.d xr0, xr0, 0xd8 + xvpermi.d xr4, xr4, 0xd8 + xvpickev.h xr0, xr4, xr0 + xvpermi.d xr0, xr0, 0xd8 + xvst xr0, a0, 0 + addi.d a0, a0, 32 + + xvbsrl.v xr18, xr18, 2 + xvbsrl.v xr19, xr19, 2 + + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv_w4_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()hv_8w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 3 + slli.w t7, t7, 4 // store offset + addi.d t8, a0, 0 +.l_\lable\()hv_8w_loop0: + xvld xr0, a1, 0 + xvldx xr2, a1, a2 + xvldx xr4, a1, t2 + xvldx xr6, a1, t3 + + add.d a1, a1, t4 + xvld xr10, a1, 0 + xvldx xr11, a1, a2 + xvldx xr12, a1, t2 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr13 + SHUFB xr1, xr23, xr9, xr14 + SHUFB xr2, xr23, xr9, xr15 + SHUFB xr3, xr23, xr9, xr16 + SHUFB xr4, xr23, xr9, xr17 + SHUFB xr5, xr23, xr9, xr18 + SHUFB xr6, xr23, xr9, xr19 + SHUFB xr7, xr23, xr9, xr20 + + xvdp2.h.bu.b xr0, xr13, xr22 + xvdp2.h.bu.b xr1, xr14, xr22 + xvdp2.h.bu.b xr2, xr15, xr22 + xvdp2.h.bu.b xr3, xr16, xr22 + xvdp2.h.bu.b xr4, xr17, xr22 + xvdp2.h.bu.b xr5, xr18, xr22 + xvdp2.h.bu.b xr6, xr19, xr22 + xvdp2.h.bu.b xr7, xr20, xr22 + + HADDWDH xr0 + HADDWDH xr1 + HADDWDH xr2 + HADDWDH xr3 + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + + xvpackev.w xr0, xr2, xr0 + xvpackev.w xr2, xr6, xr4 + xvpackev.d xr16, xr2, xr0 + xvpackod.d xr0, xr2, xr0 + xvpickev.h xr0, xr0, xr16 + xvsrari.h xr0, xr0, 2 // 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 + + xvpackev.w xr1, xr3, xr1 + xvpackev.w xr3, xr7, xr5 + xvpackev.d xr16, xr3, xr1 + xvpackod.d xr1, xr3, xr1 + xvpickev.h xr1, xr1, xr16 + xvsrari.h xr1, xr1, 2 // 4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31 + + xvbsrl.v xr13, xr10, 4 + xvbsrl.v xr14, xr11, 4 + xvbsrl.v xr15, xr12, 4 + + SHUFB xr10, xr23, xr9, xr10 + SHUFB xr13, xr23, xr9, xr13 + SHUFB xr11, xr23, xr9, xr11 + SHUFB xr14, xr23, xr9, xr14 + SHUFB xr12, xr23, xr9, xr12 + SHUFB xr15, xr23, xr9, xr15 + + xvdp2.h.bu.b xr4, xr10, xr22 + xvdp2.h.bu.b xr5, xr13, xr22 + xvdp2.h.bu.b xr6, xr11, xr22 + xvdp2.h.bu.b xr7, xr14, xr22 + xvdp2.h.bu.b xr9, xr12, xr22 + xvdp2.h.bu.b xr10, xr15, xr22 + + HADDWDH xr4 + HADDWDH xr5 + HADDWDH xr6 + HADDWDH xr7 + HADDWDH xr9 + HADDWDH xr10 + + xvpackev.w xr4, xr6, xr4 + xvpackev.w xr9, xr12, xr9 + xvpackev.d xr16, xr9, xr4 + xvpackod.d xr11, xr9, xr4 + xvpickev.h xr2, xr11, xr16 + xvsrari.h 
xr2, xr2, 2 // 32 40 48 * 33 41 49 * 34 42 50 * 35 43 51 * + + xvpackev.w xr5, xr7, xr5 + xvpackev.w xr10, xr12, xr10 + xvpackev.d xr16, xr10, xr5 + xvpackod.d xr11, xr10, xr5 + xvpickev.h xr3, xr11, xr16 + xvsrari.h xr3, xr3, 2 // 36 44 52 * 37 45 53 * 38 46 54 * 39 47 56 * + + xvpackev.d xr18, xr2, xr0 // 0 8 16 24 32 40 48 * 2 10 18 26 34 42 50 * + xvpackod.d xr19, xr2, xr0 // 1 9 17 25 33 41 49 * 3 11 19 27 35 43 51 * + xvpackev.d xr20, xr3, xr1 // 4 12 20 28 36 44 52 * 6 14 22 30 38 46 54 * + xvpackod.d xr21, xr3, xr1 // 5 13 21 29 37 45 53 * 7 15 23 31 39 47 55 * + +.l_\lable\()hv_8w_loop: + xvldx xr0, a1, t3 + add.d a1, a1, t4 + xvld xr2, a1, 0 + xvldx xr4, a1, a2 + xvldx xr6, a1, t2 + + xvbsrl.v xr1, xr0, 4 + xvbsrl.v xr3, xr2, 4 + xvbsrl.v xr5, xr4, 4 + xvbsrl.v xr7, xr6, 4 + + SHUFB xr0, xr23, xr9, xr0 + SHUFB xr1, xr23, xr9, xr1 + SHUFB xr2, xr23, xr9, xr2 + SHUFB xr3, xr23, xr9, xr3 + SHUFB xr4, xr23, xr9, xr4 + SHUFB xr5, xr23, xr9, xr5 + SHUFB xr6, xr23, xr9, xr6 + SHUFB xr7, xr23, xr9, xr7 + + xvdp2.h.bu.b xr10, xr0, xr22 + xvdp2.h.bu.b xr11, xr1, xr22 + xvdp2.h.bu.b xr12, xr2, xr22 + xvdp2.h.bu.b xr13, xr3, xr22 + xvdp2.h.bu.b xr14, xr4, xr22 + xvdp2.h.bu.b xr15, xr5, xr22 + xvdp2.h.bu.b xr16, xr6, xr22 + xvdp2.h.bu.b xr17, xr7, xr22 + + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + HADDWDH xr14 + HADDWDH xr15 + HADDWDH xr16 + HADDWDH xr17 + + xvpackev.w xr0, xr12, xr10 + xvpackev.w xr2, xr16, xr14 + xvpackev.d xr9, xr2, xr0 + xvpackod.d xr0, xr2, xr0 + xvpickev.h xr0, xr0, xr9 + xvsrari.h xr0, xr0, 2 // 56 64 72 80 57 65 73 81 58 66 74 82 59 67 75 83 + + xvpackev.w xr1, xr13, xr11 + xvpackev.w xr3, xr17, xr15 + xvpackev.d xr9, xr3, xr1 + xvpackod.d xr1, xr3, xr1 + xvpickev.h xr1, xr1, xr9 + xvsrari.h xr1, xr1, 2 // 60 68 76 84 61 69 77 85 62 70 78 86 63 71 79 87 + + xvextrins.h xr18, xr0, 0x70 // 0 8 16 24 32 40 48 (56) 2 10 18 26 34 42 50 (58) + xvextrins.h xr19, xr0, 0x74 // 1 9 17 25 33 41 49 (57) 3 11 19 27 35 43 51 (59) + xvextrins.h xr20, xr1, 0x70 + xvextrins.h xr21, xr1, 0x74 + + //h - 1 + xvdp2.w.h xr10, xr18, xr8 + xvdp2.w.h xr11, xr19, xr8 + xvdp2.w.h xr12, xr20, xr8 + xvdp2.w.h xr13, xr21, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr2, xr11, xr10 //0 1 * * 2 3 * * + xvpackev.w xr3, xr13, xr12 //4 5 * * 6 7 * * + xvpackev.d xr2, xr3, xr2 //0 1 4 5 2 3 6 7 + //h - 2 + xvbsrl.v xr4, xr18, 2 + xvbsrl.v xr5, xr19, 2 + xvbsrl.v xr6, xr20, 2 + xvbsrl.v xr7, xr21, 2 + xvextrins.h xr4, xr0, 0x71 + xvextrins.h xr5, xr0, 0x75 + xvextrins.h xr6, xr1, 0x71 + xvextrins.h xr7, xr1, 0x75 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr14, xr11, xr10 + xvpackev.w xr15, xr13, xr12 + xvpackev.d xr14, xr15, xr14 //8 9 12 13 10 11 14 15 + //h - 3 + xvbsrl.v xr4, xr4, 2 + xvbsrl.v xr5, xr5, 2 + xvbsrl.v xr6, xr6, 2 + xvbsrl.v xr7, xr7, 2 + xvextrins.h xr4, xr0, 0x72 + xvextrins.h xr5, xr0, 0x76 + xvextrins.h xr6, xr1, 0x72 + xvextrins.h xr7, xr1, 0x76 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr15, xr11, xr10 + xvpackev.w xr16, xr13, xr12 + xvpackev.d xr15, xr16, xr15 //16 17 20 21 18 19 22 23 + //h - 4 + xvbsrl.v xr4, xr4, 2 + xvbsrl.v xr5, xr5, 2 + xvbsrl.v xr6, xr6, 2 + xvbsrl.v xr7, xr7, 2 + xvextrins.h xr4, xr0, 0x73 + xvextrins.h xr5, xr0, 0x77 
+ xvextrins.h xr6, xr1, 0x73 + xvextrins.h xr7, xr1, 0x77 + + xvdp2.w.h xr10, xr4, xr8 + xvdp2.w.h xr11, xr5, xr8 + xvdp2.w.h xr12, xr6, xr8 + xvdp2.w.h xr13, xr7, xr8 + + HADDWQW xr10 + HADDWQW xr11 + HADDWQW xr12 + HADDWQW xr13 + + xvpackev.w xr16, xr11, xr10 + xvpackev.w xr17, xr13, xr12 + xvpackev.d xr16, xr17, xr16 //24 25 28 29 26 27 30 31 + + xvsrari.w xr2, xr2, 6 + xvsrari.w xr14, xr14, 6 + xvsrari.w xr15, xr15, 6 + xvsrari.w xr16, xr16, 6 + + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr14, xr14, 0xd8 + xvpermi.d xr15, xr15, 0xd8 + xvpermi.d xr16, xr16, 0xd8 + xvpickev.h xr2, xr14, xr2 + xvpickev.h xr3, xr16, xr15 + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr3, xr3, 0xd8 + + xvpermi.q xr10, xr2, 0x31 + xvpermi.q xr11, xr3, 0x31 + + vst vr2, a0, 0 + vstx vr10, a0, t7 //32 + slli.w t1, t7, 1 //64 + vstx vr3, a0, t1 + add.w t1, t1, t7 //96 + vstx vr11, a0, t1 + slli.w t1, t7, 2 //128 + add.d a0, a0, t1 + + xvbsrl.v xr18, xr4, 2 + xvbsrl.v xr19, xr5, 2 + xvbsrl.v xr20, xr6, 2 + xvbsrl.v xr21, xr7, 2 + + addi.d a4, a4, -4 + bnez a4, .l_\lable\()hv_8w_loop + + addi.d a1, t0, 8 + addi.d t0, t0, 8 + addi.d a0, t8, 16 + addi.d t8, t8, 16 + addi.d a4, t5, 0 + addi.d a3, a3, -8 + bnez a3, .l_\lable\()hv_8w_loop0 + b .l_\lable\()end_pre_8tap +.l_\lable\()v: + + srli.w a7, a7, 2 + blt t0, a4, .l_\lable\()v_idx_fv + andi a7, a7, 1 + addi.w a7, a7, 3 +.l_\lable\()v_idx_fv: + addi.w t5, zero, 120 + mul.w a7, a7, t5 + addi.w t5, a6, -1 + slli.w t5, t5, 3 + add.w a7, a7, t5 + add.d a7, t6, a7 //fv's offset + xvldrepl.d xr8, a7, 0 + + sub.d a1, a1, t3 + beq a3, t0, .l_\lable\()v_4w + blt t0, a3, .l_\lable\()v_8w +.l_\lable\()v_4w: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + add.d a1, a1, t3 + fld.s f3, a1, 0 + fldx.s f4, a1, a2 + fldx.s f5, a1, t2 + fldx.s f6, a1, t3 + + xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 + xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 + xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 + xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 + xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 + xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + xvilvl.w xr2, xr1, xr0 + xvilvh.w xr0, xr1, xr0 + xvpermi.q xr0, xr2, 0x20 + +.l_\lable\()v_4w_loop: + add.d a1, a1, t4 + fld.s f7, a1, 0 //h0 + fldx.s f10, a1, a2 //h1 + fldx.s f11, a1, t2 //h2 + fldx.s f12, a1, t3 //h3 + + xvbsrl.v xr9, xr7, 2 + xvpermi.q xr9, xr7, 0x20 + xvextrins.b xr0, xr9, 0x70 + xvextrins.b xr0, xr9, 0xf1 + + xvbsrl.v xr1, xr0, 1 + xvbsrl.v xr7, xr10, 2 + xvpermi.q xr7, xr10, 0x20 + xvextrins.b xr1, xr7, 0x70 + xvextrins.b xr1, xr7, 0xf1 + + xvbsrl.v xr2, xr1, 1 + xvbsrl.v xr7, xr11, 2 + xvpermi.q xr7, xr11, 0x20 + xvextrins.b xr2, xr7, 0x70 + xvextrins.b xr2, xr7, 0xf1 + + xvbsrl.v xr3, xr2, 1 + xvbsrl.v xr7, xr12, 2 + xvpermi.q xr7, xr12, 0x20 + xvextrins.b xr3, xr7, 0x70 + xvextrins.b xr3, xr7, 0xf1 + xvbsrl.v xr4, xr3, 1 + + xvdp2.h.bu.b xr10, xr0, xr8 + xvdp2.h.bu.b xr11, xr1, xr8 + xvdp2.h.bu.b xr12, xr2, xr8 + xvdp2.h.bu.b xr13, xr3, xr8 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvaddi.bu xr0, xr4, 0 + + xvst xr10, a0, 0 + addi.d a0, a0, 32 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()v_4w_loop + b .l_\lable\()end_pre_8tap + +.l_\lable\()v_8w: + addi.d t0, a1, 0 + addi.d t5, a4, 0 + srli.w t7, a3, 2 + slli.w t7, t7, 3 + addi.d t8, a0, 0 
+.l_\lable\()v_8w_loop0: + fld.s f0, a1, 0 + fldx.s f1, a1, a2 + fldx.s f2, a1, t2 + add.d a1, a1, t3 + fld.s f3, a1, 0 + fldx.s f4, a1, a2 + fldx.s f5, a1, t2 + fldx.s f6, a1, t3 + + xvilvl.b xr0, xr1, xr0 // 0 1 8 9 16 17 24 25 + xvilvl.b xr1, xr3, xr2 // 2 3 10 11 18 19 26 27 + xvilvl.b xr2, xr5, xr4 // 4 5 12 13 20 21 28 29 + xvilvl.b xr3, xr7, xr6 // 6 7 14 15 22 23 30 31 + xvilvl.h xr0, xr1, xr0 // 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 + xvilvl.h xr1, xr3, xr2 // 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + xvilvl.w xr2, xr1, xr0 + xvilvh.w xr0, xr1, xr0 + xvpermi.q xr0, xr2, 0x20 + +.l_\lable\()v_8w_loop: + add.d a1, a1, t4 + fld.s f7, a1, 0 //h0 + fldx.s f10, a1, a2 //h1 + fldx.s f11, a1, t2 //h2 + fldx.s f12, a1, t3 //h3 + + xvbsrl.v xr9, xr7, 2 + xvpermi.q xr9, xr7, 0x20 + xvextrins.b xr0, xr9, 0x70 + xvextrins.b xr0, xr9, 0xf1 + + xvbsrl.v xr1, xr0, 1 + xvbsrl.v xr7, xr10, 2 + xvpermi.q xr7, xr10, 0x20 + xvextrins.b xr1, xr7, 0x70 + xvextrins.b xr1, xr7, 0xf1 + + xvbsrl.v xr2, xr1, 1 + xvbsrl.v xr7, xr11, 2 + xvpermi.q xr7, xr11, 0x20 + xvextrins.b xr2, xr7, 0x70 + xvextrins.b xr2, xr7, 0xf1 + + xvbsrl.v xr3, xr2, 1 + xvbsrl.v xr7, xr12, 2 + xvpermi.q xr7, xr12, 0x20 + xvextrins.b xr3, xr7, 0x70 + xvextrins.b xr3, xr7, 0xf1 + xvbsrl.v xr4, xr3, 1 + + xvdp2.h.bu.b xr10, xr0, xr8 + xvdp2.h.bu.b xr11, xr1, xr8 + xvdp2.h.bu.b xr12, xr2, xr8 + xvdp2.h.bu.b xr13, xr3, xr8 + HADDWDH xr10 + HADDWDH xr11 + HADDWDH xr12 + HADDWDH xr13 + xvpickev.w xr10, xr11, xr10 + xvpickev.w xr11, xr13, xr12 + xvpermi.d xr10, xr10, 0xd8 + xvpermi.d xr11, xr11, 0xd8 + xvpickev.h xr10, xr11, xr10 + xvpermi.d xr10, xr10, 0xd8 + xvsrari.h xr10, xr10, 2 + + xvaddi.bu xr0, xr4, 0 + + xvstelm.d xr10, a0, 0, 0 + add.d a0, a0, t7 + xvstelm.d xr10, a0, 0, 1 + add.d a0, a0, t7 + xvstelm.d xr10, a0, 0, 2 + add.d a0, a0, t7 + xvstelm.d xr10, a0, 0, 3 + add.d a0, a0, t7 + addi.w a4, a4, -4 + bnez a4, .l_\lable\()v_8w_loop + + addi.d a1, t0, 4 + addi.d t0, t0, 4 + addi.d a0, t8, 8 + addi.d t8, t8, 8 + addi.d a4, t5, 0 + addi.d a3, a3, -4 + bnez a3, .l_\lable\()v_8w_loop0 + +.l_\lable\()end_pre_8tap: +.endm + +function prep_8tap_regular_8bpc_lasx + addi.w a7, zero, 0 + PREP_8TAP_8BPC_LASX 0 +endfunc + +function prep_8tap_smooth_regular_8bpc_lasx + addi.w a7, zero, 1 + PREP_8TAP_8BPC_LASX 1 +endfunc + +function prep_8tap_sharp_regular_8bpc_lasx + addi.w a7, zero, 2 + PREP_8TAP_8BPC_LASX 2 +endfunc + +function prep_8tap_regular_smooth_8bpc_lasx + addi.w a7, zero, 4 + PREP_8TAP_8BPC_LASX 4 +endfunc + +function prep_8tap_smooth_8bpc_lasx + addi.w a7, zero, 5 + PREP_8TAP_8BPC_LASX 5 +endfunc + +function prep_8tap_sharp_smooth_8bpc_lasx + addi.w a7, zero, 6 + PREP_8TAP_8BPC_LASX 6 +endfunc + +function prep_8tap_regular_sharp_8bpc_lasx + addi.w a7, zero, 8 + PREP_8TAP_8BPC_LASX 8 +endfunc + +function prep_8tap_smooth_sharp_8bpc_lasx + addi.w a7, zero, 9 + PREP_8TAP_8BPC_LASX 9 +endfunc + +function prep_8tap_sharp_8bpc_lasx + addi.w a7, zero, 10 + PREP_8TAP_8BPC_LASX 10 +endfunc diff --git a/third_party/dav1d/src/loongarch/mc.h b/third_party/dav1d/src/loongarch/mc.h new file mode 100644 index 0000000000..c64b7efc2b --- /dev/null +++ b/third_party/dav1d/src/loongarch/mc.h @@ -0,0 +1,118 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_MC_H +#define DAV1D_SRC_LOONGARCH_MC_H + +#include "config.h" +#include "src/mc.h" +#include "src/cpu.h" + +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) + +decl_avg_fn(BF(dav1d_avg, lsx)); +decl_w_avg_fn(BF(dav1d_w_avg, lsx)); +decl_mask_fn(BF(dav1d_mask, lsx)); +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx)); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx)); +decl_w_mask_fn(BF(dav1d_w_mask_420, lsx)); + +decl_mc_fn(BF(dav1d_put_8tap_regular, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, lsx)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx)); + +decl_avg_fn(BF(dav1d_avg, lasx)); +decl_w_avg_fn(BF(dav1d_w_avg, lasx)); +decl_mask_fn(BF(dav1d_mask, lasx)); +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx)); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx)); +decl_w_mask_fn(BF(dav1d_w_mask_420, lasx)); + +decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx)); + +static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) { +#if BITDEPTH == 8 + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + + c->avg = BF(dav1d_avg, lsx); + c->w_avg = BF(dav1d_w_avg, lsx); + c->mask = BF(dav1d_mask, lsx); + c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx); + c->w_mask[2] = BF(dav1d_w_mask_420, lsx); + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, 
lsx); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return; + + c->avg = BF(dav1d_avg, lasx); + c->w_avg = BF(dav1d_w_avg, lasx); + c->mask = BF(dav1d_mask, lasx); + c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx); + c->w_mask[2] = BF(dav1d_w_mask_420, lasx); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx); +#endif +} + +#endif /* DAV1D_SRC_LOONGARCH_MC_H */ diff --git a/third_party/dav1d/src/loongarch/msac.S b/third_party/dav1d/src/loongarch/msac.S new file mode 100644 index 0000000000..c371eba4de --- /dev/null +++ b/third_party/dav1d/src/loongarch/msac.S @@ -0,0 +1,368 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "loongson_asm.S" + +const min_prob + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 +endconst + +.macro decode_symbol_adapt w + addi.d sp, sp, -48 + addi.d a4, a0, 24 + vldrepl.h vr0, a4, 0 //rng + fst.s f0, sp, 0 //val==0 + vld vr1, a1, 0 //cdf +.if \w == 16 + li.w t4, 16 + vldx vr11, a1, t4 +.endif + addi.d a6, a0, 16 + vldrepl.d vr2, a6, 0 //dif + addi.d t0, a0, 32 + ld.w t1, t0, 0 //allow_update_cdf + la.local t2, min_prob + addi.d t2, t2, 32 + addi.w t3, a2, 1 + slli.w t3, t3, 1 + sub.d t2, t2, t3 + vld vr3, t2, 0 //min_prob +.if \w == 16 + vldx vr13, t2, t4 +.endif + vsrli.h vr4, vr0, 8 //r = s->rng >> 8 + vslli.h vr4, vr4, 8 //r << 8 + vsrli.h vr5, vr1, 6 + vslli.h vr5, vr5, 7 +.if \w == 16 + vsrli.h vr15, vr11, 6 + vslli.h vr15, vr15, 7 +.endif + vmuh.hu vr5, vr4, vr5 + vadd.h vr5, vr5, vr3 //v +.if \w == 16 + vmuh.hu vr15, vr4, vr15 + vadd.h vr15, vr15, vr13 +.endif + addi.d t8, sp, 4 + vst vr5, t8, 0 //store v +.if \w == 16 + vstx vr15, t8, t4 +.endif + vreplvei.h vr20, vr2, 3 //c + vssub.hu vr6, vr5, vr20 //c >=v + vseqi.h vr6, vr6, 0 +.if \w == 16 + vssub.hu vr16, vr15, vr20 //c >=v + vseqi.h vr16, vr16, 0 + vpickev.b vr21, vr16, vr6 +.endif +.if \w <= 8 + vmskltz.h vr10, vr6 +.else + vmskltz.b vr10, vr21 +.endif + beqz t1, .renorm\()\w + + // update_cdf + alsl.d t1, a2, a1, 1 + ld.h t2, t1, 0 //count + srli.w t3, t2, 4 //count >> 4 + addi.w t3, t3, 4 + li.w t5, 2 + sltu t5, t5, a2 + add.w t3, t3, t5 //rate + sltui t5, t2, 32 + add.w t2, t2, t5 //count + (count < 32) + vreplgr2vr.h vr9, t3 + vseq.h vr7, vr7, vr7 + vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768 + vsub.h vr5, vr5, vr1 + vsub.h vr8, vr1, vr6 +.if \w == 16 + vavgr.hu vr15, vr16, vr7 + vsub.h vr15, vr15, vr11 + vsub.h vr18, vr11, vr16 +.endif + vsra.h vr5, vr5, vr9 + vadd.h vr8, vr8, vr5 +.if \w == 4 + fst.d f8, a1, 0 +.else + vst vr8, a1, 0 +.endif +.if \w == 16 + vsra.h vr15, vr15, vr9 + vadd.h vr18, vr18, vr15 + vstx vr18, a1, t4 +.endif + st.h t2, t1, 0 + +.renorm\()\w: + vpickve2gr.h t3, vr10, 0 + ctz.w a7, t3 // ret + alsl.d t3, a7, t8, 1 + ld.hu t4, t3, 0 // v + addi.d t3, t3, -2 + ld.hu t5, t3, 0 // u + sub.w t5, t5, t4 // rng + slli.d t4, t4, 48 + vpickve2gr.d t6, vr2, 0 + sub.d t6, t6, t4 // dif + addi.d t6, t6, 1 + clz.w t4, t5 // d + xori t4, t4, 16 // d + sll.d t6, t6, t4 + addi.d t6, t6, -1 // dif + addi.d a5, a0, 28 // cnt + ld.w t7, a5, 0 + sub.w t7, t7, t4 // cnt-d + sll.w t5, t5, t4 + st.w t5, a4, 0 // store rng + bge t7, zero, 9f + + // refill + ld.d t0, a0, 0 // buf_pos + addi.d t1, a0, 8 + ld.d t1, t1, 0 // buf_end + addi.d t2, t0, 8 + blt t1, t2, 1f + + ld.d t0, t0, 0 // next_bits + addi.w t3, t7, 23 // shift_bits = cnt + 23 + addi.w t7, t7, 16 // cnt += 16 + revb.d t0, t0 // next_bits = bswap(next_bits) + srli.w t4, t3, 3 + sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 + st.d t2, a0, 0 + andi t3, t3, 24 // shift_bits &= 24 + srl.d t0, t0, t3 // next_bits >>= shift_bits + sub.w t3, t3, t7 // shift_bits -= 16 + cnt + sll.d t0, t0, t3 // next_bits <<= shift_bits + li.w t5, 48 + sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits + xor t6, t6, t0 // dif ^= next_bits + b 9f +1: + li.w t4, 40 + sub.w t5, t4, t7 // c = 40 - cnt +2: + bge t0, t1, 3f + ld.bu t2, t0, 0 + addi.d t0, t0, 1 + sll.d t2, t2, t5 + xor t6, t6, t2 + addi.w t5, t5, -8 + bge t5, zero, 2b + // refill_eob_end +3: + st.d t0, a0, 0 // s->buf_pos = buf_pos + sub.w t7, t4, t5 // cnt = 40 - c +9: + st.w t7, a5, 0 // store cnt + st.d t6, a6, 0 // store dif + move a0, a7 + addi.d sp, sp, 48 +.endm + 
+function msac_decode_symbol_adapt4_lsx + decode_symbol_adapt 4 +endfunc + +function msac_decode_symbol_adapt8_lsx + decode_symbol_adapt 8 +endfunc + +function msac_decode_symbol_adapt16_lsx + decode_symbol_adapt 16 +endfunc + +function msac_decode_bool_lsx + ld.w t0, a0, 24 // rng + srli.w a1, a1, 6 + ld.d t1, a0, 16 // dif + srli.w t2, t0, 8 // r >> 8 + mul.w t2, t2, a1 + ld.w a5, a0, 28 // cnt + addi.d t1, t1, 1 // dif + 1 + srli.w t2, t2, 1 + addi.w t2, t2, 4 // v + slli.d t3, t2, 48 // vw + sltu t4, t1, t3 + move t8, t4 // ret + xori t4, t4, 1 + maskeqz t6, t3, t4 // if (ret) vw + sub.d t6, t1, t6 // dif + slli.w t5, t2, 1 + sub.w t5, t0, t5 // r - 2v + maskeqz t7, t5, t4 // if (ret) r - 2v + add.w t5, t2, t7 // v(rng) + + // renorm + clz.w t4, t5 // d + xori t4, t4, 16 // d + sll.d t6, t6, t4 + addi.d t6, t6, -1 // dif + sub.w t7, a5, t4 // cnt-d + sll.w t5, t5, t4 + st.w t5, a0, 24 // store rng + bge t7, zero, 9f + + // refill + ld.d t0, a0, 0 // buf_pos + addi.d t1, a0, 8 + ld.d t1, t1, 0 // buf_end + addi.d t2, t0, 8 + blt t1, t2, 1f + + ld.d t0, t0, 0 // next_bits + addi.w t3, t7, 23 // shift_bits = cnt + 23 + addi.w t7, t7, 16 // cnt += 16 + revb.d t0, t0 // next_bits = bswap(next_bits) + srli.w t4, t3, 3 + sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 + st.d t2, a0, 0 + andi t3, t3, 24 // shift_bits &= 24 + srl.d t0, t0, t3 // next_bits >>= shift_bits + sub.w t3, t3, t7 // shift_bits -= 16 + cnt + sll.d t0, t0, t3 // next_bits <<= shift_bits + li.w t5, 48 + sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits + xor t6, t6, t0 // dif ^= next_bits + b 9f +1: + li.w t4, 40 + sub.w t5, t4, t7 // c = 40 - cnt +2: + bge t0, t1, 3f + ld.bu t2, t0, 0 + addi.d t0, t0, 1 + sll.d t2, t2, t5 + xor t6, t6, t2 + addi.w t5, t5, -8 + bge t5, zero, 2b + // refill_eob_end +3: + st.d t0, a0, 0 // s->buf_pos = buf_pos + sub.w t7, t4, t5 // cnt = 40 - c +9: + st.w t7, a0, 28 // store cnt + st.d t6, a0, 16 // store dif + move a0, t8 +endfunc + +function msac_decode_bool_adapt_lsx + ld.hu a3, a1, 0 // cdf[0] /f + ld.w t0, a0, 24 // rng + ld.d t1, a0, 16 // dif + srli.w t2, t0, 8 // r >> 8 + srli.w a7, a3, 6 + mul.w t2, t2, a7 + ld.w a4, a0, 32 // allow_update_cdf + ld.w a5, a0, 28 // cnt + srli.w t2, t2, 1 + addi.w t2, t2, 4 // v + slli.d t3, t2, 48 // vw + sltu t4, t1, t3 + move t8, t4 // bit + xori t4, t4, 1 + maskeqz t6, t3, t4 // if (ret) vw + sub.d t6, t1, t6 // dif + slli.w t5, t2, 1 + sub.w t5, t0, t5 // r - 2v + maskeqz t7, t5, t4 // if (ret) r - 2v + add.w t5, t2, t7 // v(rng) + beqz a4, .renorm + + // update_cdf + ld.hu t0, a1, 2 // cdf[1] + srli.w t1, t0, 4 + addi.w t1, t1, 4 // rate + sltui t2, t0, 32 // count < 32 + add.w t0, t0, t2 // count + (count < 32) + sub.w a3, a3, t8 // cdf[0] -= bit + slli.w t4, t8, 15 + sub.w t7, a3, t4 // cdf[0] - bit - 32768 + sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate + sub.w t7, a3, t7 // cdf[0] + st.h t7, a1, 0 + st.h t0, a1, 2 + +.renorm: + // renorm + addi.d t6, t6, 1 + clz.w t4, t5 // d + xori t4, t4, 16 // d + sll.d t6, t6, t4 + addi.d t6, t6, -1 // dif + sub.w t7, a5, t4 // cnt-d + sll.w t5, t5, t4 + st.w t5, a0, 24 // store rng + bge t7, zero, 9f + + // refill + ld.d t0, a0, 0 // buf_pos + addi.d t1, a0, 8 + ld.d t1, t1, 0 // buf_end + addi.d t2, t0, 8 + blt t1, t2, 1f + + ld.d t0, t0, 0 // next_bits + addi.w t3, t7, 23 // shift_bits = cnt + 23 + addi.w t7, t7, 16 // cnt += 16 + revb.d t0, t0 // next_bits = bswap(next_bits) + srli.w t4, t3, 3 + sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3 + st.d t2, a0, 0 + andi t3, t3, 24 // shift_bits &= 24 + 
srl.d t0, t0, t3 // next_bits >>= shift_bits + sub.w t3, t3, t7 // shift_bits -= 16 + cnt + sll.d t0, t0, t3 // next_bits <<= shift_bits + li.w t5, 48 + sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits + xor t6, t6, t0 // dif ^= next_bits + b 9f +1: + li.w t4, 40 + sub.w t5, t4, t7 // c = 40 - cnt +2: + bge t0, t1, 3f + ld.bu t2, t0, 0 + addi.d t0, t0, 1 + sll.d t2, t2, t5 + xor t6, t6, t2 + addi.w t5, t5, -8 + bge t5, zero, 2b + // refill_eob_end +3: + st.d t0, a0, 0 // s->buf_pos = buf_pos + sub.w t7, t4, t5 // cnt = 40 - c +9: + st.w t7, a0, 28 // store cnt + st.d t6, a0, 16 // store dif + move a0, t8 +endfunc diff --git a/third_party/dav1d/src/loongarch/msac.h b/third_party/dav1d/src/loongarch/msac.h new file mode 100644 index 0000000000..fdcff838bb --- /dev/null +++ b/third_party/dav1d/src/loongarch/msac.h @@ -0,0 +1,46 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_LOONGARCH_MSAC_H +#define DAV1D_SRC_LOONGARCH_MSAC_H + +unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f); + +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx + +#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */ diff --git a/third_party/dav1d/src/loongarch/refmvs.S b/third_party/dav1d/src/loongarch/refmvs.S new file mode 100644 index 0000000000..63a83d3ce7 --- /dev/null +++ b/third_party/dav1d/src/loongarch/refmvs.S @@ -0,0 +1,152 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/loongarch/loongson_asm.S" + +/* +static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, + const int bx4, const int bw4, int bh4) +*/ + +function splat_mv_lsx + vld vr0, a1, 0 // 0 1 ... 11 ... + clz.w t4, a3 + vaddi.bu vr1, vr0, 0 + addi.w t4, t4, -26 + vextrins.w vr1, vr0, 0x30 // 0 1 2 ... 
11 0 1 2 3 + la.local t5, .SPLAT_LSX_JRTABLE + vbsrl.v vr2, vr1, 4 // 4 5 6 7...11 0 1 2 3 0 0 0 0 + alsl.d t6, t4, t5, 1 + vextrins.w vr2, vr0, 0x31 // 4 5 6 7...11 0 1 2 3 4 5 6 7 + ld.h t7, t6, 0 + vbsrl.v vr3, vr2, 4 // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0 + add.d t8, t5, t7 + alsl.d a2, a2, a2, 1 + vextrins.w vr3, vr0, 0x32 // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11 + slli.w a2, a2, 2 + jirl $r0, t8, 0 + +.SPLAT_LSX_JRTABLE: + .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE + .hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE + +.SPLAT_W1_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + fst.d f1, t3, 0 + fst.s f3, t3, 8 + blt zero, a4, .SPLAT_W1_LSX + b .splat_end +.SPLAT_W2_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + vst vr1, t3, 0 + fst.d f2, t3, 16 + blt zero, a4, .SPLAT_W2_LSX + b .splat_end + +.SPLAT_W4_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + blt zero, a4, .SPLAT_W4_LSX + b .splat_end + +.SPLAT_W8_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + + vst vr1, t3, 48 + vst vr2, t3, 64 + vst vr3, t3, 80 + blt zero, a4, .SPLAT_W8_LSX + b .splat_end + +.SPLAT_W16_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + +.rept 2 + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + + vst vr1, t3, 48 + vst vr2, t3, 64 + vst vr3, t3, 80 + + addi.d t3, t3, 96 +.endr + + blt zero, a4, .SPLAT_W16_LSX + b .splat_end + +.SPLAT_W32_LSX: + ld.d t3, a0, 0 + addi.d a0, a0, 8 + addi.d a4, a4, -1 + add.d t3, t3, a2 + +.rept 4 + vst vr1, t3, 0 + vst vr2, t3, 16 + vst vr3, t3, 32 + + vst vr1, t3, 48 + vst vr2, t3, 64 + vst vr3, t3, 80 + + addi.d t3, t3, 96 +.endr + + blt zero, a4, .SPLAT_W32_LSX + +.splat_end: +endfunc diff --git a/third_party/dav1d/src/loongarch/refmvs.h b/third_party/dav1d/src/loongarch/refmvs.h new file mode 100644 index 0000000000..60ff435c81 --- /dev/null +++ b/third_party/dav1d/src/loongarch/refmvs.h @@ -0,0 +1,44 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Loongson Technology Corporation Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H +#define DAV1D_SRC_LOONGARCH_REFMVS_H + +#include "src/cpu.h" +#include "src/refmvs.h" + +decl_splat_mv_fn(dav1d_splat_mv_lsx); + +static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return; + + c->splat_mv = dav1d_splat_mv_lsx; +} + +#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */ -- cgit v1.2.3