/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function inv_txfm_add_4x4_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)

        jalr t0, a4

        vmv.v.x v4, zero
        vsseg4e16.v v0, (a2)
        vle16.v v0, (a2)
        vse16.v v4, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)
        vse16.v v4, (t0)

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

itx_4x4_end:
        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v4, (a0)
        add t0, a0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, mf2, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf4, ta, ma
        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        vse8.v v4, (a0)
        add a0, a0, a1
        vse8.v v5, (a0)
        add a0, a0, a1
        vse8.v v6, (a0)
        add a0, a0, a1
        vse8.v v7, (a0)

        ret
endfunc

function inv_identity_e16_x4_rvv, export=1, ext=v
        li t1, (5793-4096)*8
        vsmul.vx v4, v0, t1
        vsmul.vx v5, v1, t1
        vsmul.vx v6, v2, t1
        vsmul.vx v7, v3, t1
        vsadd.vv v0, v0, v4
        vsadd.vv v1, v1, v5
        vsadd.vv v2, v2, v6
        vsadd.vv v3, v3, v7
        jr t0
endfunc

.macro iwht_4
        vadd.vv v0, v0, v1
        vsub.vv v5, v2, v3
        vsub.vv v4, v0, v5
        vsra.vi v4, v4, 1
        vsub.vv v2, v4, v1
        vsub.vv v1, v4, v3
        vadd.vv v3, v5, v2
        vsub.vv v0, v0, v1
.endm

.macro idct_4 o0, o1, o2, o3
        li t1, 2896
        li t2, 1567
        li t3, 3784

        vwmul.vx v16, \o0, t1
        vwmul.vx v18, \o0, t1
        vwmacc.vx v16, t1, \o2
        neg t1, t1
        vwmacc.vx v18, t1, \o2

        vwmul.vx v20, \o1, t3
        neg t3, t3
        vwmul.vx v22, \o1, t2
        vwmacc.vx v20, t2, \o3
        vwmacc.vx v22, t3, \o3

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
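        // Final butterfly: the even part (v16, v18) is added to/subtracted
        // from the odd part (v20, v22) to give the four DCT-4 outputs.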
        vsadd.vv \o0, v16, v20
        vsadd.vv \o1, v18, v22
        vssub.vv \o2, v18, v22
        vssub.vv \o3, v16, v20
.endm

.macro iadst_4 o0, o1, o2, o3, lm2, lm
        li t1, 1321
        li t2, 3803
        li t3, 2482

        vwmul.vx v16, v0, t1
        vwmul.vx v18, v0, t3
        neg t1, t1
        vwmacc.vx v16, t2, v2
        vwmacc.vx v18, t1, v2
        neg t2, t2
        vwmacc.vx v16, t3, v3
        vwmacc.vx v18, t2, v3

        vwsub.vv v20, v0, v2
        vwadd.wv v20, v20, v3

        li t1, 3344
        vwmul.vx v22, v1, t1

        vsetvli zero, zero, e32, \lm2, ta, ma
        vmul.vx v20, v20, t1

        vadd.vv v24, v16, v18
        vadd.vv v16, v16, v22
        vadd.vv v18, v18, v22
        vsub.vv v22, v24, v22

        li t1, 2048

        vadd.vx v16, v16, t1
        vadd.vx v18, v18, t1
        vadd.vx v20, v20, t1
        vadd.vx v22, v22, t1

        vsetvli zero, zero, e16, \lm, ta, ma
        vnsra.wi \o0, v16, 12
        vnsra.wi \o1, v18, 12
        vnsra.wi \o2, v20, 12
        vnsra.wi \o3, v22, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
        idct_4 v0, v1, v2, v3
        jr t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
        iadst_4 v0, v1, v2, v3, m1, mf2
        jr t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
        iadst_4 v3, v2, v1, v0, m1, mf2
        jr t0
endfunc

function inv_adst_e16_x4w_rvv, export=1, ext=v
        iadst_4 v0, v1, v2, v3, m2, m1
        jr t0
endfunc

function inv_flipadst_e16_x4w_rvv, export=1, ext=v
        iadst_4 v3, v2, v1, v0, m2, m1
        jr t0
endfunc

function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)

        vsra.vi v0, v0, 2
        vsra.vi v1, v1, 2
        vsra.vi v2, v2, 2
        vsra.vi v3, v3, 2

        iwht_4

        vmv.v.x v4, zero
        vsseg4e16.v v0, (a2)
        vle16.v v0, (a2)
        vse16.v v4, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v2, (t0)
        vse16.v v4, (t0)
        addi t0, t0, 8
        vle16.v v3, (t0)
        vse16.v v4, (t0)

        iwht_4

        j itx_4x4_end
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
        la a4, inv_\txfm1\()_e16_x4_rvv
        la a5, inv_\txfm2\()_e16_x4_rvv
        j inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        ld t2, (a2)
        li t1, 2896*8
        vmv.v.x v0, t2
        vsmul.vx v0, v0, t1
        sd x0, (a2)
        vsmul.vx v0, v0, t1
        vssra.vi v0, v0, 4
        vmv.v.v v1, v0
        vmv.v.v v2, v0
        vmv.v.v v3, v0
        j itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)
        addi t0, t0, 16
        vle16.v v4, (t0)
        addi t0, t0, 16
        vle16.v v5, (t0)
        addi t0, t0, 16
        vle16.v v6, (t0)
        addi t0, t0, 16
        vle16.v v7, (t0)

.ifc \variant, identity_
        // The identity vsadd.vv and downshift vssra.vi 1 cancel out
        j L(itx_8x8_epilog)
.else
        jalr t0, a4

        vssra.vi v0, v0, 1
        vssra.vi v1, v1, 1
        vssra.vi v2, v2, 1
        vssra.vi v3, v3, 1
        vssra.vi v4, v4, 1
        vssra.vi v5, v5, 1
        vssra.vi v6, v6, 1
        vssra.vi v7, v7, 1

L(itx_8x8_epilog):
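        // The segmented store interleaves v0..v7 element by element, which
        // transposes the 8x8 block in memory; the contiguous reloads below
        // fetch the transposed rows for the second (column) pass.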
        vsseg8e16.v v0, (a2)
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)
        addi t0, t0, 16
        vle16.v v4, (t0)
        addi t0, t0, 16
        vle16.v v5, (t0)
        addi t0, t0, 16
        vle16.v v6, (t0)
        addi t0, t0, 16
        vle16.v v7, (t0)

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4
        vssra.vi v4, v4, 4
        vssra.vi v5, v5, 4
        vssra.vi v6, v6, 4
        vssra.vi v7, v7, 4

        li t1, 64
        vsetvli zero, t1, e16, m8, ta, ma
        vmv.v.x v8, zero
        vse16.v v8, (a2)

itx_8x8_end:
        vsetivli zero, 8, e8, mf2, ta, ma
        vle8.v v8, (a0)
        add t0, a0, a1
        vle8.v v9, (t0)
        add t0, t0, a1
        vle8.v v10, (t0)
        add t0, t0, a1
        vle8.v v11, (t0)
        add t0, t0, a1
        vle8.v v12, (t0)
        add t0, t0, a1
        vle8.v v13, (t0)
        add t0, t0, a1
        vle8.v v14, (t0)
        add t0, t0, a1
        vle8.v v15, (t0)

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero
        vmax.vx v4, v4, zero
        vmax.vx v5, v5, zero
        vmax.vx v6, v6, zero
        vmax.vx v7, v7, zero

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        vse8.v v8, (a0)
        add a0, a0, a1
        vse8.v v9, (a0)
        add a0, a0, a1
        vse8.v v10, (a0)
        add a0, a0, a1
        vse8.v v11, (a0)
        add a0, a0, a1
        vse8.v v12, (a0)
        add a0, a0, a1
        vse8.v v13, (a0)
        add a0, a0, a1
        vse8.v v14, (a0)
        add a0, a0, a1
        vse8.v v15, (a0)

        ret
.endif
endfunc
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

function inv_identity_e16_x8_rvv, export=1, ext=v
        vsadd.vv v0, v0, v0
        vsadd.vv v1, v1, v1
        vsadd.vv v2, v2, v2
        vsadd.vv v3, v3, v3
        vsadd.vv v4, v4, v4
        vsadd.vv v5, v5, v5
        vsadd.vv v6, v6, v6
        vsadd.vv v7, v7, v7
        jr t0
endfunc

.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
        idct_4 \o0, \o2, \o4, \o6

        li t1, 799
        li t2, 4017
        li t3, 3406
        li t4, 2276

        vwmul.vx v22, \o1, t2
        neg t2, t2
        vwmul.vx v16, \o1, t1
        vwmacc.vx v22, t1, \o7
        vwmacc.vx v16, t2, \o7

        vwmul.vx v20, \o5, t4
        neg t4, t4
        vwmul.vx v18, \o5, t3
        vwmacc.vx v20, t3, \o3
        vwmacc.vx v18, t4, \o3

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12

        vssub.vv \o7, v22, v20
        vsadd.vv v22, v22, v20
        vssub.vv \o1, v16, v18
        vsadd.vv v16, v16, v18

        li t2, 2896

        vwmul.vx v18, \o7, t2
        vwmul.vx v20, \o7, t2
        vwmacc.vx v20, t2, \o1
        neg t2, t2
        vwmacc.vx v18, t2, \o1

        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1

        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12

        vssub.vv \o7, \o0, v22
        vsadd.vv \o0, \o0, v22
        vssub.vv v17, \o2, v20
        vsadd.vv \o1, \o2, v20
        vssub.vv \o5, \o4, v18
        vsadd.vv \o2, \o4, v18
        vssub.vv \o4, \o6, v16
        vsadd.vv \o3, \o6, v16
        vmv.v.v \o6, v17
.endm

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        li t1, 4076
        li t2, 401
        li t3, 3612
        li t4, 1931
        li t5, 2598
        li t6, 3166

        vwmul.vx v16, v7, t1
        neg t1, t1
        vwmul.vx v18, v7, t2
        vwmacc.vx v16, t2, v0
        vwmacc.vx v18, t1, v0

        vwmul.vx v20, v5, t3
        neg t3, t3
        vwmul.vx v22, v5, t4
        vwmacc.vx v20, t4, v2
        vwmacc.vx v22, t3, v2

        vwmul.vx v24, v3, t5
        neg t5, t5
        vwmul.vx v26, v3, t6
        vwmacc.vx v24, t6, v4
        vwmacc.vx v26, t5, v4

        li t1, 2048
        li t2, 1189
        li t3, 3920
        li t4, 1567
        li t5, 3784
        li t6, 2896

        vwmul.vx v28, v1, t2
        neg t2, t2
        vwmul.vx v30, v1, t3
        vwmacc.vx v28, t3, v6
        vwmacc.vx v30, t2, v6

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

        vssub.vv v4, v16, v24
        vsadd.vv v16, v16, v24
        vsadd.vv v1, v18, v26
        vsadd.vv v2, v20, v28
        vsadd.vv v3, v22, v30
        vssub.vv v5, v18, v26
        vssub.vv v6, v20, v28
        vssub.vv v30, v22, v30

        vsadd.vv \o0, v16, v2
        vsadd.vv \o7, v1, v3
        vssub.vv v2, v16, v2
        vssub.vv v3, v1, v3

        vwmul.vx v16, v4, t5
        vwmul.vx v18, v4, t4
        vwmul.vx v20, v30, t5
        vwmul.vx v22, v30, t4
        vwmacc.vx v16, t4, v5
        neg t4, t4
        vwmacc.vx v22, t5, v6
        neg t5, t5
        vwmacc.vx v20, t4, v6
        vwmacc.vx v18, t5, v5

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12

        vsadd.vv \o1, v16, v20
        vsadd.vv \o6, v18, v22
        vssub.vv v16, v16, v20
        vssub.vv v17, v18, v22

        vwmul.vx v18, v2, t6
        vwmul.vx v20, v2, t6
        vwmul.vx v22, v16, t6
        vwmul.vx v24, v16, t6
        vwmacc.vx v18, t6, v3
        vwmacc.vx v22, t6, v17
        neg t6, t6
        vwmacc.vx v20, t6, v3
        vwmacc.vx v24, t6, v17

        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1

        vnsra.wi \o3, v18, 12
        vnsra.wi \o4, v20, 12
        vnsra.wi \o2, v22, 12
        vnsra.wi \o5, v24, 12

        vmv.v.x v16, zero
        vssub.vv \o1, v16, \o1
        vssub.vv \o3, v16, \o3
        vssub.vv \o5, v16, \o5
        vssub.vv \o7, v16, \o7
.endm

function inv_dct_e16_x8_rvv, export=1, ext=v
        idct_8 v0, v1, v2, v3, v4, v5, v6, v7
        jr t0
endfunc

function inv_adst_e16_x8_rvv, export=1, ext=v
        iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
        jr t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
        iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
        jr t0
endfunc

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
        beqz a3, 1f
.endif
        la a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
        j inv_txfm_identity_add_8x8_rvv
.else
        la a4, inv_\txfm1\()_e16_x8_rvv
        j inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        ld t2, (a2)
        li t1, 2896*8
        vmv.v.x v0, t2
        vsmul.vx v0, v0, t1
        sd x0, (a2)
        vssra.vi v0, v0, 1
        vsmul.vx v0, v0, t1
        vssra.vi v0, v0, 4
        vmv.v.v v1, v0
        vmv.v.v v2, v0
        vmv.v.v v3, v0
        vmv.v.v v4, v0
        vmv.v.v v5, v0
        vmv.v.v v6, v0
        vmv.v.v v7, v0
        j itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

function inv_txfm_add_4x8_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
        addi t0, t0, 16
        vle16.v v2, (t0)
        addi t0, t0, 16
        vle16.v v3, (t0)

        li t1, 2896*8
.irp i, 0, 1, 2, 3
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vsseg4e16.v v0, (a2)

        vsetivli zero, 4, e16, mf2, ta, ma
        vmv.v.x v8, zero
        vle16.v v0, (a2)
        vse16.v v8, (a2)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi a2, a2, 8
        vle16.v v\i, (a2)
        vse16.v v8, (a2)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v8, (a0)
        add t0, a0, a1
        vle8.v v9, (t0)
.irp i, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr
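        // Add the residual to the 8-bit prediction (widening to 16 bits);
        // the vmax/vnclipu below clamp the result back to the 0..255 range.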
        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf4, ta, ma
        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        vse8.v v8, (a0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add a0, a0, a1
        vse8.v v\i, (a0)
.endr

        ret
endfunc

function inv_txfm_add_8x4_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7
        addi t0, t0, 8
        vle16.v v\i, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vsseg8e16.v v0, (a2)

        vsetivli zero, 8, e16, m1, ta, ma
        vmv.v.x v4, zero
        vle16.v v0, (a2)
        vse16.v v4, (a2)
.irp i, 1, 2, 3
        addi a2, a2, 16
        vle16.v v\i, (a2)
        vse16.v v4, (a2)
.endr

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

        vsetvli zero, zero, e8, mf2, ta, ma
        vle8.v v4, (a0)
        add t0, a0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        vse8.v v4, (a0)
        add a0, a0, a1
        vse8.v v5, (a0)
        add a0, a0, a1
        vse8.v v6, (a0)
        add a0, a0, a1
        vse8.v v7, (a0)

        ret
endfunc

/* Define symbols added in .if statement */
.equ dct, 1
.equ identity, 2
.equ adst, 3
.equ flipadst, 4

.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
        la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.else
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
        la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
        j inv_txfm_add_\w\()x\h\()_rvv
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4

function inv_identity_e16_x16_rvv, export=1, ext=v
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vsadd.vv v\i, v\i, v\i
        vsadd.vv v\i, v\i, v16
.endr
        jr t0
endfunc

function inv_dct_e16_x16_rvv, export=1, ext=v
        idct_8 v0, v2, v4, v6, v8, v10, v12, v14

        li t1, 401
        li t2, 4076
        li t3, 3166
        li t4, 2598

        vwmul.vx v30, v1, t2
        neg t2, t2
        vwmul.vx v16, v1, t1
        vwmacc.vx v30, t1, v15
        vwmacc.vx v16, t2, v15

        vwmul.vx v28, v9, t4
        neg t4, t4
        vwmul.vx v18, v9, t3
        vwmacc.vx v28, t3, v7
        vwmacc.vx v18, t4, v7

        li t1, 1931
        li t2, 3612
        li t3, 3920
        li t4, 1189

        vwmul.vx v26, v5, t2
        neg t2, t2
        vwmul.vx v20, v5, t1
        vwmacc.vx v26, t1, v11
        vwmacc.vx v20, t2, v11

        vwmul.vx v24, v13, t4
        neg t4, t4
        vwmul.vx v22, v13, t3
        vwmacc.vx v24, t3, v3
        vwmacc.vx v22, t4, v3
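        // Q12 fixed-point constants: 2048 is the rounding bias for the >>12
        // that follows, 2896 ~ 4096/sqrt(2), 1567 ~ 4096*sin(pi/8) and
        // 3784 ~ 4096*cos(pi/8).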
        li t1, 2048
        li t2, 2896
        li t3, 1567
        li t4, 3784

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

        vssub.vv v3, v16, v18
        vsadd.vv v16, v16, v18
        vssub.vv v5, v22, v20
        vsadd.vv v22, v22, v20
        vssub.vv v11, v24, v26
        vsadd.vv v24, v24, v26
        vssub.vv v13, v30, v28
        vsadd.vv v30, v30, v28

        vwmul.vx v28, v13, t4
        neg t4, t4
        vwmul.vx v18, v13, t3
        vwmul.vx v26, v11, t3
        vwmacc.vx v28, t3, v3
        neg t3, t3
        vwmul.vx v20, v11, t4
        vwmacc.vx v18, t4, v3
        vwmacc.vx v20, t3, v5
        vwmacc.vx v26, t4, v5

        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1

        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12

        vssub.vv v5, v18, v20
        vsadd.vv v18, v18, v20
        vssub.vv v11, v28, v26
        vsadd.vv v28, v28, v26

        vssub.vv v7, v16, v22
        vsadd.vv v16, v16, v22
        vssub.vv v9, v30, v24
        vsadd.vv v30, v30, v24

        vwmul.vx v20, v11, t2
        vwmul.vx v22, v9, t2
        vwmul.vx v24, v9, t2
        vwmul.vx v26, v11, t2
        vwmacc.vx v24, t2, v7
        vwmacc.vx v26, t2, v5
        neg t2, t2
        vwmacc.vx v20, t2, v5
        vwmacc.vx v22, t2, v7

        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1

        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12

        vssub.vv v15, v0, v30
        vsadd.vv v0, v0, v30
        vssub.vv v17, v2, v28
        vsadd.vv v1, v2, v28
        vssub.vv v13, v4, v26
        vsadd.vv v2, v4, v26
        vssub.vv v19, v6, v24
        vsadd.vv v3, v6, v24
        vssub.vv v11, v8, v22
        vsadd.vv v4, v8, v22
        vsadd.vv v5, v10, v20
        vssub.vv v10, v10, v20
        vssub.vv v9, v12, v18
        vsadd.vv v6, v12, v18
        vssub.vv v8, v14, v16
        vsadd.vv v7, v14, v16
        vmv.v.v v14, v17
        vmv.v.v v12, v19

        jr t0
endfunc

.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
        li t1, 4091
        li t2, 201
        li t3, 3973
        li t4, 995

        vwmul.vx v16, v15, t1
        neg t1, t1
        vwmul.vx v18, v15, t2
        vwmacc.vx v16, t2, v0
        vwmacc.vx v18, t1, v0

        vwmul.vx v20, v13, t3
        neg t3, t3
        vwmul.vx v22, v13, t4
        vwmacc.vx v20, t4, v2
        vwmacc.vx v22, t3, v2

        li t1, 3703
        li t2, 1751
        li t3, 3290
        li t4, 2440

        vwmul.vx v24, v11, t1
        neg t1, t1
        vwmul.vx v26, v11, t2
        vwmacc.vx v24, t2, v4
        vwmacc.vx v26, t1, v4

        vwmul.vx v28, v9, t3
        neg t3, t3
        vwmul.vx v30, v9, t4
        vwmacc.vx v28, t4, v6
        vwmacc.vx v30, t3, v6

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v0, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v2, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v4, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v6, v28, 12
        vnsra.wi v30, v30, 12

        li t1, 2751
        li t2, 3035
        li t3, 2106
        li t4, 3513

        vwmul.vx v16, v7, t1
        neg t1, t1
        vwmul.vx v20, v7, t2
        vwmacc.vx v16, t2, v8
        vwmacc.vx v20, t1, v8

        vwmul.vx v24, v5, t3
        neg t3, t3
        vwmul.vx v28, v5, t4
        vwmacc.vx v24, t4, v10
        vwmacc.vx v28, t3, v10

        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v28, v28, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v9, v20, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v11, v28, 12

        vssub.vv v8, v0, v16
        vsadd.vv v0, v0, v16
        vssub.vv v10, v2, v24
        vsadd.vv v2, v2, v24

        li t1, 1380
        li t2, 3857
        li t3, 601
        li t4, 4052

        vwmul.vx v16, v3, t1
        neg t1, t1
        vwmul.vx v20, v3, t2
        vwmacc.vx v16, t2, v12
        vwmacc.vx v20, t1, v12

        vwmul.vx v24, v1, t3
        neg t3, t3
        vwmul.vx v28, v1, t4
        vwmacc.vx v24, t4, v14
        vwmacc.vx v28, t3, v14
        li t1, 2048

        vwadd.wx v16, v16, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v28, v28, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v13, v20, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v15, v28, 12

        vssub.vv v12, v4, v16
        vsadd.vv v16, v4, v16
        vssub.vv v14, v6, v24
        vsadd.vv v20, v6, v24

        vsadd.vv v1, v18, v9
        vssub.vv v9, v18, v9
        vsadd.vv v3, v22, v11
        vssub.vv v11, v22, v11
        vsadd.vv v18, v26, v13
        vssub.vv v13, v26, v13
        vsadd.vv v22, v30, v15
        vssub.vv v15, v30, v15

        vssub.vv v4, v0, v16
        vsadd.vv v0, v0, v16
        vssub.vv v5, v1, v18
        vsadd.vv v1, v1, v18
        vssub.vv v6, v2, v20
        vsadd.vv v2, v2, v20
        vssub.vv v7, v3, v22
        vsadd.vv v3, v3, v22

        li t1, 799
        li t2, 4017
        li t3, 3406
        li t4, 2276

        vwmul.vx v16, v8, t2
        vwmul.vx v18, v8, t1
        vwmul.vx v20, v10, t4
        vwmul.vx v22, v10, t3
        vwmul.vx v24, v13, t2
        vwmul.vx v26, v13, t1
        vwmul.vx v28, v15, t4
        vwmul.vx v30, v15, t3
        vwmacc.vx v16, t1, v9
        neg t1, t1
        vwmacc.vx v20, t3, v11
        neg t3, t3
        vwmacc.vx v26, t2, v12
        neg t2, t2
        vwmacc.vx v30, t4, v14
        neg t4, t4
        vwmacc.vx v18, t2, v9
        vwmacc.vx v22, t4, v11
        vwmacc.vx v24, t1, v12
        vwmacc.vx v28, t3, v14

        li t1, 2048
        li t2, 2896
        li t3, 1567
        li t4, 3784

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

        vsadd.vv v8, v16, v24
        vsadd.vv v9, v18, v26
        vsadd.vv v10, v20, v28
        vsadd.vv v11, v22, v30
        vssub.vv v12, v16, v24
        vssub.vv v13, v18, v26
        vssub.vv v14, v20, v28
        vssub.vv v15, v22, v30

        vwmul.vx v16, v4, t4
        vwmul.vx v18, v4, t3
        vwmul.vx v20, v7, t4
        vwmul.vx v22, v7, t3
        vwmul.vx v24, v12, t4
        vwmul.vx v26, v12, t3
        vwmul.vx v28, v15, t4
        vwmul.vx v30, v15, t3
        vwmacc.vx v16, t3, v5
        vwmacc.vx v22, t4, v6
        vwmacc.vx v24, t3, v13
        neg t3, t3
        vwmacc.vx v30, t4, v14
        neg t4, t4
        vwmacc.vx v20, t3, v6
        vwmacc.vx v28, t3, v14
        vwmacc.vx v18, t4, v5
        vwmacc.vx v26, t4, v13

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi v16, v16, 12
        vnsra.wi v18, v18, 12
        vnsra.wi v20, v20, 12
        vnsra.wi v22, v22, 12
        vnsra.wi v24, v24, 12
        vnsra.wi v26, v26, 12
        vnsra.wi v28, v28, 12
        vnsra.wi v30, v30, 12

.ifc \o0, v0
        vsadd.vv \o14, v9, v11
        vssub.vv v11, v9, v11
        vssub.vv v9, v1, v3
        vsadd.vv \o15, v1, v3
        vsadd.vv \o1, v8, v10
        vssub.vv v10, v8, v10
        vssub.vv v8, v0, v2
        vsadd.vv \o0, v0, v2
.else
        vsadd.vv \o1, v8, v10
        vssub.vv v10, v8, v10
        vssub.vv v8, v0, v2
        vsadd.vv \o0, v0, v2
        vsadd.vv v2, v9, v11
        vssub.vv v11, v9, v11
        vssub.vv v9, v1, v3
        vsadd.vv \o15, v1, v3
        vmv.v.v \o14, v2
.endif

        vsadd.vv \o3, v16, v20
        vssub.vv v6, v16, v20
        vsadd.vv \o12, v18, v22
        vssub.vv v7, v18, v22
        vsadd.vv \o2, v24, v28
        vssub.vv v24, v24, v28
        vsadd.vv \o13, v26, v30
        vssub.vv v26, v26, v30

        neg t3, t2

        vwmul.vx v28, v24, t2
        vwmul.vx v30, v24, t2
        vwmacc.vx v28, t2, v26
        vwmacc.vx v30, t3, v26

        vwmul.vx v24, v10, t2
        vwmul.vx v26, v10, t2
        vwmacc.vx v24, t2, v11
        vwmacc.vx v26, t3, v11

        vwmul.vx v20, v6, t2
        vwmul.vx v22, v6, t2
        vwmacc.vx v20, t2, v7
        vwmacc.vx v22, t3, v7

        vwmul.vx v16, v8, t2
        vwmul.vx v18, v8, t2
        vwmacc.vx v16, t2, v9
        vwmacc.vx v18, t3, v9

        vwadd.wx v16, v16, t1
        vwadd.wx v18, v18, t1
        vwadd.wx v20, v20, t1
        vwadd.wx v22, v22, t1
        vwadd.wx v24, v24, t1
        vwadd.wx v26, v26, t1
        vwadd.wx v28, v28, t1
        vwadd.wx v30, v30, t1

        vnsra.wi \o7, v16, 12
        vnsra.wi \o8, v18, 12
        vnsra.wi \o4, v20, 12
        vnsra.wi \o11, v22, 12
        vnsra.wi \o6, v24, 12
        vnsra.wi \o9, v26, 12
        vnsra.wi \o5, v28, 12
        vnsra.wi \o10, v30, 12

        vmv.v.x v16, zero
        vssub.vv \o1, v16, \o1
        vssub.vv \o3, v16, \o3
        vssub.vv \o5, v16, \o5
        vssub.vv \o7, v16, \o7
        vssub.vv \o9, v16, \o9
        vssub.vv \o11, v16, \o11
        vssub.vv \o13, v16, \o13
        vssub.vv \o15, v16, \o15
.endm

function inv_adst_e16_x16_rvv, export=1, ext=v
        iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
        jr t0
endfunc

function inv_flipadst_e16_x16_rvv, export=1, ext=v
        iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
        jr t0
endfunc

.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
        vmv.v.x v16, zero
        vle16.v v0, (t4)
        vse16.v v16, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        add t4, t4, t6
        vle16.v v\i, (t4)
        vse16.v v16, (t4)
.endr
.ifc \variant, _identity
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vsra.vi v16, v16, 1
        vaadd.vv v\i, v\i, v16
.endr
        j L(horz_16x8_epilog)
.else
        jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 2
.endr
L(horz_16x8_epilog):
        vsse16.v v0, (t5), t6
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t5, t5, 2
        vsse16.v v\i, (t5), t6
.endr
        jr a7
.endif
endfunc
.endm

def_horz_16 _identity
def_horz_16

function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        add t4, t4, t6
        vle16.v v\i, (t4)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 4
.endr

        vsetivli zero, 8, e8, mf2, ta, ma
        vle8.v v16, (t5)
        add t0, t5, a1
        vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v16
        vwaddu.wv v1, v1, v17
        vwaddu.wv v2, v2, v18
        vwaddu.wv v3, v3, v19
        vwaddu.wv v4, v4, v20
        vwaddu.wv v5, v5, v21
        vwaddu.wv v6, v6, v22
        vwaddu.wv v7, v7, v23
        vwaddu.wv v8, v8, v24
        vwaddu.wv v9, v9, v25
        vwaddu.wv v10, v10, v26
        vwaddu.wv v11, v11, v27
        vwaddu.wv v12, v12, v28
        vwaddu.wv v13, v13, v29
        vwaddu.wv v14, v14, v30
        vwaddu.wv v15, v15, v31

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v1, 0
        vnclipu.wi v18, v2, 0
        vnclipu.wi v19, v3, 0
        vnclipu.wi v20, v4, 0
        vnclipu.wi v21, v5, 0
        vnclipu.wi v22, v6, 0
        vnclipu.wi v23, v7, 0
        vnclipu.wi v24, v8, 0
        vnclipu.wi v25, v9, 0
        vnclipu.wi v26, v10, 0
        vnclipu.wi v27, v11, 0
        vnclipu.wi v28, v12, 0
        vnclipu.wi v29, v13, 0
        vnclipu.wi v30, v14, 0
        vnclipu.wi v31, v15, 0

        vse8.v v16, (t5)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t5, t5, a1
        vse8.v v\i, (t5)
.endr

        jr a7
endfunc

function inv_txfm_add_16x16_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -16*32
.irp i, 8, 0
        addi t4, a2, \i*2
        addi t5, sp, \i*16*2
.if \i == 8
        blt a3, a7, 1f
.endif
        li t6, 16*2
        jalr a7, a6
.if \i == 8
        j 2f
1:
        li t1, 64
        vsetvli zero, t1, e16, m8, ta, ma
        vmv.v.x v0, zero
        vse16.v v0, (t5)
        addi t5, t5, 128
        vse16.v v0, (t5)
        vsetivli zero, 8, e16, m1, ta, ma
2:
.endif
.endr
.irp i, 0, 8
        addi t4, sp, \i*2
        addi t5, a0, \i
        li t6, 16*2
        jal a7, inv_txfm_add_vert_8x16_rvv
.endr
        addi sp, sp, 16*32
        ret
endfunc

.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
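        // Pick the row helper (a6) and the column transform (a5); identity
        // rows use the dedicated _identity horz helper, other row transforms
        // go through a4. a7 holds the eob threshold checked against a3 to
        // skip the second half of the row pass when those coefficients are
        // known to be zero.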
.ifc \txfm1, identity
        la a6, inv_txfm_horz_identity_16x8_rvv
.else
        la a6, inv_txfm_horz_16x8_rvv
        la a4, inv_\txfm1\()_e16_x16_rvv
.endif
        la a5, inv_\txfm2\()_e16_x16_rvv
        li a7, \eob_half
        j inv_txfm_add_16x16_rvv
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

.macro def_fn_416_base variant
function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma

        blt a3, a6, 1f

        addi t0, a2, 16
        vle16.v v0, (t0)
        addi t0, t0, 32
        vle16.v v1, (t0)
        addi t0, t0, 32
        vle16.v v2, (t0)
        addi t0, t0, 32
        vle16.v v3, (t0)

.ifc \variant, identity_
        li t1, (5793-4096)*8
        vsmul.vx v8, v0, t1
        vaadd.vv v4, v0, v8
        vsmul.vx v8, v1, t1
        vaadd.vv v5, v1, v8
        vsmul.vx v8, v2, t1
        vaadd.vv v6, v2, v8
        vsmul.vx v8, v3, t1
        vaadd.vv v7, v3, v8
.else
        jalr t0, a4

        vssra.vi v4, v0, 1
        vssra.vi v5, v1, 1
        vssra.vi v6, v2, 1
        vssra.vi v7, v3, 1
.endif
        j 2f

1:
.irp i, 4, 5, 6, 7
        vmv.v.x v\i, zero
.endr

2:
        vle16.v v0, (a2)
        addi t0, a2, 32
        vle16.v v1, (t0)
        addi t0, t0, 32
        vle16.v v2, (t0)
        addi t0, t0, 32
        vle16.v v3, (t0)

.ifc \variant, identity_
        li t1, (5793-4096)*8
.irp i, 0, 1, 2, 3
        vsmul.vx v8, v\i, t1
        vaadd.vv v\i, v\i, v8
.endr
        j L(itx_4x16_epilog)
.else
        jalr t0, a4

        vssra.vi v0, v0, 1
        vssra.vi v1, v1, 1
        vssra.vi v2, v2, 1
        vssra.vi v3, v3, 1

L(itx_4x16_epilog):
        vsseg4e16.v v0, (a2)
        addi t0, a2, 64
        vsseg4e16.v v4, (t0)

        vsetivli zero, 4, e16, mf2, ta, ma
        vmv.v.x v16, zero
        vle16.v v0, (a2)
        vse16.v v16, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
        vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 8
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf4, ta, ma
        vle8.v v16, (a0)
        add t0, a0, a1
        vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v16
        vwaddu.wv v1, v1, v17
        vwaddu.wv v2, v2, v18
        vwaddu.wv v3, v3, v19
        vwaddu.wv v4, v4, v20
        vwaddu.wv v5, v5, v21
        vwaddu.wv v6, v6, v22
        vwaddu.wv v7, v7, v23
        vwaddu.wv v8, v8, v24
        vwaddu.wv v9, v9, v25
        vwaddu.wv v10, v10, v26
        vwaddu.wv v11, v11, v27
        vwaddu.wv v12, v12, v28
        vwaddu.wv v13, v13, v29
        vwaddu.wv v14, v14, v30
        vwaddu.wv v15, v15, v31

        vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf4, ta, ma
        vnclipu.wi v16, v0, 0
        vnclipu.wi v17, v1, 0
        vnclipu.wi v18, v2, 0
        vnclipu.wi v19, v3, 0
        vnclipu.wi v20, v4, 0
        vnclipu.wi v21, v5, 0
        vnclipu.wi v22, v6, 0
        vnclipu.wi v23, v7, 0
        vnclipu.wi v24, v8, 0
        vnclipu.wi v25, v9, 0
        vnclipu.wi v26, v10, 0
        vnclipu.wi v27, v11, 0
        vnclipu.wi v28, v12, 0
        vnclipu.wi v29, v13, 0
        vnclipu.wi v30, v14, 0
        vnclipu.wi v31, v15, 0

        vse8.v v16, (a0)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        add a0, a0, a1
        vse8.v v\i, (a0)
.endr

        ret
.endif
endfunc

function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 4, e16, mf2, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 8
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 8
        vle16.v v\i, (t0)
.endr
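        // Row pass: the identity variant scales in place (the 16-point
        // identity factor 2*sqrt(2) with the usual >>1 row downshift folded
        // into the vssra); any other row transform is called through a4.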
.ifc \variant, identity_
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vssra.vi v16, v16, 1
        vsadd.vv v\i, v\i, v16
.endr
        j L(itx_16x4_epilog)
.else
        jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 1
.endr

L(itx_16x4_epilog):
        li t0, 32
        vssseg8e16.v v0, (a2), t0
        addi t1, a2, 16
        vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
        vsetivli zero, 8, e16, m1, ta, ma
        vmv.v.x v4, zero
        addi t0, a2, \j*2
        vle16.v v0, (t0)
        vse16.v v4, (t0)
.irp i, 1, 2, 3
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v4, (t0)
.endr

        jalr t0, a5

        vssra.vi v0, v0, 4
        vssra.vi v1, v1, 4
        vssra.vi v2, v2, 4
        vssra.vi v3, v3, 4

        vsetvli zero, zero, e8, mf2, ta, ma
        addi t0, a0, \j
        vle8.v v4, (t0)
        add t0, t0, a1
        vle8.v v5, (t0)
        add t0, t0, a1
        vle8.v v6, (t0)
        add t0, t0, a1
        vle8.v v7, (t0)

        vwaddu.wv v0, v0, v4
        vwaddu.wv v1, v1, v5
        vwaddu.wv v2, v2, v6
        vwaddu.wv v3, v3, v7

        vsetvli zero, zero, e16, m1, ta, ma
        vmax.vx v0, v0, zero
        vmax.vx v1, v1, zero
        vmax.vx v2, v2, zero
        vmax.vx v3, v3, zero

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v4, v0, 0
        vnclipu.wi v5, v1, 0
        vnclipu.wi v6, v2, 0
        vnclipu.wi v7, v3, 0

        addi t0, a0, \j
        vse8.v v4, (t0)
        add t0, t0, a1
        vse8.v v5, (t0)
        add t0, t0, a1
        vse8.v v6, (t0)
        add t0, t0, a1
        vse8.v v7, (t0)
.endr

        ret
.endif
endfunc
.endm

def_fn_416_base identity_
def_fn_416_base

.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
        la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.elseif \txfm1 != identity
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
        la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
.if \w == 4
        li a6, \eob_half
.endif
.ifc \txfm1, identity
        j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
        j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4

.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma

        blt a3, a6, 1f

        vmv.v.x v16, zero
        addi t0, a2, 16
        vle16.v v0, (t0)
        vse16.v v16, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        li t1, 2896*8
.ifc \variant, identity_
        vsmul.vx v8, v0, t1
        vsmul.vx v9, v1, t1
        vsmul.vx v10, v2, t1
        vsmul.vx v11, v3, t1
        vsmul.vx v12, v4, t1
        vsmul.vx v13, v5, t1
        vsmul.vx v14, v6, t1
        vsmul.vx v15, v7, t1
.else
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

        jalr t0, a4

        vssra.vi v8, v0, 1
        vssra.vi v9, v1, 1
        vssra.vi v10, v2, 1
        vssra.vi v11, v3, 1
        vssra.vi v12, v4, 1
        vssra.vi v13, v5, 1
        vssra.vi v14, v6, 1
        vssra.vi v15, v7, 1
.endif
        j 2f

1:
.irp i, 8, 9, 10, 11, 12, 13, 14, 15
        vmv.v.x v\i, zero
.endr

2:
        vmv.v.x v16, zero
        vle16.v v0, (a2)
        vse16.v v16, (a2)
        addi t0, a2, 32
        vle16.v v1, (t0)
        vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v16, (t0)
.endr

        li t1, 2896*8
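        // vsmul with 2896*8 scales by 2896/4096 ~ 1/sqrt(2), the pre-scaling
        // applied to the coefficients of these rectangular block sizes.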
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
        j L(itx_8x16_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 1
.endr

L(itx_8x16_epilog):
        addi t4, sp, -8*32
        vsseg8e16.v v0, (t4)
        addi t0, t4, 8*16
        vsseg8e16.v v8, (t0)

        mv t5, a0
        li t6, 16
        jal a7, inv_txfm_add_vert_8x16_rvv

        ret
.endif
endfunc

function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
        csrw vxrm, zero
        vsetivli zero, 8, e16, m1, ta, ma
        vle16.v v0, (a2)
        addi t0, a2, 16
        vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        addi t0, t0, 16
        vle16.v v\i, (t0)
.endr

        li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
        li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vsmul.vx v16, v\i, t1
        vssra.vi v16, v16, 1
        vsadd.vv v\i, v\i, v16
.endr
        j L(itx_16x8_epilog)
.else
        jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        vssra.vi v\i, v\i, 1
.endr

L(itx_16x8_epilog):
        li t0, 32
        vssseg8e16.v v0, (a2), t0
        addi t1, a2, 16
        vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
        vsetivli zero, 8, e16, m1, ta, ma
        vmv.v.x v8, zero
        addi t0, a2, \j*2
        vle16.v v0, (t0)
        vse16.v v8, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
        addi t0, t0, 32
        vle16.v v\i, (t0)
        vse16.v v8, (t0)
.endr

        jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vssra.vi v\i, v\i, 4
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        addi t0, a0, \j
        vle8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vle8.v v\i, (t0)
.endr

        vwaddu.wv v0, v0, v8
        vwaddu.wv v1, v1, v9
        vwaddu.wv v2, v2, v10
        vwaddu.wv v3, v3, v11
        vwaddu.wv v4, v4, v12
        vwaddu.wv v5, v5, v13
        vwaddu.wv v6, v6, v14
        vwaddu.wv v7, v7, v15

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
        vmax.vx v\i, v\i, zero
.endr

        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi v8, v0, 0
        vnclipu.wi v9, v1, 0
        vnclipu.wi v10, v2, 0
        vnclipu.wi v11, v3, 0
        vnclipu.wi v12, v4, 0
        vnclipu.wi v13, v5, 0
        vnclipu.wi v14, v6, 0
        vnclipu.wi v15, v7, 0

        addi t0, a0, \j
        vse8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
        add t0, t0, a1
        vse8.v v\i, (t0)
.endr
.endr

        ret
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.ifnc \txfm1, identity
        la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
        la a5, inv_\txfm2\()_e16_x\h\()_rvv
.if \w == 8
        li a6, \eob_half
.endif
.ifc \txfm1, identity
        j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
        j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm

def_fns_816 8, 16
def_fns_816 16, 8