/* * Copyright © 2023, VideoLAN and dav1d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/loongarch/loongson_asm.S"

//------------------------------------------------------------------------------
// 8-bpc superblock-edge loop filters for LoongArch LSX (128-bit SIMD).
//
// LPF_FUNC at the bottom of this file builds four entry points:
//     lpf_h_sb_y_8bpc_lsx,  lpf_v_sb_y_8bpc_lsx   (luma,   16/8/4-wide filters)
//     lpf_h_sb_uv_8bpc_lsx, lpf_v_sb_uv_8bpc_lsx  (chroma, 6/4-wide filters)
//
// Argument registers as used below:
//     a0 = dst pointer on the edge being filtered
//     a1 = dst stride in bytes
//     a2 = pointer to three 32-bit filter masks (commented "vmask" below)
//     a3 = pointer to per-block filter levels (l[0][0] / l[-1][0])
//     a4 = stride used to step a3 between rows ("h") or to reach l[-1] ("v")
//     a5 = threshold table: E at offset 0, I at offset 64, indexed by level
// NOTE(review): the exact C-side meaning of a2..a5 is inferred from the
// inline comments and from dav1d's lpf_*_sb_* convention -- confirm against
// the C prototypes before relying on it.
//
// Shared register conventions inside the FILTER_W* macros (set up in
// LPF_FUNC before each filter is invoked):
//     vr2 = H threshold, vr3 = E threshold, vr4 = I threshold (byte-splatted)
// Each macro processes 4 consecutive edge positions at once.  \DIR=h filters
// a vertical edge (pixels loaded per-row and transposed into per-lane p/q
// vectors); \DIR=v filters a horizontal edge (pixels loaded per-line).
// Each macro computes the filter-enable mask "fm" and branches straight to
// its end label when no lane needs filtering.
//------------------------------------------------------------------------------

// 4-tap filter: operates on p1 p0 | q0 q1 around the edge.
// On entry vr2/vr3/vr4 = H/E/I.  Clobbers t5, vr6-vr20, vr26, vr29, vr30.
.macro FILTER_W4 DIR, TYPE
.ifc \DIR, h
    // Load 4 rows of 4 bytes (p1 p0 q0 q1) and transpose so that
    // vr6..vr9 each hold one sample type across the 4 rows.
    addi.d        t5,       a0,     -2
    fld.s         f6,       t5,     0         //p1 p0 q0 q1
    fldx.s        f7,       t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fld.s         f8,       t5,     0
    fldx.s        f9,       t5,     a1
    vilvl.b       vr6,      vr7,    vr6
    vilvl.b       vr7,      vr9,    vr8
    vilvl.h       vr6,      vr7,    vr6       //p1p1p1p1
    vbsrl.v       vr7,      vr6,    4         //p0p0p0p0
    vbsrl.v       vr8,      vr7,    4         //q0q0q0q0
    vbsrl.v       vr9,      vr8,    4         //q1q1q1q1
.else
    // Horizontal edge: one 4-byte load per line, no transpose needed.
    sub.d         t5,       a0,     a1
    fld.s         f7,       t5,     0
    sub.d         t5,       t5,     a1
    fld.s         f6,       t5,     0
    fld.s         f8,       a0,     0
    fldx.s        f9,       a0,     a1
.endif
    // Filter decision: fm = (max(|p1-p0|,|q1-q0|) <= I)
    //                    && (|p0-q0|*2 + (|p1-q1|>>1) <= E)
    vabsd.bu      vr10,     vr6,    vr7       // (p1 - p0)
    vabsd.bu      vr11,     vr9,    vr8       // (q1 - q0)
    vabsd.bu      vr12,     vr7,    vr8       // (p0 - q0)
    vabsd.bu      vr13,     vr6,    vr9       // (p1 - q1)
    vmax.bu       vr14,     vr10,   vr11
    vsle.bu       vr15,     vr14,   vr4       //abs(p1 - p0) <= I && abs(q1 - q0) <= I
    vsadd.bu      vr16,     vr12,   vr12
    vsrli.b       vr17,     vr13,   1
    vsadd.bu      vr16,     vr16,   vr17      //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
    vsle.bu       vr16,     vr16,   vr3
    vand.v        vr20,     vr15,   vr16      //fm
    vpickve2gr.wu t5,       vr20,   0
    beqz          t5,       .END_FILTER_\DIR\()\TYPE\()_W4    // nothing to filter
    vslt.bu       vr16,     vr2,    vr14      //hev (high edge variance)
    vsllwil.h.b   vr30,     vr20,   0         //expand fm to w
    vsllwil.w.h   vr30,     vr30,   0
    // f = iclip_diff(p1 - q1), only kept where hev.
    vsllwil.hu.bu vr17,     vr6,    0
    vsllwil.hu.bu vr18,     vr9,    0
    vsub.h        vr17,     vr17,   vr18
    vssrarni.b.h  vr17,     vr17,   0         //f = iclip_diff(p1 - q1)
    vand.v        vr17,     vr17,   vr16
    vsllwil.h.b   vr18,     vr17,   0
    // f = iclip_diff(3 * (q0 - p0) + f), via saturating adds.
    vsllwil.hu.bu vr10,     vr8,    0
    vsllwil.hu.bu vr11,     vr7,    0
    vsub.h        vr10,     vr10,   vr11
    vsadd.h       vr11,     vr10,   vr10
    vsadd.h       vr10,     vr10,   vr11      //3 * (q0 - p0)
    vsadd.h       vr10,     vr10,   vr18      //f = iclip_diff(3 * (q0 - p0) + f);
    vssrani.b.h   vr10,     vr10,   0
    vsllwil.h.b   vr10,     vr10,   0
    // f1 = min(f+4,127)>>3, f2 = min(f+3,127)>>3.
    vaddi.hu      vr11,     vr10,   4
    vaddi.hu      vr12,     vr10,   3
    li.w          t5,       127
    vreplgr2vr.h  vr13,     t5
    vmin.h        vr11,     vr11,   vr13
    vmin.h        vr12,     vr12,   vr13
    vsrai.h       vr11,     vr11,   3         //f1
    vsrai.h       vr12,     vr12,   3         //f2
    // p0 += f2, q0 -= f1 (saturating, then narrowed back to u8).
    vsllwil.hu.bu vr13,     vr7,    0         //p0
    vsllwil.hu.bu vr14,     vr8,    0         //q0
    vsadd.h       vr13,     vr13,   vr12
    vssub.h       vr14,     vr14,   vr11
    vssrani.bu.h  vr13,     vr13,   0         //dst-1
    vssrani.bu.h  vr14,     vr14,   0         //dst+0
    // p1 += (f1+1)>>1, q1 -= (f1+1)>>1, only where !hev.
    vsrari.h      vr15,     vr11,   1         //f
    vsllwil.hu.bu vr18,     vr6,    0         //p1
    vsllwil.hu.bu vr19,     vr9,    0         //q1
    vsadd.h       vr18,     vr18,   vr15
    vssub.h       vr19,     vr19,   vr15
    vssrani.bu.h  vr18,     vr18,   0         //dst-2
    vssrani.bu.h  vr19,     vr19,   0         //dst+1
    vbitsel.v     vr26,     vr18,   vr6,   vr16    // keep old p1 where hev
    vbitsel.v     vr29,     vr19,   vr9,   vr16    // keep old q1 where hev
    // Commit only lanes where fm is set.
    vbitsel.v     vr6,      vr6,    vr26,  vr20
    vbitsel.v     vr7,      vr7,    vr13,  vr20
    vbitsel.v     vr8,      vr8,    vr14,  vr20
    vbitsel.v     vr9,      vr9,    vr29,  vr20
.ifc \DIR, h
    // Re-transpose and store 4 bytes back per row.
    vilvl.b       vr6,      vr7,    vr6
    vilvl.b       vr9,      vr9,    vr8
    vilvl.h       vr6,      vr9,    vr6
    addi.d        t5,       a0,     -2
    vstelm.w      vr6,      t5,     0,     0
    add.d         t5,       t5,     a1
    vstelm.w      vr6,      t5,     0,     1
    add.d         t5,       t5,     a1
    vstelm.w      vr6,      t5,     0,     2
    add.d         t5,       t5,     a1
    vstelm.w      vr6,      t5,     0,     3
.else
    fst.s         f8,       a0,     0
    fstx.s        f9,       a0,     a1
    sub.d         t5,       a0,     a1
    fst.s         f7,       t5,     0
    sub.d         t5,       t5,     a1
    fst.s         f6,       t5,     0
.endif
.END_FILTER_\DIR\()\TYPE\()_W4:
.endm

// 6-tap filter (chroma): operates on p2 p1 p0 | q0 q1 q2.
// Where flat8in holds, a (1,2,2,2,...)/8 smoothing is used; otherwise the
// 4-tap update above is applied.  Clobbers t5, vr0-vr2, vr6-vr29.
.macro FILTER_W6 DIR, TYPE
.ifc \DIR, h
    // Load 4 rows of 8 bytes around the edge and transpose; after the
    // shifts vr6..vr11 hold p2 p1 p0 q0 q1 q2 (4 lanes each).
    addi.d        t5,       a0,     -3
    fld.d         f6,       t5,     0         //p2 p1 p0 q0 q1 q2
    fldx.d        f7,       t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fld.d         f8,       t5,     0
    fldx.d        f9,       t5,     a1
    vilvl.b       vr6,      vr7,    vr6
    vilvl.b       vr7,      vr9,    vr8
    vilvh.h       vr10,     vr7,    vr6
    vilvl.h       vr6,      vr7,    vr6
    vbsrl.v       vr7,      vr6,    4         //p1
    vbsrl.v       vr8,      vr7,    4         //p0
    vbsrl.v       vr9,      vr8,    4         //q0
    vbsrl.v       vr11,     vr10,   4         //q2
.else
    // Horizontal edge: vr6..vr11 = p2 p1 p0 q0 q1 q2, one row each.
    alsl.d        t5,       a1,     a1,    1
    sub.d         t5,       a0,     t5
    fld.d         f6,       t5,     0
    fldx.d        f7,       t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fld.d         f8,       t5,     0
    fldx.d        f9,       t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fld.d         f10,      t5,     0
    fldx.d        f11,      t5,     a1
.endif
    vabsd.bu      vr12,     vr7,    vr8       //abs(p1-p0)
    vabsd.bu      vr13,     vr10,   vr9       //abs(q1-q0)
    vmax.bu       vr14,     vr12,   vr13
    vslt.bu       vr2,      vr2,    vr14      //hev (vr2 H threshold consumed here)
    vabsd.bu      vr12,     vr6,    vr7       //abs(p2-p1)
    vmax.bu       vr12,     vr12,   vr14
    vabsd.bu      vr13,     vr11,   vr10      //abs(q2-q1)
    vmax.bu       vr12,     vr12,   vr13
    vsle.bu       vr0,      vr12,   vr4       // <=I
    vabsd.bu      vr13,     vr8,    vr9       //abs(p0-q0)
    vsadd.bu      vr13,     vr13,   vr13
    vabsd.bu      vr15,     vr7,    vr10
    vsrli.b       vr15,     vr15,   1
    vsadd.bu      vr13,     vr13,   vr15
    vsle.bu       vr13,     vr13,   vr3       //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
    vand.v        vr0,      vr0,    vr13      //fm
    vpickve2gr.wu t5,       vr0,    0
    beqz          t5,       .END_FILTER_\DIR\()\TYPE\()_W6
    // flat8in: all of |p2-p0|, |q2-q0|, |p1-p0|, |q1-q0| <= 1 (F).
    vabsd.bu      vr12,     vr6,    vr8       //abs(p2-p0)
    vabsd.bu      vr13,     vr11,   vr9       //abs(q2-q0)
    vmax.bu       vr12,     vr12,   vr14
    vmax.bu       vr12,     vr12,   vr13
    vxor.v        vr13,     vr13,   vr13
    vaddi.bu      vr13,     vr13,   1
    vsle.bu       vr1,      vr12,   vr13      //flat8in
    //6789 10 11 --expand to h
    vsllwil.hu.bu vr12,     vr6,    0
    vsllwil.hu.bu vr13,     vr7,    0
    vsllwil.hu.bu vr14,     vr8,    0
    vsllwil.hu.bu vr15,     vr9,    0
    vsllwil.hu.bu vr16,     vr10,   0
    vsllwil.hu.bu vr17,     vr11,   0
    // Flat path: 8-tap box sums, rounded >>3.  Each dst reuses the previous
    // sum, adding the new taps and subtracting the expired ones.
    //dst-2
    vsadd.hu      vr18,     vr12,   vr12
    vsadd.hu      vr18,     vr18,   vr12
    vsadd.hu      vr18,     vr18,   vr13
    vsadd.hu      vr18,     vr18,   vr13
    vsadd.hu      vr18,     vr18,   vr14
    vsadd.hu      vr18,     vr18,   vr14
    vsadd.hu      vr18,     vr18,   vr15
    //dst-1
    vsadd.hu      vr19,     vr18,   vr15
    vsadd.hu      vr19,     vr19,   vr16
    vssub.hu      vr19,     vr19,   vr12
    vssub.hu      vr19,     vr19,   vr12
    //dst+0
    vsadd.hu      vr20,     vr19,   vr17
    vsadd.hu      vr20,     vr20,   vr16
    vssub.hu      vr20,     vr20,   vr12
    vssub.hu      vr20,     vr20,   vr13
    //dst+1
    vsadd.hu      vr21,     vr20,   vr17
    vsadd.hu      vr21,     vr21,   vr17
    vssub.hu      vr21,     vr21,   vr13
    vssub.hu      vr21,     vr21,   vr14
    vsrari.h      vr18,     vr18,   3
    vsrari.h      vr19,     vr19,   3
    vsrari.h      vr20,     vr20,   3
    vsrari.h      vr21,     vr21,   3
    // Non-flat path: same 4-tap math as FILTER_W4.
    vsub.h        vr22,     vr13,   vr16
    vssrani.b.h   vr22,     vr22,   0
    vand.v        vr22,     vr22,   vr2
    vsllwil.h.b   vr22,     vr22,   0         //f = iclip_diff(p1 - q1);
    vsub.h        vr23,     vr15,   vr14
    vsadd.h       vr24,     vr23,   vr23
    vsadd.h       vr23,     vr23,   vr24
    vsadd.h       vr23,     vr23,   vr22
    vssrani.b.h   vr23,     vr23,   0
    vsllwil.h.b   vr23,     vr23,   0         //f = iclip_diff(3 * (q0 - p0) + f);
    vaddi.hu      vr24,     vr23,   4
    vaddi.hu      vr25,     vr23,   3
    li.w          t5,       127
    vreplgr2vr.h  vr3,      t5                // vr3 (E) no longer needed; reused
    vmin.h        vr24,     vr24,   vr3
    vmin.h        vr25,     vr25,   vr3
    vsrai.h       vr24,     vr24,   3         //f1
    vsrai.h       vr25,     vr25,   3         //f2
    vsadd.h       vr26,     vr14,   vr25      //dst-1
    vssub.h       vr27,     vr15,   vr24      //dst+0
    vsrari.h      vr24,     vr24,   1
    vsadd.h       vr28,     vr13,   vr24
    vssub.h       vr29,     vr16,   vr24
    vsllwil.h.b   vr2,      vr2,    0
    vbitsel.v     vr28,     vr28,   vr13,  vr2     //dst-2 (keep p1 where hev)
    vbitsel.v     vr29,     vr29,   vr16,  vr2     //dst+1 (keep q1 where hev)
    //flat8in: choose flat vs non-flat result per lane
    vsllwil.h.b   vr1,      vr1,    0
    vbitsel.v     vr18,     vr28,   vr18,  vr1
    vbitsel.v     vr19,     vr26,   vr19,  vr1
    vbitsel.v     vr20,     vr27,   vr20,  vr1
    vbitsel.v     vr21,     vr29,   vr21,  vr1
    vssrani.bu.h  vr18,     vr18,   0
    vssrani.bu.h  vr19,     vr19,   0
    vssrani.bu.h  vr20,     vr20,   0
    vssrani.bu.h  vr21,     vr21,   0
    // Commit only lanes where fm is set.
    vbitsel.v     vr7,      vr7,    vr18,  vr0     //p1
    vbitsel.v     vr8,      vr8,    vr19,  vr0     //p0
    vbitsel.v     vr9,      vr9,    vr20,  vr0     //q0
    vbitsel.v     vr10,     vr10,   vr21,  vr0     //q1
.ifc \DIR, h
    vilvl.b       vr7,      vr8,    vr7
    vilvl.b       vr9,      vr10,   vr9
    vilvl.h       vr7,      vr9,    vr7
    addi.d        t5,       a0,     -2
    vstelm.w      vr7,      t5,     0,     0
    add.d         t5,       t5,     a1
    vstelm.w      vr7,      t5,     0,     1
    add.d         t5,       t5,     a1
    vstelm.w      vr7,      t5,     0,     2
    add.d         t5,       t5,     a1
    vstelm.w      vr7,      t5,     0,     3
.else
    fst.s         f9,       a0,     0
    fstx.s        f10,      a0,     a1
    sub.d         t5,       a0,     a1
    fst.s         f8,       t5,     0
    sub.d         t5,       t5,     a1
    fst.s         f7,       t5,     0
.endif
.END_FILTER_\DIR\()\TYPE\()_W6:
.endm

// 8-tap filter (luma): operates on p3 p2 p1 p0 | q0 q1 q2 q3.
// Where flat8in holds, the (1,1,1,2,1,1,1)/8-style smoothing updates
// p2..q2; otherwise the 4-tap update is applied.  Clobbers t5, vr0-vr31.
.macro FILTER_W8 DIR, TYPE
.ifc \DIR, h
    // Load 4 rows of 8 bytes and transpose; vr6..vr13 = p3..q3.
    addi.d        t5,       a0,     -4
    fld.d         f6,       t5,     0         //p3 p2 p1 p0 q0 q1 q2 q3
    fldx.d        f7,       t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fld.d         f8,       t5,     0
    fldx.d        f9,       t5,     a1
    vilvl.b       vr6,      vr7,    vr6
    vilvl.b       vr7,      vr9,    vr8
    vilvh.h       vr10,     vr7,    vr6       //q0
    vilvl.h       vr6,      vr7,    vr6       //p3
    vbsrl.v       vr7,      vr6,    4         //p2
    vbsrl.v       vr8,      vr6,    8         //p1
    vbsrl.v       vr9,      vr6,    12        //p0
    vbsrl.v       vr11,     vr10,   4         //q1
    vbsrl.v       vr12,     vr10,   8         //q2
    vbsrl.v       vr13,     vr10,   12        //q3
.else
    fld.s         f10,      a0,     0
    fldx.s        f11,      a0,     a1
    add.d         t5,       a0,     a1
    fldx.s        f12,      t5,     a1
    add.d         t5,       t5,     a1
    fldx.s        f13,      t5,     a1
    sub.d         t5,       a0,     a1
    fld.s         f9,       t5,     0
    sub.d         t5,       t5,     a1
    fld.s         f8,       t5,     0
    sub.d         t5,       t5,     a1
    fld.s         f7,       t5,     0
    sub.d         t5,       t5,     a1
    fld.s         f6,       t5,     0
.endif
    vabsd.bu      vr14,     vr8,    vr9       //p1-p0
    vabsd.bu      vr15,     vr11,   vr10      //q1-q0
    vabsd.bu      vr16,     vr9,    vr10      //p0-q0
    vabsd.bu      vr17,     vr8,    vr11      //p1-q1
    vabsd.bu      vr18,     vr7,    vr8       //p2-p1
    vabsd.bu      vr19,     vr12,   vr11      //q2-q1
    vabsd.bu      vr20,     vr6,    vr7       //p3-p2
    vabsd.bu      vr21,     vr13,   vr12      //q3-q2
    vmax.bu       vr22,     vr14,   vr15
    vsle.bu       vr23,     vr22,   vr4       //abs(p1 - p0) <= I && abs(q1 - q0) <= I
    vsadd.bu      vr16,     vr16,   vr16
    vsrli.b       vr17,     vr17,   1
    vsadd.bu      vr16,     vr16,   vr17
    vsle.bu       vr16,     vr16,   vr3       //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
    vand.v        vr16,     vr16,   vr23      //fm
    vpickve2gr.wu t5,       vr16,   0
    beqz          t5,       .END_FILTER_\DIR\()\TYPE\()_W8
    // Outer-tap I check: |p2-p1|, |q2-q1|, |p3-p2|, |q3-q2| <= I.
    vmax.bu       vr23,     vr18,   vr19
    vmax.bu       vr23,     vr23,   vr20
    vmax.bu       vr23,     vr23,   vr21
    vsle.bu       vr23,     vr23,   vr4
    vand.v        vr16,     vr16,   vr23      //fm
    vabsd.bu      vr17,     vr7,    vr9       //abs(p2-p0)
    vabsd.bu      vr18,     vr12,   vr10      //abs(q2-q0)
    vmax.bu       vr17,     vr17,   vr14
    vmax.bu       vr17,     vr17,   vr15
    vmax.bu       vr17,     vr17,   vr18
    vabsd.bu      vr18,     vr6,    vr9       //abs(p3 - p0)
    vabsd.bu      vr19,     vr13,   vr10      //abs(q3 - q0)
    vmax.bu       vr17,     vr17,   vr18
    vmax.bu       vr17,     vr17,   vr19
    vxor.v        vr5,      vr5,    vr5
    vaddi.bu      vr5,      vr5,    1         //F
    vsle.bu       vr17,     vr17,   vr5       //flat8in
    // Widen all taps to u16.  Note vr3/vr4 (E/I) are reused as p0/q0 from
    // here on -- the thresholds are no longer needed.
    vsllwil.hu.bu vr0,      vr6,    0         //p3
    vsllwil.hu.bu vr1,      vr7,    0         //p2
    vsllwil.hu.bu vr27,     vr8,    0         //p1
    vsllwil.hu.bu vr3,      vr9,    0         //p0
    vsllwil.hu.bu vr4,      vr10,   0         //q0
    vsllwil.hu.bu vr5,      vr11,   0         //q1
    vsllwil.hu.bu vr14,     vr12,   0         //q2
    vsllwil.hu.bu vr15,     vr13,   0         //q3
    // Flat path: pairwise sums combined into 8-tap sums, rounded >>3.
    vsadd.hu      vr18,     vr0,    vr0       //p3+p3
    vsadd.hu      vr19,     vr15,   vr15      //q3+q3
    vsadd.hu      vr20,     vr0,    vr1       //p3+p2
    vsadd.hu      vr21,     vr1,    vr27      //p2+p1
    vsadd.hu      vr28,     vr27,   vr3       //p1+p0
    vsadd.hu      vr23,     vr3,    vr4       //p0+q0
    vsadd.hu      vr24,     vr4,    vr5       //q0+q1
    vsadd.hu      vr25,     vr5,    vr14      //q1+q2
    vsadd.hu      vr26,     vr14,   vr15      //q2+q3
    // dst-3
    vsadd.hu      vr29,     vr18,   vr20
    vsadd.hu      vr29,     vr29,   vr21
    vsadd.hu      vr29,     vr29,   vr23
    // dst-2
    vsadd.hu      vr30,     vr18,   vr21
    vsadd.hu      vr30,     vr30,   vr28
    vsadd.hu      vr30,     vr30,   vr24
    // dst-1
    vsadd.hu      vr31,     vr20,   vr28
    vsadd.hu      vr31,     vr31,   vr23
    vsadd.hu      vr31,     vr31,   vr25
    // dst+0
    vsadd.hu      vr18,     vr21,   vr23
    vsadd.hu      vr18,     vr18,   vr24
    vsadd.hu      vr18,     vr18,   vr26
    //dst+1
    vsadd.hu      vr20,     vr28,   vr24
    vsadd.hu      vr20,     vr20,   vr25
    vsadd.hu      vr20,     vr20,   vr19
    //dst+2
    vsadd.hu      vr21,     vr23,   vr25
    vsadd.hu      vr21,     vr21,   vr26
    vsadd.hu      vr21,     vr21,   vr19
    vssrarni.bu.h vr23,     vr29,   3
    vssrarni.bu.h vr24,     vr30,   3
    vssrarni.bu.h vr25,     vr31,   3
    vssrarni.bu.h vr19,     vr18,   3
    vssrarni.bu.h vr20,     vr20,   3
    vssrarni.bu.h vr21,     vr21,   3
    // !flat8in path: 4-tap update, same math as FILTER_W4.
    vslt.bu       vr2,      vr2,    vr22      //hev
    vsub.h        vr30,     vr27,   vr5       //p1-q1
    vssrani.b.h   vr30,     vr30,   0
    vand.v        vr30,     vr30,   vr2
    vsllwil.h.b   vr30,     vr30,   0
    vsub.h        vr31,     vr4,    vr3
    vsadd.h       vr0,      vr31,   vr31
    vsadd.h       vr31,     vr31,   vr0
    vsadd.h       vr31,     vr31,   vr30
    vssrani.b.h   vr31,     vr31,   0
    vsllwil.h.b   vr31,     vr31,   0         //f = iclip_diff(3 * (q0 - p0) + f);
    vaddi.hu      vr14,     vr31,   4
    vaddi.hu      vr15,     vr31,   3
    li.w          t5,       127
    vreplgr2vr.h  vr18,     t5
    vmin.h        vr14,     vr14,   vr18
    vmin.h        vr15,     vr15,   vr18
    vsrai.h       vr14,     vr14,   3         //f1
    vsrai.h       vr15,     vr15,   3         //f2
    vsadd.h       vr3,      vr3,    vr15
    vssub.h       vr4,      vr4,    vr14
    vssrani.bu.h  vr3,      vr3,    0         //dst-1
    vssrani.bu.h  vr4,      vr4,    0         //dst+0
    vsrari.h      vr14,     vr14,   1
    vsadd.h       vr18,     vr27,   vr14
    vssub.h       vr26,     vr5,    vr14
    vssrani.bu.h  vr18,     vr18,   0         //dst-2
    vssrani.bu.h  vr26,     vr26,   0         //dst+1
    vbitsel.v     vr27,     vr18,   vr8,   vr2     //dst-2 (keep p1 where hev)
    vbitsel.v     vr28,     vr26,   vr11,  vr2     //dst+1 (keep q1 where hev)
    // Select flat vs non-flat result per lane, then commit where fm.
    vbitsel.v     vr23,     vr7,    vr23,  vr17    //dst-3 (p2)
    vbitsel.v     vr24,     vr27,   vr24,  vr17    //dst-2
    vbitsel.v     vr25,     vr3,    vr25,  vr17    //dst-1
    vbitsel.v     vr19,     vr4,    vr19,  vr17    //dst+0
    vbitsel.v     vr20,     vr28,   vr20,  vr17    //dst+1
    vbitsel.v     vr21,     vr12,   vr21,  vr17    //dst+2
    vbitsel.v     vr7,      vr7,    vr23,  vr16    //-3
    vbitsel.v     vr8,      vr8,    vr24,  vr16    //-2
    vbitsel.v     vr9,      vr9,    vr25,  vr16    //-1
    vbitsel.v     vr10,     vr10,   vr19,  vr16    //+0
    vbitsel.v     vr11,     vr11,   vr20,  vr16    //+1
    vbitsel.v     vr12,     vr12,   vr21,  vr16    //+2
.ifc \DIR, h
    // Re-transpose 8 bytes per row (p3..q3) and store.
    vilvl.b       vr6,      vr7,    vr6
    vilvl.b       vr8,      vr9,    vr8
    vilvl.b       vr10,     vr11,   vr10
    vilvl.b       vr12,     vr13,   vr12
    vilvl.h       vr6,      vr8,    vr6       //p3p2p1p0 -- -- --
    vilvl.h       vr10,     vr12,   vr10      //q0q1q2q3 -- -- --
    vilvl.w       vr0,      vr10,   vr6       //p3p2p1p0q0q1q2q3 --
    vilvh.w       vr1,      vr10,   vr6       //--
    addi.d        t5,       a0,     -4
    vstelm.d      vr0,      t5,     0,     0
    add.d         t5,       t5,     a1
    vstelm.d      vr0,      t5,     0,     1
    add.d         t5,       t5,     a1
    vstelm.d      vr1,      t5,     0,     0
    add.d         t5,       t5,     a1
    vstelm.d      vr1,      t5,     0,     1
.else
    alsl.d        t5,       a1,     a1,    1
    sub.d         t5,       a0,     t5
    fst.s         f7,       t5,     0
    fstx.s        f8,       t5,     a1
    add.d         t5,       t5,     a1
    fstx.s        f9,       t5,     a1
    fst.s         f10,      a0,     0
    add.d         t5,       a0,     a1
    fst.s         f11,      t5,     0
    fstx.s        f12,      t5,     a1
.endif
.END_FILTER_\DIR\()\TYPE\()_W8:
.endm

// 16-tap filter (luma wide edge): operates on p6..p0 | q0..q6.
// Where flat8out && flat8in hold, a 14-tap smoothing (rounded >>4) updates
// p5..q5; where only flat8in, the 8-tap path; otherwise the 4-tap path.
// The vertical variant spills p5..q5 to 96 bytes of stack scratch (freed at
// the end of the macro) because 32 LSX registers are not enough.
// Clobbers t5, vr0-vr31.
.macro FILTER_W16 DIR, TYPE
.ifc \DIR, h
    // Load 4 rows of 16 bytes and transpose; vr6..vr20 end up holding
    // p6..q6 (4 lanes each) as annotated below.
    addi.d        t5,       a0,     -7
    vld           vr6,      t5,     0         //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
    vldx          vr7,      t5,     a1
    add.d         t5,       t5,     a1
    vldx          vr8,      t5,     a1
    add.d         t5,       t5,     a1
    vldx          vr9,      t5,     a1
    vilvl.b       vr10,     vr7,    vr6
    vilvh.b       vr11,     vr7,    vr6
    vilvl.b       vr12,     vr9,    vr8
    vilvh.b       vr13,     vr9,    vr8
    vilvl.h       vr6,      vr12,   vr10
    vilvh.h       vr10,     vr12,   vr10      //p2---
    vilvl.h       vr15,     vr13,   vr11      //q1---
    vilvh.h       vr19,     vr13,   vr11
    vbsrl.v       vr7,      vr6,    4         //p5---
    vbsrl.v       vr8,      vr6,    8         //p4---
    vbsrl.v       vr9,      vr6,    12        //p3---
    vbsrl.v       vr12,     vr10,   4         //p1---
    vbsrl.v       vr13,     vr10,   8         //p0---
    vbsrl.v       vr14,     vr10,   12        //q0---
    vbsrl.v       vr16,     vr15,   4         //q2---
    vbsrl.v       vr17,     vr15,   8         //q3---
    vbsrl.v       vr18,     vr15,   12        //q4---
    vbsrl.v       vr20,     vr19,   4         //q6---
.else
    // Horizontal edge: load p6..q6 one row each.
    slli.d        t5,       a1,     3
    sub.d         t5,       a0,     t5
    fldx.s        f6,       t5,     a1        //p6
    alsl.d        t5,       a1,     t5,    1
    fld.s         f7,       t5,     0         //p5
    fldx.s        f8,       t5,     a1        //p4
    alsl.d        t5,       a1,     t5,    1
    fld.s         f9,       t5,     0         //p3
    fldx.s        f10,      t5,     a1        //p2
    alsl.d        t5,       a1,     t5,    1
    fld.s         f12,      t5,     0         //p1
    fldx.s        f13,      t5,     a1        //p0
    alsl.d        t5,       a1,     t5,    1
    fld.s         f14,      t5,     0         //q0
    fldx.s        f15,      t5,     a1        //q1
    alsl.d        t5,       a1,     t5,    1
    fld.s         f16,      t5,     0         //q2
    fldx.s        f17,      t5,     a1        //q3
    alsl.d        t5,       a1,     t5,    1
    fld.s         f18,      t5,     0         //q4
    fldx.s        f19,      t5,     a1        //q5
    add.d         t5,       t5,     a1
    fldx.s        f20,      t5,     a1        //q6
    //temp store: original p5..q5 spilled for the final reload/select
    addi.d        sp,       sp,     -96
    fst.d         f7,       sp,     0
    fst.d         f8,       sp,     8
    fst.d         f9,       sp,     16
    fst.d         f10,      sp,     24
    fst.d         f12,      sp,     32
    fst.d         f13,      sp,     40
    fst.d         f14,      sp,     48
    fst.d         f15,      sp,     56
    fst.d         f16,      sp,     64
    fst.d         f17,      sp,     72
    fst.d         f18,      sp,     80
    fst.d         f19,      sp,     88
.endif
    vabsd.bu      vr21,     vr12,   vr13      //abs(p1-p0)
    vabsd.bu      vr22,     vr15,   vr14      //abs(q1-q0)
    vmax.bu       vr0,      vr21,   vr22
    vslt.bu       vr2,      vr2,    vr0       //hev
    vabsd.bu      vr1,      vr10,   vr12      //abs(p2-p1)
    vmax.bu       vr0,      vr0,    vr1
    vabsd.bu      vr1,      vr16,   vr15      //abs(q2-q1)
    vmax.bu       vr0,      vr0,    vr1
    vabsd.bu      vr1,      vr9,    vr10      //abs(p3-p2)
    vmax.bu       vr0,      vr0,    vr1
    vabsd.bu      vr1,      vr17,   vr16      //abs(q3-q2)
    vmax.bu       vr0,      vr0,    vr1
    vsle.bu       vr0,      vr0,    vr4       //vr4 released I
    vabsd.bu      vr1,      vr13,   vr14      //abs(p0-q0)
    vsadd.bu      vr1,      vr1,    vr1
    vabsd.bu      vr4,      vr12,   vr15      //abs(p1-q1)
    vsrli.b       vr4,      vr4,    1
    vsadd.bu      vr1,      vr1,    vr4       //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
    vsle.bu       vr1,      vr1,    vr3       //vr3 released E
    vand.v        vr0,      vr0,    vr1       //fm
    vpickve2gr.wu t5,       vr0,    0
    beqz          t5,       .END_FILTER_\DIR\()\TYPE\()_W16
    // flat8out: |p6..p4 - p0| and |q4..q6 - q0| all <= 1 (F).
    vabsd.bu      vr1,      vr6,    vr13      //abs(p6-p0)
    vabsd.bu      vr4,      vr7,    vr13      //abs(p5-p0)
    vmax.bu       vr1,      vr1,    vr4
    vabsd.bu      vr4,      vr8,    vr13      //abs(p4-p0)
    vmax.bu       vr1,      vr1,    vr4
    vabsd.bu      vr4,      vr18,   vr14      //abs(q4-q0)
    vmax.bu       vr1,      vr1,    vr4
    vabsd.bu      vr4,      vr19,   vr14      //abs(q5-q0)
    vmax.bu       vr1,      vr1,    vr4
    vabsd.bu      vr4,      vr20,   vr14
    vmax.bu       vr1,      vr1,    vr4
    vxor.v        vr5,      vr5,    vr5
    vaddi.bu      vr5,      vr5,    1         //F
    vsle.bu       vr1,      vr1,    vr5       //flat8out
    vabsd.bu      vr3,      vr10,   vr13      //abs(p2-p0)
    vmax.bu       vr3,      vr3,    vr21
    vmax.bu       vr3,      vr3,    vr22
    vabsd.bu      vr4,      vr16,   vr14      //abs(q2-q0)
    vmax.bu       vr3,      vr3,    vr4
    vabsd.bu      vr4,      vr9,    vr13      //abs(p3-p0)
    vmax.bu       vr3,      vr3,    vr4
    vabsd.bu      vr4,      vr17,   vr14      //abs(q3-q0)
    vmax.bu       vr3,      vr3,    vr4
    vsle.bu       vr3,      vr3,    vr5       //flatin released vr5
    // Widen all 14 taps to u16 in place.
    vsllwil.hu.bu vr6,      vr6,    0         //p6
    vsllwil.hu.bu vr7,      vr7,    0         //p5
    vsllwil.hu.bu vr8,      vr8,    0         //p4
    vsllwil.hu.bu vr9,      vr9,    0         //p3
    vsllwil.hu.bu vr10,     vr10,   0         //p2
    vsllwil.hu.bu vr12,     vr12,   0         //p1
    vsllwil.hu.bu vr13,     vr13,   0         //p0
    vsllwil.hu.bu vr14,     vr14,   0         //q0
    vsllwil.hu.bu vr15,     vr15,   0         //q1
    vsllwil.hu.bu vr16,     vr16,   0         //q2
    vsllwil.hu.bu vr17,     vr17,   0         //q3
    vsllwil.hu.bu vr18,     vr18,   0         //q4
    vsllwil.hu.bu vr19,     vr19,   0         //q5
    vsllwil.hu.bu vr20,     vr20,   0         //q6
    // Wide (14-tap) path: running 16-tap-style sums, rounded >>4 below.
    // Seed is 7*p6 (values are 8-bit so the word shift cannot cross lanes).
    //dst-6
    vslli.w       vr21,     vr6,    3
    vssub.hu      vr21,     vr21,   vr6
    vsadd.hu      vr21,     vr21,   vr7
    vsadd.hu      vr21,     vr21,   vr7
    vsadd.hu      vr21,     vr21,   vr8
    vsadd.hu      vr21,     vr21,   vr8
    vsadd.hu      vr21,     vr21,   vr9
    vsadd.hu      vr21,     vr21,   vr10
    vsadd.hu      vr21,     vr21,   vr12
    vsadd.hu      vr21,     vr21,   vr13
    vsadd.hu      vr21,     vr21,   vr14
    //dst-5
    vsadd.hu      vr22,     vr21,   vr15
    vsadd.hu      vr22,     vr22,   vr9
    vssub.hu      vr22,     vr22,   vr6
    vssub.hu      vr22,     vr22,   vr6
    //dst-4
    vsadd.hu      vr23,     vr22,   vr16
    vsadd.hu      vr23,     vr23,   vr10
    vssub.hu      vr23,     vr23,   vr7
    vssub.hu      vr23,     vr23,   vr6
    //dst-3
    vsadd.hu      vr24,     vr23,   vr12
    vsadd.hu      vr24,     vr24,   vr17
    vssub.hu      vr24,     vr24,   vr6
    vssub.hu      vr24,     vr24,   vr8
    //dst-2
    vsadd.hu      vr25,     vr24,   vr18
    vsadd.hu      vr25,     vr25,   vr13
    vssub.hu      vr25,     vr25,   vr6
    vssub.hu      vr25,     vr25,   vr9
    //dst-1
    vsadd.hu      vr26,     vr25,   vr19
    vsadd.hu      vr26,     vr26,   vr14
    vssub.hu      vr26,     vr26,   vr6
    vssub.hu      vr26,     vr26,   vr10
    //dst+0
    vsadd.hu      vr27,     vr26,   vr20
    vsadd.hu      vr27,     vr27,   vr15
    vssub.hu      vr27,     vr27,   vr6
    vssub.hu      vr27,     vr27,   vr12
    //dst+1
    vsadd.hu      vr28,     vr27,   vr20
    vsadd.hu      vr28,     vr28,   vr16
    vssub.hu      vr28,     vr28,   vr7
    vssub.hu      vr28,     vr28,   vr13
    //dst+2
    vsadd.hu      vr29,     vr28,   vr20
    vsadd.hu      vr29,     vr29,   vr17
    vssub.hu      vr29,     vr29,   vr8
    vssub.hu      vr29,     vr29,   vr14
    //dst+3
    vsadd.hu      vr30,     vr29,   vr20
    vsadd.hu      vr30,     vr30,   vr18
    vssub.hu      vr30,     vr30,   vr9
    vssub.hu      vr30,     vr30,   vr15
    //dst+4
    vsadd.hu      vr31,     vr30,   vr20
    vsadd.hu      vr31,     vr31,   vr19
    vssub.hu      vr31,     vr31,   vr10
    vssub.hu      vr31,     vr31,   vr16
    //dst+5
    vsadd.hu      vr11,     vr31,   vr20
    vsadd.hu      vr11,     vr11,   vr20
    vssub.hu      vr11,     vr11,   vr12
    vssub.hu      vr11,     vr11,   vr17
    vsrari.h      vr21,     vr21,   4
    vsrari.h      vr22,     vr22,   4
    vsrari.h      vr23,     vr23,   4
    vsrari.h      vr24,     vr24,   4
    vsrari.h      vr25,     vr25,   4
    vsrari.h      vr26,     vr26,   4
    vsrari.h      vr27,     vr27,   4
    vsrari.h      vr28,     vr28,   4
    vsrari.h      vr29,     vr29,   4
    vsrari.h      vr30,     vr30,   4
    vsrari.h      vr31,     vr31,   4
    vsrari.h      vr11,     vr11,   4
    vand.v        vr1,      vr1,    vr3
    vsllwil.h.b   vr1,      vr1,    0         //expand to h
    //(flat8out & flat8in): outer taps only change on the wide path
    vbitsel.v     vr21,     vr7,    vr21,  vr1     //dst-6
    vbitsel.v     vr22,     vr8,    vr22,  vr1     //dst-5
    vbitsel.v     vr23,     vr9,    vr23,  vr1     //dst-4
    vbitsel.v     vr30,     vr17,   vr30,  vr1     //dst+3
    vbitsel.v     vr31,     vr18,   vr31,  vr1     //dst+4
    vbitsel.v     vr11,     vr19,   vr11,  vr1     //dst+5
    //flat8in: 8-tap sums for the inner 6 outputs, rounded >>3
    //dst-3
    vslli.h       vr4,      vr9,    1
    vsadd.hu      vr4,      vr4,    vr9       //p3*3
    vsadd.hu      vr4,      vr4,    vr10
    vsadd.hu      vr4,      vr4,    vr10
    vsadd.hu      vr4,      vr4,    vr12
    vsadd.hu      vr4,      vr4,    vr13
    vsadd.hu      vr4,      vr4,    vr14
    //dst-2
    vsadd.hu      vr5,      vr4,    vr12
    vsadd.hu      vr5,      vr5,    vr15
    vssub.hu      vr5,      vr5,    vr9
    vssub.hu      vr5,      vr5,    vr10
    //dst-1
    vsadd.hu      vr18,     vr5,    vr13
    vsadd.hu      vr18,     vr18,   vr16
    vssub.hu      vr18,     vr18,   vr9
    vssub.hu      vr18,     vr18,   vr12
    //dst+0
    vsadd.hu      vr7,      vr18,   vr14
    vsadd.hu      vr7,      vr7,    vr17
    vssub.hu      vr7,      vr7,    vr9
    vssub.hu      vr7,      vr7,    vr13
    //dst+1
    vsadd.hu      vr8,      vr7,    vr15
    vsadd.hu      vr8,      vr8,    vr17
    vssub.hu      vr8,      vr8,    vr10
    vssub.hu      vr8,      vr8,    vr14
    //dst+2
    vsadd.hu      vr9,      vr8,    vr16
    vsadd.hu      vr9,      vr9,    vr17
    vssub.hu      vr9,      vr9,    vr12
    vssub.hu      vr9,      vr9,    vr15
    vsrari.h      vr4,      vr4,    3
    vsrari.h      vr5,      vr5,    3
    vsrari.h      vr18,     vr18,   3
    vsrari.h      vr7,      vr7,    3
    vsrari.h      vr8,      vr8,    3
    vsrari.h      vr9,      vr9,    3
    //flat8out & flat8in: prefer the wide result where both flags hold
    vbitsel.v     vr24,     vr4,    vr24,  vr1     //dst-3
    vbitsel.v     vr25,     vr5,    vr25,  vr1     //dst-2
    vbitsel.v     vr26,     vr18,   vr26,  vr1     //dst-1
    vbitsel.v     vr27,     vr7,    vr27,  vr1     //dst+0
    vbitsel.v     vr28,     vr8,    vr28,  vr1     //dst+1
    vbitsel.v     vr29,     vr9,    vr29,  vr1     //dst+2
    //!flat8in: 4-tap update, same math as FILTER_W4
    vsub.h        vr17,     vr12,   vr15      //p1-q1
    vsllwil.h.b   vr2,      vr2,    0
    vand.v        vr17,     vr17,   vr2       //&hev
    vssrani.b.h   vr17,     vr17,   0
    vsllwil.h.b   vr17,     vr17,   0
    vsub.h        vr7,      vr14,   vr13
    vsadd.h       vr8,      vr7,    vr7
    vsadd.h       vr7,      vr7,    vr8
    vsadd.h       vr7,      vr7,    vr17
    vssrani.b.h   vr7,      vr7,    0
    vsllwil.h.b   vr17,     vr7,    0         //f = iclip_diff(3 * (q0 - p0) + f);
    vaddi.hu      vr7,      vr17,   4
    vaddi.hu      vr8,      vr17,   3
    li.w          t5,       127
    vreplgr2vr.h  vr9,      t5
    vmin.h        vr7,      vr7,    vr9
    vmin.h        vr8,      vr8,    vr9
    vsrai.h       vr7,      vr7,    3         //f1
    vsrai.h       vr8,      vr8,    3         //f2
    vsadd.h       vr4,      vr13,   vr8       //dst-1
    vssub.h       vr5,      vr14,   vr7       //dst+0
    vsrari.h      vr7,      vr7,    1
    vsadd.h       vr17,     vr12,   vr7
    vssub.h       vr7,      vr15,   vr7
    vbitsel.v     vr17,     vr17,   vr12,  vr2     //dst-2 (keep p1 where hev)
    vbitsel.v     vr7,      vr7,    vr15,  vr2     //dst+1 (keep q1 where hev)
    //flat8in or !flat8in
    vsllwil.h.b   vr3,      vr3,    0
    vbitsel.v     vr24,     vr10,   vr24,  vr3     //dst-3
    vbitsel.v     vr25,     vr17,   vr25,  vr3     //dst-2
    vbitsel.v     vr26,     vr4,    vr26,  vr3     //dst-1
    vbitsel.v     vr27,     vr5,    vr27,  vr3     //dst+0
    vbitsel.v     vr28,     vr7,    vr28,  vr3     //dst+1
    vbitsel.v     vr29,     vr16,   vr29,  vr3     //dst+2
.ifc \DIR, h
    // Narrow, interleave and transpose the 12 outputs back into 4 rows of
    // 12 bytes (dst-6..dst+5), reload the original rows, select per-row by
    // fm, and store 12 bytes per row.
    //dst-6,dst-2,dst-5,dst-1
    vssrani.bu.h  vr25,     vr21,   0
    vssrani.bu.h  vr26,     vr22,   0
    vpermi.w      vr25,     vr25,   0xd8
    vpermi.w      vr26,     vr26,   0xd8
    vilvl.b       vr6,      vr26,   vr25      //65656565 21212121
    //dst-4,dst+0,dst-3,dst+1
    vssrani.bu.h  vr27,     vr23,   0
    vssrani.bu.h  vr28,     vr24,   0
    vpermi.w      vr27,     vr27,   0xd8
    vpermi.w      vr28,     vr28,   0xd8
    vilvl.b       vr26,     vr28,   vr27      //43434343 01010101
    vilvl.h       vr21,     vr26,   vr6       //6543 -- -- --
    vilvh.h       vr22,     vr26,   vr6       //2101 -- -- --
    vilvl.w       vr20,     vr22,   vr21      //65432101 --
    vilvh.w       vr22,     vr22,   vr21      //65432101 --
    vreplvei.d    vr21,     vr20,   1
    vreplvei.d    vr23,     vr22,   1
    //dst+2,dst+4,dst+3,dst+5
    vssrani.bu.h  vr31,     vr29,   0
    vssrani.bu.h  vr11,     vr30,   0
    vpermi.w      vr31,     vr31,   0xd8
    vpermi.w      vr11,     vr11,   0xd8
    vilvl.b       vr11,     vr11,   vr31      //23232323 45454545
    vshuf4i.w     vr11,     vr11,   0xd8
    vshuf4i.h     vr11,     vr11,   0xd8      //2345 -- -- --
    vextrins.w    vr20,     vr11,   0x20
    vextrins.w    vr21,     vr11,   0x21
    vextrins.w    vr22,     vr11,   0x22
    vextrins.w    vr23,     vr11,   0x23
    addi.d        t5,       a0,     -6
    vld           vr6,      t5,     0         //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
    vldx          vr7,      t5,     a1
    add.d         t5,       t5,     a1
    vldx          vr8,      t5,     a1
    add.d         t5,       t5,     a1
    vldx          vr9,      t5,     a1
    //expand fm to 128
    vreplvei.b    vr10,     vr0,    0
    vreplvei.b    vr11,     vr0,    1
    vreplvei.b    vr12,     vr0,    2
    vreplvei.b    vr13,     vr0,    3
    vbitsel.v     vr20,     vr6,    vr20,  vr10
    vbitsel.v     vr21,     vr7,    vr21,  vr11
    vbitsel.v     vr22,     vr8,    vr22,  vr12
    vbitsel.v     vr23,     vr9,    vr23,  vr13
    addi.d        t5,       a0,     -6
    vstelm.d      vr20,     t5,     0,     0
    vstelm.w      vr20,     t5,     8,     2
    add.d         t5,       t5,     a1
    vstelm.d      vr21,     t5,     0,     0
    vstelm.w      vr21,     t5,     8,     2
    add.d         t5,       t5,     a1
    vstelm.d      vr22,     t5,     0,     0
    vstelm.w      vr22,     t5,     8,     2
    add.d         t5,       t5,     a1
    vstelm.d      vr23,     t5,     0,     0
    vstelm.w      vr23,     t5,     8,     2
.else
    //reload the original p5..q5 spilled at macro entry
    fld.d         f7,       sp,     0
    fld.d         f8,       sp,     8
    fld.d         f9,       sp,     16
    fld.d         f10,      sp,     24
    fld.d         f12,      sp,     32
    fld.d         f13,      sp,     40
    fld.d         f14,      sp,     48
    fld.d         f15,      sp,     56
    fld.d         f16,      sp,     64
    fld.d         f17,      sp,     72
    fld.d         f18,      sp,     80
    fld.d         f19,      sp,     88
    vssrarni.bu.h vr21,     vr21,   0
    vssrarni.bu.h vr22,     vr22,   0
    vssrarni.bu.h vr23,     vr23,   0
    vssrarni.bu.h vr24,     vr24,   0
    vssrarni.bu.h vr25,     vr25,   0
    vssrarni.bu.h vr26,     vr26,   0
    vssrarni.bu.h vr27,     vr27,   0
    vssrarni.bu.h vr28,     vr28,   0
    vssrarni.bu.h vr29,     vr29,   0
    vssrarni.bu.h vr30,     vr30,   0
    vssrarni.bu.h vr31,     vr31,   0
    vssrarni.bu.h vr11,     vr11,   0
    // Commit only lanes where fm is set.
    vbitsel.v     vr7,      vr7,    vr21,  vr0     //p5
    vbitsel.v     vr8,      vr8,    vr22,  vr0     //p4
    vbitsel.v     vr9,      vr9,    vr23,  vr0     //p3
    vbitsel.v     vr10,     vr10,   vr24,  vr0     //p2
    vbitsel.v     vr12,     vr12,   vr25,  vr0     //p1
    vbitsel.v     vr13,     vr13,   vr26,  vr0     //p0
    vbitsel.v     vr14,     vr14,   vr27,  vr0     //q0
    vbitsel.v     vr15,     vr15,   vr28,  vr0     //q1
    vbitsel.v     vr16,     vr16,   vr29,  vr0     //q2
    vbitsel.v     vr17,     vr17,   vr30,  vr0     //q3
    vbitsel.v     vr18,     vr18,   vr31,  vr0     //q4
    vbitsel.v     vr19,     vr19,   vr11,  vr0     //q5
    fst.s         f14,      a0,     0
    fstx.s        f15,      a0,     a1
    alsl.d        t5,       a1,     a0,    1
    fst.s         f16,      t5,     0
    fstx.s        f17,      t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fst.s         f18,      t5,     0
    fstx.s        f19,      t5,     a1
    slli.w        t5,       a1,     2
    alsl.d        t5,       a1,     t5,    1
    sub.d         t5,       a0,     t5
    fst.s         f7,       t5,     0
    fstx.s        f8,       t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fst.s         f9,       t5,     0
    fstx.s        f10,      t5,     a1
    alsl.d        t5,       a1,     t5,    1
    fst.s         f12,      t5,     0
    fstx.s        f13,      t5,     a1
.endif
.END_FILTER_\DIR\()\TYPE\()_W16:
.ifc \DIR, v
    // Free the 96-byte scratch allocated in the vertical load path above.
    // Reached on both the filtered and the early-out (beqz) paths.
    addi.d        sp,       sp,     96
.endif
.endm

// Save f24-f31 (callee-saved per the LoongArch psABI); the vector code
// above clobbers vr24-vr31, whose low halves alias these FP registers.
.macro PUSH_REG
    addi.d        sp,       sp,     -64
    fst.d         f24,      sp,     0
    fst.d         f25,      sp,     8
    fst.d         f26,      sp,     16
    fst.d         f27,      sp,     24
    fst.d         f28,      sp,     32
    fst.d         f29,      sp,     40
    fst.d         f30,      sp,     48
    fst.d         f31,      sp,     56
.endm

// Restore f24-f31 saved by PUSH_REG.
.macro POP_REG
    fld.d         f24,      sp,     0
    fld.d         f25,      sp,     8
    fld.d         f26,      sp,     16
    fld.d         f27,      sp,     24
    fld.d         f28,      sp,     32
    fld.d         f29,      sp,     40
    fld.d         f30,      sp,     48
    fld.d         f31,      sp,     56
    addi.d        sp,       sp,     64
.endm

// Build one superblock-edge filter entry point.  Walks the edge in 4-pixel
// units; bit t3 of the masks in a2 selects the filter width for each unit:
//   word0 -> filter any, word1 -> wide (8-tap y / 6-tap uv),
//   word2 -> 16-tap (y only).
// The level byte L comes from l[0][0], falling back to l[-1][0] when zero;
// E/I are looked up from the table at a5 and H = L >> 4.
.macro LPF_FUNC DIR, TYPE
function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
    PUSH_REG
    vld           vr0,      a2,     0         //vmask
    vpickve2gr.wu t0,       vr0,    0
    vpickve2gr.wu t1,       vr0,    1
    vpickve2gr.wu t2,       vr0,    2
    li.w          t3,       1                 //y: current unit's mask bit
    or            t0,       t0,     t1
.ifc \TYPE, y
    or            t0,       t0,     t2        //vm: union of all mask words
.endif
    addi.w        t8,       t3,     -1
    andn          t8,       t0,     t8
    beqz          t0,       .\DIR\()\TYPE\()_END
.\DIR\()\TYPE\()_LOOP:
    and           t4,       t0,     t3        //vm & y
    beqz          t4,       .\DIR\()\TYPE\()_LOOP_NEXT
    vldrepl.b     vr1,      a3,     0         //l[0][0]
.ifc \DIR, h
    addi.d        t5,       a3,     -4
.else
    slli.d        t5,       a4,     2
    sub.d         t5,       a3,     t5
.endif
    vldrepl.b     vr2,      t5,     0         //l[-1][0]
    vseqi.b       vr3,      vr1,    0
    vbitsel.v     vr1,      vr1,    vr2,   vr3    //L (fall back when l[0][0]==0)
    vpickve2gr.b  t5,       vr1,    0
    beqz          t5,       .\DIR\()\TYPE\()_LOOP_NEXT   // L==0: nothing to do
    vsrai.b       vr2,      vr1,    4         //H
    add.d         t6,       a5,     t5
    vldrepl.b     vr3,      t6,     0         //E
    addi.d        t6,       t6,     64
    vldrepl.b     vr4,      t6,     0         //I
.ifc \TYPE, y
    and           t5,       t2,     t3
    bnez          t5,       .FILTER_\DIR\()\TYPE\()_16
.endif
    and           t5,       t1,     t3
.ifc \TYPE, y
    bnez          t5,       .FILTER_\DIR\()\TYPE\()_8
.else
    bnez          t5,       .FILTER_\DIR\()\TYPE\()_6
.endif
    FILTER_W4     \DIR,     \TYPE
    b             .\DIR\()\TYPE\()_LOOP_NEXT
.ifc \TYPE, uv
.FILTER_\DIR\()\TYPE\()_6:
    FILTER_W6     \DIR,     \TYPE
.endif
.ifc \TYPE, y
.FILTER_\DIR\()\TYPE\()_8:
    FILTER_W8     \DIR,     \TYPE
    b             .\DIR\()\TYPE\()_LOOP_NEXT
.FILTER_\DIR\()\TYPE\()_16:
    FILTER_W16    \DIR,     \TYPE
.endif
.\DIR\()\TYPE\()_LOOP_NEXT:
    // Advance one 4-pixel unit: next mask bit, step dst and level pointers.
    slli.w        t3,       t3,     1
.ifc \DIR, h
    alsl.d        a0,       a1,     a0,    2
    slli.w        t8,       a4,     2
    add.d         a3,       a3,     t8
.else
    addi.d        a0,       a0,     4
    addi.d        a3,       a3,     4
.endif
    addi.w        t8,       t3,     -1
    andn          t8,       t0,     t8        // any mask bits >= t3 still set?
    bnez          t8,       .\DIR\()\TYPE\()_LOOP
.\DIR\()\TYPE\()_END:
    POP_REG
endfunc
.endm

LPF_FUNC h, y
LPF_FUNC v, y
LPF_FUNC h, uv
LPF_FUNC v, uv