summaryrefslogtreecommitdiffstats
path: root/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c')
-rw-r--r--media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c1415
1 files changed, 1415 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
new file mode 100644
index 0000000000..a07a7e3b41
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -0,0 +1,1415 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vp8_loop_filter_horizontal_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ uint64_t tmp[1];
+ mips_reg addr[2];
+ double ftmp[12];
+ double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
+
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
+ "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t"
+ "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
+ "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+
+ "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t"
+ "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
+ "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+ "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t"
+ "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t"
+
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
+ "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t"
+ "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
+ "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t"
+ "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"
+
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
+ "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
+ "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
+
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+
+ "addiu %[count], %[count], -0x01 \n\t"
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_03]"=&f"(ff_pb_03)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh, int count) {
+ uint64_t tmp[1];
+ mips_reg addr[2];
+ double ftmp[13];
+ double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+ "1: \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
+ MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+
+ "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t"
+ "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+ "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+
+ "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t"
+
+ /* ftmp9:q0 ftmp10:q1 */
+ "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
+ "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
+ /* ftmp11:q2 ftmp12:q3 */
+ "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
+ /* ftmp1:p3 ftmp2:p2 */
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
+ "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
+ /* ftmp5:p1 ftmp6:p0 */
+ "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t"
+
+ /* abs (q3-q2) */
+ "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
+ /* abs (q2-q1) */
+ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp3: abs(q1-q0) */
+ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp4: abs(p1-p0) */
+ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p2-p1) */
+ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p3-p2) */
+ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t"
+
+ /* abs (p0-q0) */
+ "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t"
+ "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ /* abs (p1-q1) */
+ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
+ "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp1] \n\t"
+ "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t"
+ "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp0:mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t"
+
+ /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
+ "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp1:hev */
+ "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t"
+
+ "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+ "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2:filter_value */
+ "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t"
+ "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t"
+
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t"
+
+ "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t"
+ "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t"
+
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+ "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t"
+ "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t"
+ "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+
+ /* ftmp5: *op1 ; ftmp6: *op0 */
+ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ /* ftmp9: *oq0 ; ftmp10: *oq1 */
+ "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
+ "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
+
+ "li %[tmp0], 0x20 \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+ "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+ MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+ "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
+
+ "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t"
+ "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t"
+
+ "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t"
+ "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
+ "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
+
+ "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+ "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t"
+
+ MMI_ADDIU(%[count], %[count], -0x01)
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_fe]"=&f"(ff_pb_fe)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* clang-format off */
+#define VP8_MBLOOP_HPSRAB \
+ "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \
+ "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \
+ "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \
+ "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t"
+
+#define VP8_MBLOOP_HPSRAB_ADD(reg) \
+ "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \
+ "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \
+ "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \
+ "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \
+ "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+/* clang-format on */
+
+void vp8_mbloop_filter_horizontal_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ uint64_t tmp[1];
+ double ftmp[13];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+ ff_ph_1200, ff_ph_1b00;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0x1200120012001200 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_1200] \n\t"
+ "dli %[tmp0], 0x1b001b001b001b00 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ "1: \n\t"
+ "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t"
+ /* ftmp1: p3 */
+ "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ /* ftmp3: p2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+ /* ftmp4: p1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+ /* ftmp5: p0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ /* ftmp6: q0 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ /* ftmp7: q1 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ /* ftmp8: q2 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ /* ftmp2: q3 */
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t"
+
+ "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
+ "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
+ "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ /* ftmp0: mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+ "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t"
+ "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t"
+ "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
+ /* ftmp1: hev */
+ "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
+ "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
+ "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t"
+ VP8_MBLOOP_HPSRAB
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t"
+ VP8_MBLOOP_HPSRAB
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
+ "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
+ "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
+ "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+
+ VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
+ "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "addiu %[count], %[count], -0x01 \n\t"
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03),
+ [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00),
+ [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* clang-format off */
+#define VP8_MBLOOP_VPSRAB_ADDH \
+ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
+ "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
+
+#define VP8_MBLOOP_VPSRAB_ADDT \
+ "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \
+ "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \
+ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+/* clang-format on */
+
+void vp8_mbloop_filter_vertical_edge_mmi(
+ unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+ const unsigned char *limit, const unsigned char *thresh, int count) {
+ mips_reg tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+ double ftmp[14];
+ double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x003f003f003f003f \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_003f] \n\t"
+ "dli %[tmp0], 0x0900090009000900 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_0900] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0303030303030303 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
+
+ "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t"
+ "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t"
+ "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t"
+ "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t"
+
+ "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t"
+ /* ftmp9:q0 ftmp10:q1 */
+ "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t"
+ "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t"
+ /* ftmp11:q2 ftmp12:q3 */
+ "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t"
+ /* srct[0x00]: q3 */
+ "sdc1 %[ftmp12], 0x00(%[srct]) \n\t"
+ /* ftmp1:p3 ftmp2:p2 */
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t"
+ "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
+ /* srct[0x08]: p3 */
+ "sdc1 %[ftmp1], 0x08(%[srct]) \n\t"
+ /* ftmp5:p1 ftmp6:p0 */
+ "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t"
+
+ /* abs (q3-q2) */
+ "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t"
+ /* abs (q2-q1) */
+ "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp3: abs(q1-q0) */
+ "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
+ "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* ftmp4: abs(p1-p0) */
+ "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p2-p1) */
+ "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ /* abs (p3-p2) */
+ "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
+ "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+
+ "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t"
+ /* abs (p0-q0) * 2 */
+ "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t"
+ "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* abs (p1-q1) / 2 */
+ "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t"
+ "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
+ "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t"
+ "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+ "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ /* ftmp0: mask */
+ "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t"
+
+ /* abs(p1-p0) - thresh */
+ "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
+ /* abs(q1-q0) - thresh */
+ "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t"
+ "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
+ /* ftmp1: hev */
+ "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+
+ /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
+ "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t"
+ "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t"
+ "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
+ "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
+
+ "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t"
+ "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+ /* filter_value &= mask */
+ "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ /* Filter2 = filter_value & hev */
+ "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
+ /* filter_value &= ~hev */
+ "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t"
+
+ "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp9: qs0 */
+ "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
+ "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t"
+ "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t"
+ "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
+ "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t"
+ "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t"
+ /* ftmp6: ps0 */
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
+
+ "dli %[tmp0], 0x07 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDH
+ "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t"
+ /* ftmp9: oq0 */
+ "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t"
+ /* ftmp6: op0 */
+ "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t"
+
+ VP8_MBLOOP_VPSRAB_ADDH
+ "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t"
+ /* ftmp10: oq1 */
+ "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
+ /* ftmp5: op1 */
+ "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t"
+
+ VP8_MBLOOP_VPSRAB_ADDH
+ "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t"
+ "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t"
+ VP8_MBLOOP_VPSRAB_ADDT
+ "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t"
+ /* ftmp11: oq2 */
+ "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t"
+ "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t"
+ /* ftmp2: op2 */
+ "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t"
+
+ "ldc1 %[ftmp12], 0x00(%[srct]) \n\t"
+ "ldc1 %[ftmp8], 0x08(%[srct]) \n\t"
+
+ "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
+ "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
+
+ "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t"
+ "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t"
+ "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
+
+ "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t"
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t"
+ "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t"
+ "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+ "addiu %[count], %[count], -0x01 \n\t"
+
+ MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr),
+ [count]"+&r"(count),
+ [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900),
+ [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04),
+ [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe)
+ : [limit]"r"(limit), [blimit]"r"(blimit),
+ [srct]"r"(srct), [thresh]"r"(thresh),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step)
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* clang-format off */
+#define VP8_SIMPLE_HPSRAB \
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \
+ "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \
+ "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \
+ "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+/* clang-format on */
+
+void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit) {
+ uint64_t tmp[1], count = 2;
+ mips_reg addr[2];
+ double ftmp[12];
+ double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t"
+
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
+ "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t"
+ "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t"
+ "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
+ "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
+ "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
+
+ "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
+
+ "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
+ VP8_SIMPLE_HPSRAB
+ "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t"
+ "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t"
+
+ "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
+ VP8_SIMPLE_HPSRAB
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t"
+ "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ "addiu %[count], %[count], -0x01 \n\t"
+ MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
+ : [blimit]"r"(blimit),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
+ int src_pixel_step,
+ const unsigned char *blimit) {
+ uint64_t tmp[1], count = 2;
+ mips_reg addr[2];
+ DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+ double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0x08 \n\t"
+ "dmtc1 %[tmp0], %[ftmp8] \n\t"
+ "dli %[tmp0], 0x20 \n\t"
+ "dmtc1 %[tmp0], %[ftmp10] \n\t"
+ "dli %[tmp0], 0xfefefefefefefefe \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
+ "dli %[tmp0], 0x8080808080808080 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
+ "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
+ "dli %[tmp0], 0x0101010101010101 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_01] \n\t"
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
+ MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
+
+ "1: \n\t"
+ MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
+
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+
+ "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
+ "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
+ "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
+ "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t"
+ "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
+
+ "dli %[tmp0], 0x01 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
+ "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t"
+ "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
+
+ "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t"
+ "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
+ "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+
+ "sdc1 %[ftmp0], 0x00(%[srct]) \n\t"
+ "sdc1 %[ftmp3], 0x08(%[srct]) \n\t"
+
+ "pxor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+
+ "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t"
+ "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t"
+
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t"
+ "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
+ "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+ "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t"
+ "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t"
+
+ "dli %[tmp0], 0x03 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t"
+ "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+ "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
+
+ "dli %[tmp0], 0x0b \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
+ "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
+ "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
+
+ "ldc1 %[ftmp0], 0x00(%[srct]) \n\t"
+ "ldc1 %[ftmp4], 0x08(%[srct]) \n\t"
+
+ "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
+ "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
+
+ "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t"
+ "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+ "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+
+ "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
+ MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+ "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
+
+ MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+ "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t"
+
+ MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+
+ "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
+ "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t"
+ "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t"
+
+ "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
+ MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+ "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t"
+ "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
+
+ MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8])
+ "addiu %[count], %[count], -0x01 \n\t"
+ "bnez %[count], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
+ [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80),
+ [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01)
+ : [blimit]"r"(blimit), [srct]"r"(srct),
+ [src_pixel_step]"r"((mips_reg)src_pixel_step),
+ [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+ [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
+ [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
+ : "memory"
+ );
+ /* clang-format on */
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride,
+ lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi) {
+ vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride,
+ blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride,
+ blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride,
+ blimit);
+}
+
+void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit);
+}