diff options
Diffstat (limited to 'media/libvpx/libvpx/vp8/common/mips/mmi')
6 files changed, 2476 insertions, 0 deletions
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_ports/asmdefs_mmi.h"

/* Copy two 16-byte rows from %[src] to %[dst], advancing both pointers by
 * their strides after each row.  The low 8 bytes of a row travel through an
 * FP register (gsldlc1/gsldrc1 + gssdlc1/gssdrc1 tolerate unaligned
 * addresses); the high 8 bytes travel through a GPR via ldl/ldr + sdl/sdr. */
#define COPY_MEM_16X2 \
  "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
  "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
  "ldl %[tmp0], 0x0f(%[src]) \n\t" \
  "ldr %[tmp0], 0x08(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
  "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
  "sdl %[tmp0], 0x0f(%[dst]) \n\t" \
  "sdr %[tmp0], 0x08(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
  "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \
  "ldl %[tmp1], 0x0f(%[src]) \n\t" \
  "ldr %[tmp1], 0x08(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \
  "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \
  "sdl %[tmp1], 0x0f(%[dst]) \n\t" \
  "sdr %[tmp1], 0x08(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride])

/* Copy two 8-byte rows from %[src] to %[dst], advancing both pointers by
 * their strides.  Row one goes through an FP register (unaligned-safe
 * gsldlc1/gsldrc1 pair); row two goes through a GPR via ldl/ldr.  Both
 * source loads are issued before the stores begin. */
#define COPY_MEM_8X2 \
  "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \
  "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  "ldl %[tmp0], 0x07(%[src]) \n\t" \
  "ldr %[tmp0], 0x00(%[src]) \n\t" \
  MMI_ADDU(%[src], %[src], %[src_stride]) \
  \
  "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \
  "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride]) \
  "sdl %[tmp0], 0x07(%[dst]) \n\t" \
  "sdr %[tmp0], 0x00(%[dst]) \n\t" \
  MMI_ADDU(%[dst], %[dst], %[dst_stride])

/* Copy a 16x16 pixel block: 4 loop iterations, each moving 4 rows
 * (two COPY_MEM_16X2 expansions). */
void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride) {
  double ftmp[2];
  uint64_t tmp[2];
  uint8_t loop_count = 4;

  /* clang-format off */
  __asm__ volatile (
    "1: \n\t"
    COPY_MEM_16X2
    COPY_MEM_16X2
    MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
    "bnez %[loop_count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [loop_count]"+&r"(loop_count),
      [dst]"+&r"(dst), [src]"+&r"(src)
    : [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
  /* clang-format on */
}

/* Copy an 8x8 pixel block: 4 loop iterations of COPY_MEM_8X2 (2 rows each). */
void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst,
                         int dst_stride) {
  double ftmp[2];
  uint64_t tmp[1];
  uint8_t loop_count = 4;

  /* clang-format off */
  __asm__ volatile (
    "1: \n\t"
    COPY_MEM_8X2
    MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
    "bnez %[loop_count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count),
      [dst]"+&r"(dst), [src]"+&r"(src)
    : [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
  /* clang-format on */
}

/* Copy an 8x4 pixel block: two COPY_MEM_8X2 expansions, no loop needed. */
void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst,
                         int dst_stride) {
  double ftmp[2];
  uint64_t tmp[1];

  /* clang-format off */
  __asm__ volatile (
    COPY_MEM_8X2
    COPY_MEM_8X2
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [tmp0]"=&r"(tmp[0]),
      [dst]"+&r"(dst), [src]"+&r"(src)
    : [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
  /* clang-format on */
}
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/asmdefs_mmi.h"

/* Dequantize one 4x4 block: d->dqcoeff[i] = d->qcoeff[i] * DQC[i] for all
 * 16 int16 coefficients, done as four 64-bit SIMD lane-wise multiplies
 * (pmullh = low 16 bits of each 16x16 product). */
void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) {
  double ftmp[8];

  __asm__ volatile(
      /* load 16 quantized coefficients (4 per register) */
      "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t"
      "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t"
      "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t"
      "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t"
      "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t"

      /* load 16 dequantization factors */
      "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t"
      "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t"
      "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t"
      "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t"
      "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t"
      "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t"
      "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t"
      "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t"

      "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
      "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
      "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
      "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"

      /* store the 16 dequantized coefficients */
      "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t"
      "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t"
      "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t"
      "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t"
      "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t"
      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
      : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC)
      : "memory");
}

/* Dequantize `input` in place (input[i] *= dq[i]), run the 4x4 inverse DCT
 * which adds into `dest`, then clear all 16 coefficients of `input`
 * (32 bytes) so the block is ready for reuse. */
void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest,
                              int stride) {
  double ftmp[8];

  __asm__ volatile(
      "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t"
      "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t"
      "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t"
      "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t"
      "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t"
      "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t"
      "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t"
      "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t"

      "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t"
      "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t"
      "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t"
      "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t"
      "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t"
      "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t"
      "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t"
      "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t"

      /* input[i] = dq[i] * input[i], 4 lanes at a time */
      "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
      "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
      "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
      "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"

      "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
      "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
      "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t"
      "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t"
      "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t"
      "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t"
      "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t"
      "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t"
      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
      : [dq] "r"(dq), [input] "r"(input)
      : "memory");

  /* inverse transform the dequantized coefficients and add into dest */
  vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride);

  /* zero all 16 int16 coefficients (32 bytes): FP stores for bytes 0-7 and
   * 16-23, GPR $0 stores for bytes 8-15 and 24-31 */
  __asm__ volatile(
      "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
      "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t"
      "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t"
      "sdl $0, 0x0f(%[input]) \n\t"
      "sdr $0, 0x08(%[input]) \n\t"
      "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t"
      "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t"
      "sdl $0, 0x1f(%[input]) \n\t"
      "sdr $0, 0x18(%[input]) \n\t"
      : [ftmp0] "=&f"(ftmp[0])
      : [input] "r"(input)
      : "memory");
}

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_mem/vpx_mem.h"

/* Dequantize+IDCT+add for the 16 4x4 subblocks of a 16x16 luma macroblock.
 * Per subblock: if its eob is > 1 run the full dequant+IDCT path, otherwise
 * take the DC-only shortcut (q[0]*dq[0]) and clear the first two
 * coefficients. q advances 16 coefficients per subblock; dst walks 4 pixels
 * right per subblock and drops 4 rows at the end of each subblock row. */
void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
                                      int stride, char *eobs) {
  int i, j;

  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) {
      if (*eobs++ > 1) {
        vp8_dequant_idct_add_mmi(q, dq, dst, stride);
      } else {
        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
        /* DC-only path leaves q untouched; clear q[0..1] by hand */
        memset(q, 0, 2 * sizeof(q[0]));
      }

      q += 16;
      dst += 4;
    }

    /* next row of subblocks: down 4 lines, back to column 0 */
    dst += 4 * stride - 16;
  }
}

/* Same per-subblock dispatch as the luma version, applied to the four 4x4
 * subblocks of the 8x8 U plane and then the four of the 8x8 V plane.
 * eobs is consumed sequentially across both planes. */
void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u,
                                       uint8_t *dst_v, int stride, char *eobs) {
  int i, j;

  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      if (*eobs++ > 1) {
        vp8_dequant_idct_add_mmi(q, dq, dst_u, stride);
      } else {
        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride);
        memset(q, 0, 2 * sizeof(q[0]));
      }

      q += 16;
      dst_u += 4;
    }

    dst_u += 4 * stride - 8;
  }

  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      if (*eobs++ > 1) {
        vp8_dequant_idct_add_mmi(q, dq, dst_v, stride);
      } else {
        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride);
        memset(q, 0, 2 * sizeof(q[0]));
      }

      q += 16;
      dst_v += 4;
    }

    dst_v += 4 * stride - 8;
  }
}
%[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint64_t tmp[1]; + double ff_ph_04, ff_ph_4e7b, ff_ph_22a3; + + __asm__ volatile ( + "dli %[tmp0], 0x0004000400040004 \n\t" + "dmtc1 %[tmp0], %[ff_ph_04] \n\t" + "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t" + "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t" + "dli %[tmp0], 0x22a322a322a322a3 \n\t" + "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t" + MMI_LI(%[tmp0], 0x02) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 
7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] 
\n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 
[ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr), + [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04), + [ff_ph_22a3]"=&f"(ff_ph_22a3) + : [ip]"r"(input), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a0 = ((input_dc + 4) >> 3); + double a1, ftmp[5]; + int low32; + + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[a0], %[a1] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + 
MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ff_ph_03, ftmp[12]; + uint64_t tmp[1]; + + __asm__ volatile ( + "dli %[tmp0], 0x0003000300030003 \n\t" + "dmtc1 %[tmp0], %[ff_ph_03] \n\t" + MMI_LI(%[tmp0], 0x03) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], 
%[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03) + : [ip]"r"(input), [op]"r"(output) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 0000000000..a07a7e3b41 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1415 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
#include "vpx_ports/asmdefs_mmi.h"

/* Normal loop filter across a horizontal edge: processes `count` groups of
 * 8 pixel columns.  Per group it loads four rows above (p3..p0) and four
 * rows below (q0..q3) the edge, builds the filter mask from the pairwise
 * absolute row differences against `limit` and the p0/q0, p1/q1 terms
 * against `blimit`, derives the high-edge-variance flag from `thresh`,
 * then applies the standard VP8 4-tap filter adjustment (ff_pb_03 /
 * ff_pb_04 rounding, sign-bias via ff_pb_80 XOR) to p1, p0, q0, q1 and
 * stores the four modified rows back. */
void vp8_loop_filter_horizontal_edge_mmi(
    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
    const unsigned char *limit, const unsigned char *thresh, int count) {
  uint64_t tmp[1];
  mips_reg addr[2];
  double ftmp[12];
  double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
  /* clang-format off */
  __asm__ volatile (
    /* broadcast constants used by the filter arithmetic */
    "dli %[tmp0], 0x0001000100010001 \n\t"
    "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
    "dli %[tmp0], 0xfefefefefefefefe \n\t"
    "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x8080808080808080 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_80] \n\t"
    "dli %[tmp0], 0x0404040404040404 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_04] \n\t"
    "dli %[tmp0], 0x0303030303030303 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
    "1: \n\t"
    "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t"

    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])

    /* mask: accumulate (|row diff| - limit) saturating terms into ftmp0 */
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
    "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t"

    MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
    "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t"
    "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"

    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
    "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
    /* ftmp9 = |p1 - p0|, reused below for the hev test */
    "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t"
    "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"

    "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
    /* ftmp11 = |q1 - q0|, reused below for the hev test */
    "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t"
    "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
    "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
    "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"

    /* edge term: 2*|p0-q0| + |p1-q1|/2 compared against blimit */
    "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
    "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
    "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
    "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x01 \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
    "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
    "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t"
    "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
    "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    /* ftmp0 = mask: 0xff where the filter applies */
    "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t"

    /* ftmp1 = hev: 0xff where |p1-p0| or |q1-q0| exceeds thresh */
    "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t"
    "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
    "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t"
    "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
    "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
    "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
    "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
    "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t"

    /* convert p1,p0,q0,q1 to signed domain (XOR 0x80) */
    "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
    "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"
    "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
    "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"

    /* filter value = clamp(hev&(p1-q1) + 3*(q0-p0)) & mask */
    "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t"
    "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
    "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
    "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
    "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t"

    /* Filter1 = (fv+3)>>3 (ftmp8), Filter2 = (fv+4)>>3 (ftmp9) */
    "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t"
    "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t"

    /* widen to int16, arithmetic shift by 3 (via <<8 then >>11), repack */
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
    "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
    "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t"

    "dli %[tmp0], 0x0b \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
    "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
    "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t"
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
    "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t"
    "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
    "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t"
    "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
    /* outer-tap value u = (Filter2 + 1) >> 1, gated off where hev is set */
    "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t"
    "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
    "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t"

    "dli %[tmp0], 0x01 \n\t"
    "dmtc1 %[tmp0], %[ftmp10] \n\t"
    "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t"
    "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t"
    "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
    "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
    /* p0 += Filter1; store back (convert out of signed domain) */
    "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
    "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t"

    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
    "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t"
    "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t"
    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
    /* p1 += u */
    "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
    "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t"
    "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t"

    /* q0 -= Filter2 */
    "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
    "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"

    /* q1 -= u */
    "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
    "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t"
    "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
    "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t"

    /* next 8-column group */
    "addiu %[count], %[count], -0x01 \n\t"
    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
    "bnez %[count], 1b \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]),
      [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
      [src_ptr]"+&r"(src_ptr), [count]"+&r"(count),
      [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe),
      [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04),
      [ff_pb_03]"=&f"(ff_pb_03)
    : [limit]"r"(limit), [blimit]"r"(blimit),
      [thresh]"r"(thresh),
      [src_pixel_step]"r"((mips_reg)src_pixel_step),
      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
    : "memory"
  );
  /* clang-format on */
}

void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
                                       int src_pixel_step,
                                       const unsigned char *blimit,
                                       const unsigned char *limit,
                                       const unsigned char *thresh, int count) {
  uint64_t tmp[1];
  mips_reg addr[2];
  double ftmp[13];
  double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;

  /* clang-format off */
  __asm__ volatile (
    "dli %[tmp0], 0xfefefefefefefefe \n\t"
    "dmtc1 %[tmp0], %[ff_pb_fe] \n\t"
    "dli %[tmp0], 0x0001000100010001 \n\t"
    "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
    "dli %[tmp0], 0x0303030303030303 \n\t"
    "dmtc1 %[tmp0], %[ff_pb_03] \n\t"
    "dli %[tmp0], 0x0404040404040404 \n\t"
+ "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + 
"punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] 
\n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], 
%[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + 
MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + 
"pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ +
+/* NOTE(review): the lines above complete VP8_MBLOOP_HPSRAB_ADD: halfword multiply by the */
+/* caller-supplied weight constant (pmulhh), +0x003f rounding, arithmetic shift via psrah, */
+/* and repack to bytes. Below is vp8_mbloop_filter_horizontal_edge_mmi(): the Loongson-MMI */
+/* macroblock-edge VP8 loop filter for horizontal edges. It loads p3..q3 row-wise around */
+/* src_ptr-4*step, builds the mask (pairwise abs-diffs vs *limit, |p0-q0|*2+|p1-q1|/2 vs */
+/* *blimit) and hev flag (vs *thresh), applies the hev +4/+3 >>3 filter via */
+/* VP8_MBLOOP_HPSRAB, then updates p2..q2 with VP8_MBLOOP_HPSRAB_ADD using the */
+/* 0x1b00/0x1200/0x0900 (27,18,9 << 8) weights, storing each row as it is produced; the */
+/* loop advances 8 pixels per 'count' decrement. */
+void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint64_t tmp[1]; + double ftmp[13]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900, + ff_ph_1200, ff_ph_1b00; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0x1200120012001200 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1200] \n\t" + "dli %[tmp0], 0x1b001b001b001b00 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + 
MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], 
%[ftmp1], %[ftmp12] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 
%[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03), + [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00), + 
[ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} +
+/* NOTE(review): the lines above finish the horizontal macroblock filter's extended-asm */
+/* operand lists. Next come the VP8_MBLOOP_VPSRAB_ADDH/ADDT helper macros (zero-extend the */
+/* filter bytes into the high half of each halfword, then round with 0x003f and */
+/* arithmetic-shift via psrah before repacking), followed by */
+/* vp8_mbloop_filter_vertical_edge_mmi(): the macroblock (wide) filter on column edges. It */
+/* transposes two 4x8 tiles per iteration, spills q3/p3 to the stack buffer 'srct' to free */
+/* registers, computes mask/hev as usual, and applies the weighted updates where the */
+/* 3*0x0900 and 2*0x0900 halfword weights are synthesized with paddh from ff_ph_0900. The */
+/* span ends with the VP8_SIMPLE_HPSRAB macro and the head of */
+/* vp8_loop_filter_simple_horizontal_edge_mmi() (simple filter; its 'count' is fixed at */
+/* 2, i.e. two 8-pixel halves of a 16-pixel edge). */
+/* clang-format off */ +#define VP8_MBLOOP_VPSRAB_ADDH \ + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[14]; + double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 
0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ 
+ "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb 
%[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], 
%[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], 
%[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + 
[ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count), + [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900), + [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" +/* clang-format on */ + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], 
%[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + 
+ /* NOTE(review): above is the body of vp8_loop_filter_simple_horizontal_edge_mmi(): the */
+ /* simple VP8 filter touching only p1/p0/q0/q1. The mask is */
+ /* (|p0-q0|*2 + |p1-q1|/2) <= *blimit; the filter value clamp(p1-q1) + 3*(q0-p0) is */
+ /* applied as (f+4)>>3 subtracted from q0 and (f+4-1)>>3 (via the ff_pb_01 psubsb) added */
+ /* to p0, two 8-pixel halves per call (count = 2). Below begins */
+ /* vp8_loop_filter_simple_vertical_edge_mmi(), the 4x8-transpose column variant, which */
+ /* spills two transposed rows to 'srct'; its body continues past this chunk. */
+ /* NOTE(review): the duplicated dli/dmtc1 constant loads at the top of both simple */
+ /* filters (0x0b/0x01 here, 0x08/0x20 below, each loaded twice) are redundant but */
+ /* harmless. */
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)) + : "memory" + ); + /* clang-format on */ +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], 
%[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "pxor %[ftmp0], 
%[ftmp0], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + 
"punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + 
[src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void 
vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 0000000000..b85f73fdff --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 
0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint64_t tmp[1]; + double ff_ph_40; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 
asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], 
%[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + double ff_ph_40; + uint64_t tmp[1]; + mips_reg addr[1]; + +#if _MIPS_SIM == _ABIO32 + register double 
fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. 
+ */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), 
[ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [ff_ph_40]"=&f"(ff_ph_40) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + 
[output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format on */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 
16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); |