Diffstat (limited to 'media/libvpx/libvpx/vp8/encoder/mips')
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c               | 434
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c      | 263
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c               | 196
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c         | 568
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c         | 167
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c          | 211
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c   | 284
7 files changed, 2123 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
new file mode 100644
index 0000000000..0fd25fcda5
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+/* clang-format off */
+/* TRANSPOSE_4H: transpose 4x4 matrix.
+ Input: ftmp1,ftmp2,ftmp3,ftmp4
+ Output: ftmp1,ftmp2,ftmp3,ftmp4
+ Note: ftmp0 is always 0; ftmp5~ftmp10 and tmp0 are used as temporaries.
+ */
+#define TRANSPOSE_4H \
+ MMI_LI(%[tmp0], 0x93) \
+ "mtc1 %[tmp0], %[ftmp10] \n\t" \
+ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
+ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
+ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
+ "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+/* clang-format on */
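
For readers less familiar with Loongson MMI, the punpck/pshufh sequence above is just a 4x4 transpose of 16-bit lanes held in four registers. A minimal scalar sketch of the same operation (illustrative only, not part of the patch):

static void transpose_4x4_i16(int16_t m[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = r + 1; c < 4; ++c) {
      const int16_t t = m[r][c]; /* swap (r, c) with (c, r) */
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}

The transform routines below run this transpose before each pass so that both the row and the column pass can work on whole registers.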
+
+void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ uint64_t tmp[1];
+ int16_t *ip = input;
+ double ff_ph_op1, ff_ph_op3;
+
+#if _MIPS_SIM == _ABIO32
+ register double ftmp0 asm("$f0");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f4");
+ register double ftmp3 asm("$f6");
+ register double ftmp4 asm("$f8");
+ register double ftmp5 asm("$f10");
+ register double ftmp6 asm("$f12");
+ register double ftmp7 asm("$f14");
+ register double ftmp8 asm("$f16");
+ register double ftmp9 asm("$f18");
+ register double ftmp10 asm("$f20");
+ register double ftmp11 asm("$f22");
+ register double ftmp12 asm("$f24");
+#else
+ register double ftmp0 asm("$f0");
+ register double ftmp1 asm("$f1");
+ register double ftmp2 asm("$f2");
+ register double ftmp3 asm("$f3");
+ register double ftmp4 asm("$f4");
+ register double ftmp5 asm("$f5");
+ register double ftmp6 asm("$f6");
+ register double ftmp7 asm("$f7");
+ register double ftmp8 asm("$f8");
+ register double ftmp9 asm("$f9");
+ register double ftmp10 asm("$f10");
+ register double ftmp11 asm("$f11");
+ register double ftmp12 asm("$f12");
+#endif // _MIPS_SIM == _ABIO32
+
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x14e808a914e808a9 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_op1] \n\t"
+ "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_op3] \n\t"
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ TRANSPOSE_4H
+
+ "ldc1 %[ftmp11], %[ff_ph_8] \n\t"
+ // f1 + f4
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ // a1
+ "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ // f2 + f3
+ "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
+ // b1
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ // f2 - f3
+ "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ // c1
+ "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
+ // f1 - f4
+ "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
+ // d1
+ "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+ // op[0] = a1 + b1
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ // op[2] = a1 - b1
+ "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
+
+ // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
+ MMI_LI(%[tmp0], 0x0c)
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
+ "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
+ "ldc1 %[ftmp12], %[ff_pw_7500] \n\t"
+ "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ TRANSPOSE_4H
+
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
+ "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
+
+ "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t"
+ "ldc1 %[ftmp9], %[ff_ph_01] \n\t"
+ "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "ldc1 %[ftmp9], %[ff_ph_07] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ MMI_LI(%[tmp0], 0x04)
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+
+ MMI_LI(%[tmp0], 0x10)
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "ldc1 %[ftmp12], %[ff_pw_12000] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+
+ "ldc1 %[ftmp12], %[ff_pw_51000] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t"
+
+ "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t"
+ "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t"
+ "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t"
+
+ : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
+ [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
+ [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
+ [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
+ [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+ [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
+ : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
+ [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
+ [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
+ [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
+ [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
+ : "memory"
+ );
+ /* clang-format on */
+}
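
The inline assembly above keeps the arithmetic of the generic forward DCT: a butterfly per pass, odd outputs produced by the 2217/5352 multiplies, and the 14500/7500 (pass 1) and 12000/51000 (pass 2) rounding constants that appear in the comments. A scalar sketch of the same arithmetic, in the spirit of the C reference vp8_short_fdct4x4_c and shown only as a reading aid:

static void fdct4x4_scalar_sketch(int16_t *input, int16_t *output, int pitch) {
  int i, a1, b1, c1, d1;
  int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; ++i) { /* pass 1: rows */
    a1 = (ip[0] + ip[3]) * 8;
    b1 = (ip[1] + ip[2]) * 8;
    c1 = (ip[1] - ip[2]) * 8;
    d1 = (ip[0] - ip[3]) * 8;
    op[0] = a1 + b1;
    op[2] = a1 - b1;
    op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
    op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
    ip += pitch / 2;
    op += 4;
  }
  ip = output;
  op = output;
  for (i = 0; i < 4; ++i) { /* pass 2: columns */
    a1 = ip[0] + ip[12];
    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];
    d1 = ip[0] - ip[12];
    op[0] = (a1 + b1 + 7) >> 4;
    op[8] = (a1 - b1 + 7) >> 4;
    op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
    op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;
    ip++;
    op++;
  }
}

The (d1 != 0) term in the second pass is what the pcmpeqh / ff_ph_01 pair in the assembly computes.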
+
+void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ vp8_short_fdct4x4_mmi(input, output, pitch);
+ vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+ uint64_t tmp[1];
+
+ /* clang-format off */
+ __asm__ volatile (
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
+ "dli %[tmp0], 0x0000000100000001 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_01] \n\t"
+ "dli %[tmp0], 0x0000000300000003 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_03] \n\t"
+ "dli %[tmp0], 0x0001000000010000 \n\t"
+ "dmtc1 %[tmp0], %[ff_pw_mask] \n\t"
+ MMI_LI(%[tmp0], 0x02)
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
+ TRANSPOSE_4H
+
+ "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // d
+ "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
+ // c
+ "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
+ // b
+ "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t"
+
+ // a + d
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ // b + c
+ "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t"
+ // b - c
+ "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t"
+ // a - d
+ "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+
+ "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
+ TRANSPOSE_4H
+
+ // op[2], op[0]
+ "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t"
+ // op[3], op[1]
+ "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t"
+
+ // op[6], op[4]
+ "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t"
+ // op[7], op[5]
+ "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t"
+
+ // op[10], op[8]
+ "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t"
+ // op[11], op[9]
+ "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t"
+
+ // op[14], op[12]
+ "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t"
+ // op[15], op[13]
+ "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t"
+
+ // a1, a3
+ "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t"
+ // d1, d3
+ "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t"
+ // c1, c3
+ "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t"
+ // b1, b3
+ "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t"
+
+ // a1 + d1, a3 + d3
+ "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t"
+ // b1 + c1, b3 + c3
+ "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t"
+ // b1 - c1, b3 - c3
+ "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ // a1 - d1, a3 - d3
+ "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
+
+ // a2, a4
+ "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
+ // d2, d4
+ "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
+ // c2, c4
+ "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t"
+ // b2, b4
+ "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t"
+
+ // a2 + d2, a4 + d4
+ "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ // b2 + c2, b4 + c4
+ "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t"
+ // b2 - c2, b4 - c4
+ "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t"
+ // a2 - d2, a4 - d4
+ "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+
+ MMI_LI(%[tmp0], 0x03)
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t"
+ "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t"
+ "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t"
+ "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
+ "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t"
+ "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t"
+ "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t"
+ "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t"
+ "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t"
+ "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+
+ "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+
+ MMI_LI(%[tmp0], 0x72)
+ "dmtc1 %[tmp0], %[ftmp11] \n\t"
+ "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
+ "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
+ "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
+ "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
+ "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask),
+ [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01),
+ [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03),
+ [ff_ph_01]"=&f"(ff_ph_01)
+ : [op]"r"(output), [pitch]"r"((mips_reg)pitch)
+ : "memory"
+ );
+ /* clang-format on */
+}
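
The Walsh-Hadamard routine above has the same two-pass butterfly shape; the pmaddhw/pshufh work only widens the halfwords to 32 bits and reorders lanes so the second pass can bias, round and shift per column. A scalar sketch of the corresponding arithmetic, in the spirit of the generic C version (vp8_short_walsh4x4_c), again as a reading aid:

static void walsh4x4_scalar_sketch(int16_t *input, int16_t *output, int pitch) {
  int i, a1, b1, c1, d1, a2, b2, c2, d2;
  int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; ++i) { /* pass 1: rows, scaled by 4 */
    a1 = (ip[0] + ip[2]) * 4;
    d1 = (ip[1] + ip[3]) * 4;
    c1 = (ip[1] - ip[3]) * 4;
    b1 = (ip[0] - ip[2]) * 4;
    op[0] = a1 + d1 + (a1 != 0);
    op[1] = b1 + c1;
    op[2] = b1 - c1;
    op[3] = a1 - d1;
    ip += pitch / 2;
    op += 4;
  }
  ip = output;
  op = output;
  for (i = 0; i < 4; ++i) { /* pass 2: columns, bias negatives, round, >> 3 */
    a1 = ip[0] + ip[8];
    d1 = ip[4] + ip[12];
    c1 = ip[4] - ip[12];
    b1 = ip[0] - ip[8];
    a2 = a1 + d1;
    b2 = b1 + c1;
    c2 = b1 - c1;
    d2 = a1 - d1;
    a2 += a2 < 0;
    b2 += b2 < 0;
    c2 += c2 < 0;
    d2 += d2 < 0;
    op[0] = (a2 + 3) >> 3;
    op[4] = (b2 + 3) >> 3;
    op[8] = (c2 + 3) >> 3;
    op[12] = (d2 + 3) >> 3;
    ip++;
    op++;
  }
}

The pcmpgtw / ff_pw_01 sequence in the assembly is the "add 1 if negative" bias, and the final pshufh with selector 0x72 puts the packed results back into output order.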
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
new file mode 100644
index 0000000000..1986444aa3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/quant_common.h"
+
+#define REGULAR_SELECT_EOB(i, rc) \
+ z = coeff_ptr[rc]; \
+ sz = (z >> 31); \
+ x = (z ^ sz) - sz; \
+ zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \
+ if (x >= zbin) { \
+ x += round_ptr[rc]; \
+ y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \
+ if (y) { \
+ x = (y ^ sz) - sz; \
+ qcoeff_ptr[rc] = x; \
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \
+ eob = i; \
+ zbin_boost_ptr = b->zrun_zbin_boost; \
+ } \
+ }
+
+void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+ const int16_t *coeff_ptr = b->coeff;
+ const int16_t *round_ptr = b->round;
+ const int16_t *quant_ptr = b->quant_fast;
+ int16_t *qcoeff_ptr = d->qcoeff;
+ int16_t *dqcoeff_ptr = d->dqcoeff;
+ const int16_t *dequant_ptr = d->dequant;
+ const int16_t *inv_zig_zag = vp8_default_inv_zig_zag;
+
+ double ftmp[13];
+ uint64_t tmp[1];
+ int64_t eob = 0;
+ double ones;
+
+ __asm__ volatile(
+ // loop 0 ~ 7
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "pcmpeqh %[ones], %[ones], %[ones] \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t"
+ "dli %[tmp0], 0x0f \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t"
+
+ "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
+ "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+ "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
+ "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
+ "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t"
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t"
+ "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t"
+ "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+
+ "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
+ "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
+ "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t"
+ "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t"
+ "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ones] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ones] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+ "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t"
+
+ // loop 8 ~ 15
+ "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t"
+
+ "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
+ "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
+ "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
+ "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
+ "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t"
+ "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t"
+ "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t"
+ "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t"
+ "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+
+ "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
+ "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
+ "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
+ "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t"
+
+ "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t"
+ "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t"
+ "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t"
+ "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
+ "pxor %[ftmp5], %[ftmp5], %[ones] \n\t"
+ "pxor %[ftmp6], %[ftmp6], %[ones] \n\t"
+ "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
+ "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
+ "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"
+
+ "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t"
+ "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t"
+ "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t"
+ "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
+ "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
+ "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t"
+
+ "dli %[tmp0], 0x10 \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+
+ "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
+ "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "dli %[tmp0], 0xaa \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
+ "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
+ "dli %[tmp0], 0xffff \n\t"
+ "dmtc1 %[tmp0], %[ftmp9] \n\t"
+ "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t"
+ "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t"
+ : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+ [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+ [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+ [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+ [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+ [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
+ : [coeff_ptr] "r"((mips_reg)coeff_ptr),
+ [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
+ [dequant_ptr] "r"((mips_reg)dequant_ptr),
+ [round_ptr] "r"((mips_reg)round_ptr),
+ [quant_ptr] "r"((mips_reg)quant_ptr),
+ [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
+ [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
+ : "memory");
+
+ *d->eob = eob;
+}
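
Per coefficient, the vector code above does the same work as the generic fast quantizer: add the round value to |z|, keep the high 16 bits of the product with quant_fast, restore the sign, dequantize, and record the last nonzero position in zig-zag order (done here by masking inv_zig_zag entries and taking a horizontal maximum). A scalar sketch of that logic; the helper name and argument list are illustrative, with zig_zag standing for the forward scan table (vp8_default_zig_zag1d):

static void fast_quantize_scalar_sketch(const int16_t *coeff,
                                        const int16_t *round,
                                        const int16_t *quant,
                                        const int16_t *dequant,
                                        const int16_t *zig_zag,
                                        int16_t *qcoeff, int16_t *dqcoeff,
                                        char *eob) {
  int i, last = -1;
  for (i = 0; i < 16; ++i) {
    const int rc = zig_zag[i]; /* raster position of scan index i */
    const int z = coeff[rc];
    const int sz = z >> 31;    /* sign mask: 0 or -1 */
    int x = (z ^ sz) - sz;     /* abs(z) */
    const int y = ((x + round[rc]) * quant[rc]) >> 16;
    x = (y ^ sz) - sz;         /* restore the sign */
    qcoeff[rc] = (int16_t)x;
    dqcoeff[rc] = (int16_t)(x * dequant[rc]);
    if (y) last = i;           /* last nonzero coefficient in scan order */
  }
  *eob = (char)(last + 1);
}

The horizontal maximum over inv_zig_zag in the assembly yields the same value as last + 1, since that table stores 1-based scan positions.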
+
+void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+ int eob = 0;
+ int x, y, z, sz, zbin;
+ const int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+ const int16_t *coeff_ptr = b->coeff;
+ const int16_t *zbin_ptr = b->zbin;
+ const int16_t *round_ptr = b->round;
+ const int16_t *quant_ptr = b->quant;
+ const int16_t *quant_shift_ptr = b->quant_shift;
+ int16_t *qcoeff_ptr = d->qcoeff;
+ int16_t *dqcoeff_ptr = d->dqcoeff;
+ const int16_t *dequant_ptr = d->dequant;
+ const int16_t zbin_oq_value = b->zbin_extra;
+ register double ftmp0 asm("$f0");
+
+ // memset(qcoeff_ptr, 0, 32);
+ // memset(dqcoeff_ptr, 0, 32);
+ /* clang-format off */
+ __asm__ volatile (
+ "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t"
+
+ "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t"
+ "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t"
+ "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t"
+ : [ftmp0]"=&f"(ftmp0)
+ : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr)
+ : "memory"
+ );
+ /* clang-format on */
+
+ REGULAR_SELECT_EOB(1, 0);
+ REGULAR_SELECT_EOB(2, 1);
+ REGULAR_SELECT_EOB(3, 4);
+ REGULAR_SELECT_EOB(4, 8);
+ REGULAR_SELECT_EOB(5, 5);
+ REGULAR_SELECT_EOB(6, 2);
+ REGULAR_SELECT_EOB(7, 3);
+ REGULAR_SELECT_EOB(8, 6);
+ REGULAR_SELECT_EOB(9, 9);
+ REGULAR_SELECT_EOB(10, 12);
+ REGULAR_SELECT_EOB(11, 13);
+ REGULAR_SELECT_EOB(12, 10);
+ REGULAR_SELECT_EOB(13, 7);
+ REGULAR_SELECT_EOB(14, 11);
+ REGULAR_SELECT_EOB(15, 14);
+ REGULAR_SELECT_EOB(16, 15);
+
+ *d->eob = (char)eob;
+}
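
The unrolled REGULAR_SELECT_EOB calls visit the coefficients in the standard VP8 zig-zag order: the first argument is the 1-based scan position that becomes eob when the coefficient survives quantization, and the second is the raster index. Written as a loop, the sequence is equivalent to the sketch below (assuming the forward scan table vp8_default_zig_zag1d from the common code):

  int i;
  for (i = 0; i < 16; ++i) {
    REGULAR_SELECT_EOB(i + 1, vp8_default_zig_zag1d[i]);
  }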
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c
new file mode 100644
index 0000000000..3084667552
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/dct_msa.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \
+ \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \
+ PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \
+ PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \
+ }
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \
+ { \
+ v8i16 tmp0_m; \
+ \
+ SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \
+ ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \
+ }
+
+#define RET_1_IF_NZERO_H(in0) \
+ ({ \
+ v8i16 tmp0_m; \
+ v8i16 one_m = __msa_ldi_h(1); \
+ \
+ tmp0_m = __msa_ceqi_h(in0, 0); \
+ tmp0_m = tmp0_m ^ 255; \
+ tmp0_m = one_m & tmp0_m; \
+ \
+ tmp0_m; \
+ })
+
+#define RET_1_IF_NZERO_W(in0) \
+ ({ \
+ v4i32 tmp0_m; \
+ v4i32 one_m = __msa_ldi_w(1); \
+ \
+ tmp0_m = __msa_ceqi_w(in0, 0); \
+ tmp0_m = tmp0_m ^ 255; \
+ tmp0_m = one_m & tmp0_m; \
+ \
+ tmp0_m; \
+ })
+
+#define RET_1_IF_NEG_W(in0) \
+ ({ \
+ v4i32 tmp0_m; \
+ \
+ v4i32 one_m = __msa_ldi_w(1); \
+ tmp0_m = __msa_clti_s_w(in0, 0); \
+ tmp0_m = one_m & tmp0_m; \
+ \
+ tmp0_m; \
+ })
+
+void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
+ v8i16 in0, in1, in2, in3;
+ v8i16 temp0, temp1;
+ v8i16 const0, const1;
+ v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+ v4i32 out0, out1, out2, out3;
+ v8i16 zero = { 0 };
+
+ LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+ SLLI_4V(temp0, temp1, in1, in3, 3);
+ in0 = temp0 + temp1;
+ in2 = temp0 - temp1;
+ SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+ temp0 = __msa_ilvr_h(in3, in1);
+ in1 = __msa_splati_h(coeff, 3);
+ out0 = (v4i32)__msa_ilvev_h(zero, in1);
+ coeff = __msa_ilvl_h(zero, coeff);
+ out1 = __msa_splati_w((v4i32)coeff, 0);
+ DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
+ out0 >>= 12;
+ out1 >>= 12;
+ PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+ in0 = temp0 + temp1 + 7;
+ in2 = temp0 - temp1 + 7;
+ in0 >>= 4;
+ in2 >>= 4;
+ ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
+ temp1 = RET_1_IF_NZERO_H(in3);
+ ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
+ SPLATI_W2_SW(coeff, 2, out3, out1);
+ out3 += out1;
+ out1 = __msa_splati_w((v4i32)coeff, 1);
+ DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
+ out1 >>= 16;
+ out3 >>= 16;
+ out1 += (v4i32)temp1;
+ PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
+
+void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
+ v8i16 in0, in1, in2, in3;
+ v8i16 temp0, temp1, tmp0, tmp1;
+ v8i16 const0, const1, const2;
+ v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+ v8i16 zero = { 0 };
+ v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
+
+ LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+ TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+ SLLI_4V(temp0, temp1, in1, in3, 3);
+ in0 = temp0 + temp1;
+ in2 = temp0 - temp1;
+ SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+ temp0 = __msa_splati_h(coeff, 3);
+ vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
+ coeff = __msa_ilvl_h(zero, coeff);
+ vec3_w = __msa_splati_w((v4i32)coeff, 0);
+ ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+ vec0_w = vec1_w;
+ vec2_w = vec3_w;
+ DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
+ vec1_w, vec2_w, vec3_w);
+ SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
+ PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+ TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+ in0 = temp0 + temp1 + 7;
+ in2 = temp0 - temp1 + 7;
+ in0 >>= 4;
+ in2 >>= 4;
+ SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
+ vec3_w += vec1_w;
+ vec1_w = __msa_splati_w((v4i32)coeff, 1);
+ const0 = RET_1_IF_NZERO_H(in3);
+ ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+ vec0_w = vec1_w;
+ vec2_w = vec3_w;
+ DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
+ vec1_w, vec2_w, vec3_w);
+ SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
+ PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+ in1 += const0;
+ PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
+ ST_SH2(temp0, temp1, output, 8);
+
+ PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output + 16, 8);
+}
+
+void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
+ v8i16 in0_h, in1_h, in2_h, in3_h;
+ v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
+
+ LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
+ TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
+
+ UNPCK_R_SH_SW(in0_h, in0_w);
+ UNPCK_R_SH_SW(in1_h, in1_w);
+ UNPCK_R_SH_SW(in2_h, in2_w);
+ UNPCK_R_SH_SW(in3_h, in3_w);
+ BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+ SLLI_4V(temp0, temp1, temp2, temp3, 2);
+ BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+ temp0 = RET_1_IF_NZERO_W(temp0);
+ in0_w += temp0;
+ TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
+
+ BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+ BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+ in0_w += RET_1_IF_NEG_W(in0_w);
+ in1_w += RET_1_IF_NEG_W(in1_w);
+ in2_w += RET_1_IF_NEG_W(in2_w);
+ in3_w += RET_1_IF_NEG_W(in3_w);
+ ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
+ SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
+ PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
+ ST_SH2(in0_h, in1_h, output, 8);
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c
new file mode 100644
index 0000000000..f8b653a9a7
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/denoising_msa.c
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/denoising.h"
+
+int32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr,
+ int32_t mc_avg_y_stride,
+ uint8_t *running_avg_y_ptr,
+ int32_t avg_y_stride, uint8_t *sig_ptr,
+ int32_t sig_stride, uint32_t motion_magnitude,
+ int32_t increase_denoising) {
+ uint8_t *running_avg_y_start = running_avg_y_ptr;
+ uint8_t *sig_start = sig_ptr;
+ int32_t cnt = 0;
+ int32_t sum_diff = 0;
+ int32_t shift_inc1 = 3;
+ int32_t delta = 0;
+ int32_t sum_diff_thresh;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 mc_running_avg_y0, running_avg_y, sig0;
+ v16u8 mc_running_avg_y1, running_avg_y1, sig1;
+ v16u8 coeff0, coeff1;
+ v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1;
+ v8i16 adjust0, adjust1, adjust2, adjust3;
+ v8i16 shift_inc1_vec = { 0 };
+ v8i16 col_sum0 = { 0 };
+ v8i16 col_sum1 = { 0 };
+ v8i16 col_sum2 = { 0 };
+ v8i16 col_sum3 = { 0 };
+ v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec;
+ v4i32 temp0_w;
+ v2i64 temp0_d, temp1_d;
+ v8i16 zero = { 0 };
+ v8i16 one = __msa_ldi_h(1);
+ v8i16 four = __msa_ldi_h(4);
+ v8i16 val_127 = __msa_ldi_h(127);
+ v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };
+
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ adj_val = __msa_add_a_h(adj_val, one);
+ if (increase_denoising) {
+ adj_val = __msa_add_a_h(adj_val, one);
+ shift_inc1 = 4;
+ }
+
+ temp0_h = zero - adj_val;
+ adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
+ }
+
+ adj_val = __msa_insert_h(adj_val, 3, cnt);
+ adj_val = __msa_insert_h(adj_val, 7, cnt);
+ shift_inc1_vec = __msa_fill_h(shift_inc1);
+
+ for (cnt = 8; cnt--;) {
+ v8i16 mask0 = { 0 };
+ v8i16 mask1 = { 0 };
+
+ mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+ sig0 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ mc_running_avg_y_ptr += mc_avg_y_stride;
+
+ mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+ sig1 = LD_UB(sig_ptr);
+
+ ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
+ HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+ abs_diff0 = __msa_add_a_h(diff0, zero);
+ abs_diff1 = __msa_add_a_h(diff1, zero);
+ cmp = __msa_clei_s_h(abs_diff0, 15);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = __msa_clei_s_h(abs_diff0, 7);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = abs_diff0 < shift_inc1_vec;
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = __msa_clei_s_h(abs_diff1, 15);
+ cmp = cmp & one;
+ mask1 += cmp;
+ cmp = __msa_clei_s_h(abs_diff1, 7);
+ cmp = cmp & one;
+ mask1 += cmp;
+ cmp = abs_diff1 < shift_inc1_vec;
+ cmp = cmp & one;
+ mask1 += cmp;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ temp0_h = temp0_h & four;
+ mask0 += temp0_h;
+ temp1_h = __msa_clei_s_h(diff1, 0);
+ temp1_h = temp1_h & four;
+ mask1 += temp1_h;
+ VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
+ adjust1);
+ temp2_h = __msa_ceqi_h(adjust0, 0);
+ temp3_h = __msa_ceqi_h(adjust1, 0);
+ adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
+ adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h);
+ ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
+ UNPCK_UB_SH(sig0, temp0_h, temp1_h);
+ ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
+ MAXI_SH2_SH(temp0_h, temp1_h, 0);
+ SAT_UH2_SH(temp0_h, temp1_h, 7);
+ temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
+ running_avg_y =
+ __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h);
+ ST_UB(running_avg_y, running_avg_y_ptr);
+ running_avg_y_ptr += avg_y_stride;
+
+ mask0 = zero;
+ mask1 = zero;
+ ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
+ HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+ abs_diff0 = __msa_add_a_h(diff0, zero);
+ abs_diff1 = __msa_add_a_h(diff1, zero);
+ cmp = __msa_clei_s_h(abs_diff0, 15);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = __msa_clei_s_h(abs_diff0, 7);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = abs_diff0 < shift_inc1_vec;
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = __msa_clei_s_h(abs_diff1, 15);
+ cmp = cmp & one;
+ mask1 += cmp;
+ cmp = __msa_clei_s_h(abs_diff1, 7);
+ cmp = cmp & one;
+ mask1 += cmp;
+ cmp = abs_diff1 < shift_inc1_vec;
+ cmp = cmp & one;
+ mask1 += cmp;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ temp0_h = temp0_h & four;
+ mask0 += temp0_h;
+ temp1_h = __msa_clei_s_h(diff1, 0);
+ temp1_h = temp1_h & four;
+ mask1 += temp1_h;
+ VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
+ adjust1);
+ temp2_h = __msa_ceqi_h(adjust0, 0);
+ temp3_h = __msa_ceqi_h(adjust1, 0);
+ adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
+ adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1, (v16u8)temp3_h);
+ ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
+ UNPCK_UB_SH(sig1, temp0_h, temp1_h);
+ ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
+ MAXI_SH2_SH(temp0_h, temp1_h, 0);
+ SAT_UH2_SH(temp0_h, temp1_h, 7);
+ temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
+ running_avg_y =
+ __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h);
+ ST_UB(running_avg_y, running_avg_y_ptr);
+ sig_ptr += sig_stride;
+ mc_running_avg_y_ptr += mc_avg_y_stride;
+ running_avg_y_ptr += avg_y_stride;
+ }
+
+ col_sum0 = __msa_min_s_h(col_sum0, val_127);
+ col_sum1 = __msa_min_s_h(col_sum1, val_127);
+ temp0_h = col_sum0 + col_sum1;
+ temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+ temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+ temp1_d = __msa_splati_d(temp0_d, 1);
+ temp0_d += temp1_d;
+ sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+ sig_ptr -= sig_stride * 16;
+ mc_running_avg_y_ptr -= mc_avg_y_stride * 16;
+ running_avg_y_ptr -= avg_y_stride * 16;
+
+  sum_diff_thresh = SUM_DIFF_THRESHOLD;
+  if (increase_denoising) {
+    sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+  }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+ delta_vec = __msa_fill_h(delta);
+ if (delta < 4) {
+ for (cnt = 8; cnt--;) {
+ running_avg_y = LD_UB(running_avg_y_ptr);
+ mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+ sig0 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ mc_running_avg_y_ptr += mc_avg_y_stride;
+ running_avg_y_ptr += avg_y_stride;
+ mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+ sig1 = LD_UB(sig_ptr);
+ running_avg_y1 = LD_UB(running_avg_y_ptr);
+ ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
+ HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+ abs_diff0 = __msa_add_a_h(diff0, zero);
+ abs_diff1 = __msa_add_a_h(diff1, zero);
+ temp0_h = abs_diff0 < delta_vec;
+ temp1_h = abs_diff1 < delta_vec;
+ abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec,
+ (v16u8)temp0_h);
+ abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec,
+ (v16u8)temp1_h);
+ SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1);
+ abs_diff_neg0 = zero - abs_diff0;
+ abs_diff_neg1 = zero - abs_diff1;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ temp1_h = __msa_clei_s_h(diff1, 0);
+ adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
+ (v16u8)temp0_h);
+ adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1,
+ (v16u8)temp1_h);
+ ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h);
+ ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
+ MAXI_SH2_SH(adjust2, adjust3, 0);
+ SAT_UH2_SH(adjust2, adjust3, 7);
+ temp0_h = __msa_ceqi_h(diff0, 0);
+ temp1_h = __msa_ceqi_h(diff1, 0);
+ adjust2 =
+ (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
+ adjust3 =
+ (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h);
+ adjust0 =
+ (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
+ adjust1 =
+ (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h);
+ ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2);
+ ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride);
+ ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
+ HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+ abs_diff0 = __msa_add_a_h(diff0, zero);
+ abs_diff1 = __msa_add_a_h(diff1, zero);
+ temp0_h = abs_diff0 < delta_vec;
+ temp1_h = abs_diff1 < delta_vec;
+ abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)delta_vec,
+ (v16u8)temp0_h);
+ abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1, (v16u8)delta_vec,
+ (v16u8)temp1_h);
+ SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0, abs_diff_neg1);
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ temp1_h = __msa_clei_s_h(diff1, 0);
+ adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
+ (v16u8)temp0_h);
+ adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1, (v16u8)abs_diff_neg1,
+ (v16u8)temp1_h);
+ ILVRL_H2_SH(zero, running_avg_y1, temp2_h, temp3_h);
+ ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
+ MAXI_SH2_SH(adjust2, adjust3, 0);
+ SAT_UH2_SH(adjust2, adjust3, 7);
+ temp0_h = __msa_ceqi_h(diff0, 0);
+ temp1_h = __msa_ceqi_h(diff1, 0);
+ adjust2 =
+ (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
+ adjust3 =
+ (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h, (v16u8)temp1_h);
+ adjust0 =
+ (v8i16)__msa_bmz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
+ adjust1 =
+ (v8i16)__msa_bmz_v((v16u8)adjust1, (v16u8)zero, (v16u8)temp1_h);
+ ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3, (v16i8)adjust2);
+ ST_UB(running_avg_y, running_avg_y_ptr);
+ running_avg_y_ptr += avg_y_stride;
+ }
+
+ col_sum2 = __msa_min_s_h(col_sum2, val_127);
+ col_sum3 = __msa_min_s_h(col_sum3, val_127);
+ temp0_h = col_sum2 + col_sum3;
+ temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+ temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+ temp1_d = __msa_splati_d(temp0_d, 1);
+ temp0_d += (v2i64)temp1_d;
+ sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+ if (abs(sum_diff) > SUM_DIFF_THRESHOLD) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+
+ LD_UB8(sig_start, sig_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ sig_start += (8 * sig_stride);
+ LD_UB8(sig_start, sig_stride, src8, src9, src10, src11, src12, src13, src14,
+ src15);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, running_avg_y_start,
+ avg_y_stride);
+ running_avg_y_start += (8 * avg_y_stride);
+ ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+ running_avg_y_start, avg_y_stride);
+
+ return FILTER_BLOCK;
+}
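
The main loop above applies a simple per-pixel rule: a small difference against the motion-compensated average is copied through, a larger one pulls the source pixel toward that average by a magnitude-dependent step (the adj_val entries), and the signed steps are summed into sum_diff to decide whether the filtered block is kept. A scalar sketch of that rule for one pixel; the helper and its parameter names are illustrative, the thresholds follow the code above, and abs() plus the fixed-width types come from the existing includes:

static uint8_t denoise_pixel_sketch(int mc, int sig, int small_thresh,
                                    const int adj[3] /* e.g. { 3, 4, 6 } */,
                                    int *sum_diff) {
  const int diff = mc - sig; /* mc running average minus source pixel */
  const int absdiff = abs(diff);
  int adjustment, out;

  if (absdiff < small_thresh) { /* shift_inc1 above: take the mc pixel */
    *sum_diff += diff;
    return (uint8_t)mc;
  }
  if (absdiff <= 7) {
    adjustment = adj[0];
  } else if (absdiff <= 15) {
    adjustment = adj[1];
  } else {
    adjustment = adj[2];
  }
  if (diff > 0) {
    out = sig + adjustment;
    *sum_diff += adjustment;
  } else {
    out = sig - adjustment;
    *sum_diff -= adjustment;
  }
  return (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out));
}

The second loop (the delta < 4 branch) repeats the pattern with a small capped step once |sum_diff| exceeds the threshold, and the function falls back to COPY_BLOCK when even that leaves the accumulated difference too large.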
+
+int32_t vp8_denoiser_filter_uv_msa(
+ uint8_t *mc_running_avg_y_ptr, int32_t mc_avg_y_stride,
+ uint8_t *running_avg_y_ptr, int32_t avg_y_stride, uint8_t *sig_ptr,
+ int32_t sig_stride, uint32_t motion_magnitude, int32_t increase_denoising) {
+ uint8_t *running_avg_y_start = running_avg_y_ptr;
+ uint8_t *sig_start = sig_ptr;
+ int32_t cnt = 0;
+ int32_t sum_diff = 0;
+ int32_t shift_inc1 = 3;
+ int32_t delta = 0;
+ int32_t sum_block = 0;
+ int32_t sum_diff_thresh;
+ int64_t dst0, dst1, src0, src1, src2, src3;
+ v16u8 mc_running_avg_y0, running_avg_y, sig0;
+ v16u8 mc_running_avg_y1, running_avg_y1, sig1;
+ v16u8 sig2, sig3, sig4, sig5, sig6, sig7;
+ v16u8 coeff0;
+ v8i16 diff0, abs_diff0, abs_diff_neg0;
+ v8i16 adjust0, adjust2;
+ v8i16 shift_inc1_vec = { 0 };
+ v8i16 col_sum0 = { 0 };
+ v8i16 temp0_h, temp2_h, cmp, delta_vec;
+ v4i32 temp0_w;
+ v2i64 temp0_d, temp1_d;
+ v16i8 zero = { 0 };
+ v8i16 one = __msa_ldi_h(1);
+ v8i16 four = __msa_ldi_h(4);
+ v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };
+
+ sig0 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
+ sig1 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
+ sig2 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2);
+ sig3 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3);
+ sig4 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4);
+ sig5 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5);
+ sig6 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6);
+ sig7 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7);
+ temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+ temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+ temp1_d = __msa_splati_d(temp0_d, 1);
+ temp0_d += temp1_d;
+ sum_block = __msa_copy_s_w((v4i32)temp0_d, 0);
+ sig_ptr -= sig_stride * 8;
+
+ if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+ return COPY_BLOCK;
+ }
+
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ adj_val = __msa_add_a_h(adj_val, one);
+
+ if (increase_denoising) {
+ adj_val = __msa_add_a_h(adj_val, one);
+ shift_inc1 = 4;
+ }
+
+ temp0_h = (v8i16)zero - adj_val;
+ adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
+ }
+
+ adj_val = __msa_insert_h(adj_val, 3, cnt);
+ adj_val = __msa_insert_h(adj_val, 7, cnt);
+ shift_inc1_vec = __msa_fill_h(shift_inc1);
+ for (cnt = 4; cnt--;) {
+ v8i16 mask0 = { 0 };
+ mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+ sig0 = LD_UB(sig_ptr);
+ sig_ptr += sig_stride;
+ mc_running_avg_y_ptr += mc_avg_y_stride;
+ mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+ sig1 = LD_UB(sig_ptr);
+ coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
+ diff0 = __msa_hsub_u_h(coeff0, coeff0);
+ abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+ cmp = __msa_clei_s_h(abs_diff0, 15);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = __msa_clei_s_h(abs_diff0, 7);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = abs_diff0 < shift_inc1_vec;
+ cmp = cmp & one;
+ mask0 += cmp;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ temp0_h = temp0_h & four;
+ mask0 += temp0_h;
+ adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
+ temp2_h = __msa_ceqi_h(adjust0, 0);
+ adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
+ col_sum0 += adjust0;
+ temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
+ temp0_h += adjust0;
+ temp0_h = __msa_maxi_s_h(temp0_h, 0);
+ temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
+ temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
+ running_avg_y =
+ __msa_bmnz_v(running_avg_y, mc_running_avg_y0, (v16u8)temp2_h);
+ dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+ SD(dst0, running_avg_y_ptr);
+ running_avg_y_ptr += avg_y_stride;
+
+ mask0 = __msa_ldi_h(0);
+ coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
+ diff0 = __msa_hsub_u_h(coeff0, coeff0);
+ abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+ cmp = __msa_clei_s_h(abs_diff0, 15);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = __msa_clei_s_h(abs_diff0, 7);
+ cmp = cmp & one;
+ mask0 += cmp;
+ cmp = abs_diff0 < shift_inc1_vec;
+ cmp = cmp & one;
+ mask0 += cmp;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ temp0_h = temp0_h & four;
+ mask0 += temp0_h;
+ adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
+ temp2_h = __msa_ceqi_h(adjust0, 0);
+ adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0, (v16u8)temp2_h);
+ col_sum0 += adjust0;
+ temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
+ temp0_h += adjust0;
+ temp0_h = __msa_maxi_s_h(temp0_h, 0);
+ temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
+
+ temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
+ running_avg_y =
+ __msa_bmnz_v(running_avg_y, mc_running_avg_y1, (v16u8)temp2_h);
+ dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+ SD(dst1, running_avg_y_ptr);
+
+ sig_ptr += sig_stride;
+ mc_running_avg_y_ptr += mc_avg_y_stride;
+ running_avg_y_ptr += avg_y_stride;
+ }
+
+ temp0_h = col_sum0;
+ temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+ temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+ temp1_d = __msa_splati_d(temp0_d, 1);
+ temp0_d += temp1_d;
+ sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+ sig_ptr -= sig_stride * 8;
+ mc_running_avg_y_ptr -= mc_avg_y_stride * 8;
+ running_avg_y_ptr -= avg_y_stride * 8;
+ sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+
+ if (increase_denoising) {
+ sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+ }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+ delta_vec = __msa_fill_h(delta);
+ if (delta < 4) {
+ for (cnt = 4; cnt--;) {
+ running_avg_y = LD_UB(running_avg_y_ptr);
+ mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+ sig0 = LD_UB(sig_ptr);
+ /* Update pointers for next iteration. */
+ sig_ptr += sig_stride;
+ mc_running_avg_y_ptr += mc_avg_y_stride;
+ running_avg_y_ptr += avg_y_stride;
+
+ mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+ sig1 = LD_UB(sig_ptr);
+ running_avg_y1 = LD_UB(running_avg_y_ptr);
+
+ coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
+ diff0 = __msa_hsub_u_h(coeff0, coeff0);
+ abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+ temp0_h = delta_vec < abs_diff0;
+ abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec,
+ (v16u8)temp0_h);
+ abs_diff_neg0 = (v8i16)zero - abs_diff0;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
+ (v16u8)temp0_h);
+ temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y);
+ adjust2 = temp2_h + adjust0;
+ adjust2 = __msa_maxi_s_h(adjust2, 0);
+ adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
+ temp0_h = __msa_ceqi_h(diff0, 0);
+ adjust2 =
+ (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
+ adjust0 =
+ (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
+ col_sum0 += adjust0;
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2);
+ dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+ SD(dst0, running_avg_y_ptr - avg_y_stride);
+
+ coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
+ diff0 = __msa_hsub_u_h(coeff0, coeff0);
+ abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+ temp0_h = delta_vec < abs_diff0;
+ abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0, (v16u8)delta_vec,
+ (v16u8)temp0_h);
+ abs_diff_neg0 = (v8i16)zero - abs_diff0;
+ temp0_h = __msa_clei_s_h(diff0, 0);
+ adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0, (v16u8)abs_diff_neg0,
+ (v16u8)temp0_h);
+ temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1);
+ adjust2 = temp2_h + adjust0;
+ adjust2 = __msa_maxi_s_h(adjust2, 0);
+ adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
+ temp0_h = __msa_ceqi_h(diff0, 0);
+ adjust2 =
+ (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h, (v16u8)temp0_h);
+ adjust0 =
+ (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero, (v16u8)temp0_h);
+ col_sum0 += adjust0;
+ running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2, (v16i8)adjust2);
+ dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+ SD(dst1, running_avg_y_ptr);
+ running_avg_y_ptr += avg_y_stride;
+ }
+
+ temp0_h = col_sum0;
+ temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+ temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+ temp1_d = __msa_splati_d(temp0_d, 1);
+ temp0_d += temp1_d;
+ sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+
+ LD4(sig_start, sig_stride, src0, src1, src2, src3);
+ sig_start += (4 * sig_stride);
+ SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
+ running_avg_y_start += (4 * avg_y_stride);
+
+ LD4(sig_start, sig_stride, src0, src1, src2, src3);
+ SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
+
+ return FILTER_BLOCK;
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c
new file mode 100644
index 0000000000..2bcddb6235
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/block.h"
+
+int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) {
+ int32_t err = 0;
+ uint32_t loop_cnt;
+ v8i16 coeff, dq_coeff, coeff0, coeff1;
+ v4i32 diff0, diff1;
+ v2i64 err0 = { 0 };
+ v2i64 err1 = { 0 };
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ coeff = LD_SH(coeff_ptr);
+ dq_coeff = LD_SH(dq_coeff_ptr);
+ ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DPADD_SD2_SD(diff0, diff1, err0, err1);
+ coeff_ptr += 8;
+ dq_coeff_ptr += 8;
+ }
+
+ err0 += __msa_splati_d(err0, 1);
+ err1 += __msa_splati_d(err1, 1);
+ err = __msa_copy_s_d(err0, 0);
+ err += __msa_copy_s_d(err1, 0);
+
+ return err;
+}
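
The routine above is a plain sum of squared differences between the original and dequantized coefficients; the MSA code just processes eight lanes at a time and folds two 64-bit accumulators at the end. A scalar equivalent (a sketch matching the generic vp8_block_error_c):

static int32_t block_error_scalar_sketch(const int16_t *coeff,
                                         const int16_t *dqcoeff) {
  int32_t err = 0;
  int i;
  for (i = 0; i < 16; ++i) {
    const int diff = coeff[i] - dqcoeff[i]; /* per-coefficient residual */
    err += diff * diff;                     /* accumulate squared error */
  }
  return err;
}

The mbblock and mbuverror variants below accumulate the same quantity over several blocks, optionally masking out the DC coefficient.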
+
+int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc) {
+ BLOCK *be;
+ BLOCKD *bd;
+ int16_t *coeff_ptr, *dq_coeff_ptr;
+ int32_t err = 0;
+ uint32_t loop_cnt;
+ v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4;
+ v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4;
+ v4i32 diff0, diff1;
+ v2i64 err0, err1;
+ v16u8 zero = { 0 };
+ v16u8 mask0 = (v16u8)__msa_ldi_b(255);
+
+ if (1 == dc) {
+ mask0 = (v16u8)__msa_insve_w((v4i32)mask0, 0, (v4i32)zero);
+ }
+
+ for (loop_cnt = 0; loop_cnt < 8; ++loop_cnt) {
+ be = &mb->block[2 * loop_cnt];
+ bd = &mb->e_mbd.block[2 * loop_cnt];
+ coeff_ptr = be->coeff;
+ dq_coeff_ptr = bd->dqcoeff;
+ coeff = LD_SH(coeff_ptr);
+ dq_coeff = LD_SH(dq_coeff_ptr);
+ coeff_ptr += 8;
+ dq_coeff_ptr += 8;
+ coeff2 = LD_SH(coeff_ptr);
+ dq_coeff2 = LD_SH(dq_coeff_ptr);
+ be = &mb->block[2 * loop_cnt + 1];
+ bd = &mb->e_mbd.block[2 * loop_cnt + 1];
+ coeff_ptr = be->coeff;
+ dq_coeff_ptr = bd->dqcoeff;
+ coeff3 = LD_SH(coeff_ptr);
+ dq_coeff3 = LD_SH(dq_coeff_ptr);
+ coeff_ptr += 8;
+ dq_coeff_ptr += 8;
+ coeff4 = LD_SH(coeff_ptr);
+ dq_coeff4 = LD_SH(dq_coeff_ptr);
+ ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0);
+ DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+ ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DPADD_SD2_SD(diff0, diff1, err0, err1);
+ err0 += __msa_splati_d(err0, 1);
+ err1 += __msa_splati_d(err1, 1);
+ err += __msa_copy_s_d(err0, 0);
+ err += __msa_copy_s_d(err1, 0);
+
+ ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0);
+ DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+ ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DPADD_SD2_SD(diff0, diff1, err0, err1);
+ err0 += __msa_splati_d(err0, 1);
+ err1 += __msa_splati_d(err1, 1);
+ err += __msa_copy_s_d(err0, 0);
+ err += __msa_copy_s_d(err1, 0);
+ }
+
+ return err;
+}
+
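+/* Same coefficient error, summed over the eight chroma blocks
+   (blocks 16..23) of a macroblock. */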
+int32_t vp8_mbuverror_msa(MACROBLOCK *mb) {
+ BLOCK *be;
+ BLOCKD *bd;
+ int16_t *coeff_ptr, *dq_coeff_ptr;
+ int32_t err = 0;
+ uint32_t loop_cnt;
+ v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4;
+ v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4;
+ v4i32 diff0, diff1;
+ v2i64 err0, err1, err_dup0, err_dup1;
+
+ for (loop_cnt = 16; loop_cnt < 24; loop_cnt += 2) {
+ be = &mb->block[loop_cnt];
+ bd = &mb->e_mbd.block[loop_cnt];
+ coeff_ptr = be->coeff;
+ dq_coeff_ptr = bd->dqcoeff;
+ coeff = LD_SH(coeff_ptr);
+ dq_coeff = LD_SH(dq_coeff_ptr);
+ coeff_ptr += 8;
+ dq_coeff_ptr += 8;
+ coeff2 = LD_SH(coeff_ptr);
+ dq_coeff2 = LD_SH(dq_coeff_ptr);
+ be = &mb->block[loop_cnt + 1];
+ bd = &mb->e_mbd.block[loop_cnt + 1];
+ coeff_ptr = be->coeff;
+ dq_coeff_ptr = bd->dqcoeff;
+ coeff3 = LD_SH(coeff_ptr);
+ dq_coeff3 = LD_SH(dq_coeff_ptr);
+ coeff_ptr += 8;
+ dq_coeff_ptr += 8;
+ coeff4 = LD_SH(coeff_ptr);
+ dq_coeff4 = LD_SH(dq_coeff_ptr);
+
+ ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+
+ ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DPADD_SD2_SD(diff0, diff1, err0, err1);
+ err_dup0 = __msa_splati_d(err0, 1);
+ err_dup1 = __msa_splati_d(err1, 1);
+ ADD2(err0, err_dup0, err1, err_dup1, err0, err1);
+ err += __msa_copy_s_d(err0, 0);
+ err += __msa_copy_s_d(err1, 0);
+
+ ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
+ ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1);
+ HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
+ DPADD_SD2_SD(diff0, diff1, err0, err1);
+ err_dup0 = __msa_splati_d(err0, 1);
+ err_dup1 = __msa_splati_d(err1, 1);
+ ADD2(err0, err_dup0, err1, err_dup1, err0, err1);
+ err += __msa_copy_s_d(err0, 0);
+ err += __msa_copy_s_d(err1, 0);
+ }
+
+ return err;
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c
new file mode 100644
index 0000000000..9f5fbd39c8
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/quantize_msa.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/block.h"
+
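+/* Fast quantizer for one 4x4 block.  Coefficients are gathered in zigzag
+   order, quantized as sign(z) * (((abs(z) + round) * quant) >> 16),
+   scattered back to raster order and dequantized into dq_coeff.  The
+   return value is the last nonzero zigzag index plus one (the eob). */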
+static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round,
+ int16_t *quant, int16_t *de_quant,
+ int16_t *q_coeff, int16_t *dq_coeff) {
+ int32_t cnt, eob;
+ v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
+ v8i16 round0, round1;
+ v8i16 sign_z0, sign_z1;
+ v8i16 q_coeff0, q_coeff1;
+ v8i16 x0, x1, de_quant0, de_quant1;
+ v8i16 coeff0, coeff1, z0, z1;
+ v8i16 quant0, quant1, quant2, quant3;
+ v8i16 zero = { 0 };
+ v8i16 inv_zig_zag0, inv_zig_zag1;
+ v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+ v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+ v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
+ v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
+
+ ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
+ eob = -1;
+ LD_SH2(coeff_ptr, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
+ z1);
+ LD_SH2(round, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
+ round1);
+ LD_SH2(quant, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
+ quant2);
+ sign_z0 = z0 >> 15;
+ sign_z1 = z1 >> 15;
+ x0 = __msa_add_a_h(z0, zero);
+ x1 = __msa_add_a_h(z1, zero);
+ ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+ ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+ ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
+ ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
+ DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+ quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+ SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+ PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
+ x0 = x0 ^ sign_z0;
+ x1 = x1 ^ sign_z1;
+ SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
+ VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
+ ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
+ LD_SH2(de_quant, 8, de_quant0, de_quant1);
+ q_coeff0 *= de_quant0;
+ q_coeff1 *= de_quant1;
+ ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);
+
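+  /* Scan from the highest zigzag position downwards for the last nonzero
+     quantized coefficient: x1 holds zigzag positions 8..15, x0 positions
+     0..7. */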
+ for (cnt = 0; cnt < 16; ++cnt) {
+ if ((cnt <= 7) && (x1[7 - cnt] != 0)) {
+ eob = (15 - cnt);
+ break;
+ }
+
+ if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0)) {
+ eob = (7 - (cnt - 8));
+ break;
+ }
+ }
+
+ return (int8_t)(eob + 1);
+}
+
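+/* Regular (exact) quantizer for one 4x4 block.  The vector code performs
+   the two-stage multiply by quant and quant_shift on the rounded
+   magnitudes; the scalar loop afterwards applies the zero-bin test
+   (zbin + zbin_oq plus the zero-run boost) position by position in
+   zigzag order. */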
+static int8_t exact_regular_quantize_b_msa(
+ int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
+ int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
+ int16_t *q_coeff, int16_t *dq_coeff) {
+ int32_t cnt, eob;
+ int16_t *boost_temp = zbin_boost;
+ v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
+ v8i16 round0, round1;
+ v8i16 sign_z0, sign_z1;
+ v8i16 q_coeff0, q_coeff1;
+ v8i16 z_bin0, z_bin1, zbin_o_q;
+ v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
+ v8i16 coeff0, coeff1, z0, z1;
+ v8i16 quant0, quant1, quant2, quant3;
+ v8i16 zero = { 0 };
+ v8i16 inv_zig_zag0, inv_zig_zag1;
+ v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+ v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+ v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
+ v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
+
+ ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
+ zbin_o_q = __msa_fill_h(zbin_oq_in);
+ eob = -1;
+ LD_SH2(coeff_ptr, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
+ z1);
+ LD_SH2(round, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
+ round1);
+ LD_SH2(quant, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
+ quant2);
+ LD_SH2(zbin, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z_bin0,
+ z_bin1);
+ sign_z0 = z0 >> 15;
+ sign_z1 = z1 >> 15;
+ x0 = __msa_add_a_h(z0, zero);
+ x1 = __msa_add_a_h(z1, zero);
+ SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
+ SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
+ ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+ ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+ ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
+ ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
+ DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+ quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+ SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+ PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
+ LD_SH2(quant_shift, 8, coeff0, coeff1);
+ VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
+ quant2);
+ ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+ ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+ ADD2(x0, round0, x1, round1, x0, x1);
+ ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
+ ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
+ DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+ quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+ SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+ PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
+ sign_x0 = x0 ^ sign_z0;
+ sign_x1 = x1 ^ sign_z1;
+ SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
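+  /* Zero-bin pass: a position is kept only when abs(z) - zbin - zbin_oq
+     has reached the current zero-run boost; otherwise its output is forced
+     to zero.  The boost pointer restarts after every surviving nonzero
+     coefficient. */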
+ for (cnt = 0; cnt < 16; ++cnt) {
+ if (cnt <= 7) {
+ if (boost_temp[0] <= z_bin0[cnt]) {
+ if (x0[cnt]) {
+ eob = cnt;
+ boost_temp = zbin_boost;
+ } else {
+ boost_temp++;
+ }
+ } else {
+ sign_x0[cnt] = 0;
+ boost_temp++;
+ }
+ } else {
+ if (boost_temp[0] <= z_bin1[cnt - 8]) {
+ if (x1[cnt - 8]) {
+ eob = cnt;
+ boost_temp = zbin_boost;
+ } else {
+ boost_temp++;
+ }
+ } else {
+ sign_x1[cnt - 8] = 0;
+ boost_temp++;
+ }
+ }
+ }
+
+ VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
+ q_coeff0, q_coeff1);
+ ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
+ LD_SH2(de_quant, 8, de_quant0, de_quant1);
+ MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
+ ST_SH2(de_quant0, de_quant1, dq_coeff, 8);
+
+ return (int8_t)(eob + 1);
+}
+
+void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) {
+ int16_t *coeff_ptr = b->coeff;
+ int16_t *round_ptr = b->round;
+ int16_t *quant_ptr = b->quant_fast;
+ int16_t *qcoeff_ptr = d->qcoeff;
+ int16_t *dqcoeff_ptr = d->dqcoeff;
+ int16_t *dequant_ptr = d->dequant;
+
+ *d->eob = fast_quantize_b_msa(coeff_ptr, round_ptr, quant_ptr, dequant_ptr,
+ qcoeff_ptr, dqcoeff_ptr);
+}
+
+void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) {
+ int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+ int16_t *coeff_ptr = b->coeff;
+ int16_t *zbin_ptr = b->zbin;
+ int16_t *round_ptr = b->round;
+ int16_t *quant_ptr = b->quant;
+ int16_t *quant_shift_ptr = b->quant_shift;
+ int16_t *qcoeff_ptr = d->qcoeff;
+ int16_t *dqcoeff_ptr = d->dqcoeff;
+ int16_t *dequant_ptr = d->dequant;
+ int16_t zbin_oq_value = b->zbin_extra;
+
+ *d->eob = exact_regular_quantize_b_msa(
+ zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000000..fb83f07bd2
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
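+/* Temporal filter for a 16x16 block: eight iterations, each covering two
+   16-pixel rows.  frame1 is read with the given stride, frame2 is a
+   contiguous 16-wide block, and acc/cnt are the running accumulator and
+   count arrays updated per pixel. */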
+static void temporal_filter_apply_16size_msa(
+ uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
+ int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
+ uint32_t row;
+ v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
+ v16u8 frame_l, frame_h;
+ v16i8 zero = { 0 };
+ v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+ v8i16 diff0, diff1, cnt0, cnt1;
+ v4i32 const3, const16, filter_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+ v4i32 acc0, acc1, acc2, acc3;
+
+ filter_wt = __msa_fill_w(filter_wt_in);
+ strength = __msa_fill_w(strength_in);
+ const3 = __msa_ldi_w(3);
+ const16 = __msa_ldi_w(16);
+
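+  /* Per pixel: mod = (frame1 - frame2)^2 * 3 is rounding-shifted right by
+     strength (SRAR_W4_SW) and clamped at 16 via the (mod < const16) masks;
+     the weight (16 - mod) * filter_wt is then added to cnt[] and
+     weight * frame2 is added to acc[]. */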
+ for (row = 8; row--;) {
+ frame1_0_b = LD_SB(frame1_ptr);
+ frame2_0_b = LD_SB(frame2_ptr);
+ frame1_ptr += stride;
+ frame2_ptr += 16;
+ frame1_1_b = LD_SB(frame1_ptr);
+ frame2_1_b = LD_SB(frame2_ptr);
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+ ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
+ HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+ diff0_r = (mod0_w < const16);
+ diff0_l = (mod1_w < const16);
+ diff1_r = (mod2_w < const16);
+ diff1_l = (mod3_w < const16);
+ SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+ MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+ filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+ ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
+ UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+ UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+ MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+ acc += 16;
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+ ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
+ HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+ diff0_r = (mod0_w < const16);
+ diff0_l = (mod1_w < const16);
+ diff1_r = (mod2_w < const16);
+ diff1_l = (mod3_w < const16);
+ SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+ MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+ filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
+ UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+ UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+ MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+ acc += 16;
+ frame1_ptr += stride;
+ frame2_ptr += 16;
+ }
+}
+
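+/* Same filter for an 8x8 block: rows are loaded as 64-bit halves and
+   packed two rows per vector, so each of the two loop iterations covers
+   four rows. */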
+static void temporal_filter_apply_8size_msa(
+ uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
+ int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
+ uint32_t row;
+ uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
+ v16i8 frame1 = { 0 };
+ v16i8 frame2 = { 0 };
+ v16i8 frame3 = { 0 };
+ v16i8 frame4 = { 0 };
+ v16u8 frame_l, frame_h;
+ v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+ v8i16 diff0, diff1, cnt0, cnt1;
+ v4i32 const3, const16;
+ v4i32 filter_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+ v4i32 acc0, acc1, acc2, acc3;
+
+ filter_wt = __msa_fill_w(filter_wt_in);
+ strength = __msa_fill_w(strength_in);
+ const3 = __msa_ldi_w(3);
+ const16 = __msa_ldi_w(16);
+
+ for (row = 2; row--;) {
+ LD2(frame1_ptr, stride, f0, f1);
+ frame1_ptr += (2 * stride);
+ LD2(frame2_ptr, 8, f2, f3);
+ frame2_ptr += 16;
+ LD2(frame1_ptr, stride, f4, f5);
+ frame1_ptr += (2 * stride);
+ LD2(frame2_ptr, 8, f6, f7);
+ frame2_ptr += 16;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+ INSERT_D2_SB(f0, f1, frame1);
+ INSERT_D2_SB(f2, f3, frame2);
+ INSERT_D2_SB(f4, f5, frame3);
+ INSERT_D2_SB(f6, f7, frame4);
+ ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
+ HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+ diff0_r = (mod0_w < const16);
+ diff0_l = (mod1_w < const16);
+ diff1_r = (mod2_w < const16);
+ diff1_l = (mod3_w < const16);
+ SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+ MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+ filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
+ UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+ UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+ MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+ acc += 16;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+ ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
+ HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+ diff0_r = (mod0_w < const16);
+ diff0_l = (mod1_w < const16);
+ diff1_r = (mod2_w < const16);
+ diff1_l = (mod3_w < const16);
+ SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+ MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+ filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
+ UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+ UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+ MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+ acc += 16;
+ }
+}
+
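+/* Dispatch to the MSA paths for 8x8 and 16x16 blocks; any other block
+   size falls back to the scalar reference loop below. */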
+void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
+ uint8_t *frame2, uint32_t block_size,
+ int32_t strength, int32_t filter_weight,
+ uint32_t *accumulator, uint16_t *count) {
+ if (8 == block_size) {
+ temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
+ filter_weight, accumulator, count);
+ } else if (16 == block_size) {
+ temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
+ filter_weight, accumulator, count);
+ } else {
+ uint32_t i, j, k;
+ int32_t modifier;
+ int32_t byte = 0;
+ const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ for (i = 0, k = 0; i < block_size; ++i) {
+ for (j = 0; j < block_size; ++j, ++k) {
+ int src_byte = frame1[byte];
+ int pixel_value = *frame2++;
+
+ modifier = src_byte - pixel_value;
+ modifier *= modifier;
+ modifier *= 3;
+ modifier += rounding;
+ modifier >>= strength;
+
+ if (modifier > 16) modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_size;
+ }
+ }
+}