1 files changed, 284 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c b/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000000..fb83f07bd2
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void temporal_filter_apply_16size_msa(
+    uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
+    int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
+  v16u8 frame_l, frame_h;
+  v16i8 zero = { 0 };
+  v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+  v8i16 diff0, diff1, cnt0, cnt1;
+  v4i32 const3, const16, filter_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+  v4i32 acc0, acc1, acc2, acc3;
+  /* 16x16 kernel; mirrors the C loop in vp8_temporal_filter_apply_msa. */
+  filter_wt = __msa_fill_w(filter_wt_in); /* same value in all 4 lanes */
+  strength = __msa_fill_w(strength_in);
+  const3 = __msa_ldi_w(3);
+  const16 = __msa_ldi_w(16);
+  /* Two 16-px rows per pass. NOTE(review): SRAR_* presumably rounds. */
+  for (row = 8; row--;) { /* 8 passes x 2 rows = 16 rows */
+    frame1_0_b = LD_SB(frame1_ptr);
+    frame2_0_b = LD_SB(frame2_ptr);
+    frame1_ptr += stride;
+    frame2_ptr += 16; /* frame2 rows are 16 bytes apart */
+    frame1_1_b = LD_SB(frame1_ptr);
+    frame2_1_b = LD_SB(frame2_ptr);
+    LD_SW2(acc, 4, acc0, acc1); /* 16 accumulators of the first row */
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1); /* 16 counters of the first row */
+    ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
+    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); /* pixel differences */
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l); /* widen to 32-bit lanes */
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w); /* diff^2 */
+    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+         mod1_w, mod2_w, mod3_w); /* 3 * diff^2 */
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    diff0_r = (mod0_w < const16); /* lane masks: modifier < 16 */
+    diff0_l = (mod1_w < const16);
+    diff1_r = (mod2_w < const16);
+    diff1_l = (mod3_w < const16);
+    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w); /* 16 - modifier */
+    mod0_w = diff0_r & mod0_w; /* zero where modifier was >= 16 */
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); /* * filter weight */
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); /* cnt += modifier */
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
+    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); /* frame2 px, 32 bit */
+    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+         mod0_w, mod1_w, mod2_w, mod3_w); /* modifier * frame2 pixel */
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w); /* acc += modifier * pixel */
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+    acc += 16;
+    LD_SW2(acc, 4, acc0, acc1); /* second row of the pair, same steps */
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+    ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
+    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    diff0_r = (mod0_w < const16);
+    diff0_l = (mod1_w < const16);
+    diff1_r = (mod2_w < const16);
+    diff1_l = (mod3_w < const16);
+    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
+    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+    acc += 16;
+    frame1_ptr += stride; /* step past the second row of the pair */
+    frame2_ptr += 16;
+  }
+}
+
+static void temporal_filter_apply_8size_msa(
+    uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
+    int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
+  uint32_t row;
+  uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
+  v16i8 frame1 = { 0 };
+  v16i8 frame2 = { 0 };
+  v16i8 frame3 = { 0 };
+  v16i8 frame4 = { 0 };
+  v16u8 frame_l, frame_h;
+  v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+  v8i16 diff0, diff1, cnt0, cnt1;
+  v4i32 const3, const16;
+  v4i32 filter_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+  v4i32 acc0, acc1, acc2, acc3;
+  /* 8x8 kernel; mirrors the C loop in vp8_temporal_filter_apply_msa. */
+  filter_wt = __msa_fill_w(filter_wt_in); /* same value in all 4 lanes */
+  strength = __msa_fill_w(strength_in);
+  const3 = __msa_ldi_w(3);
+  const16 = __msa_ldi_w(16);
+  /* Four 8-px rows per pass. NOTE(review): SRAR_* presumably rounds. */
+  for (row = 2; row--;) { /* 2 passes x 4 rows = 8 rows */
+    LD2(frame1_ptr, stride, f0, f1); /* frame1 rows 0-1, 8 bytes each */
+    frame1_ptr += (2 * stride);
+    LD2(frame2_ptr, 8, f2, f3); /* frame2 rows are 8 bytes apart */
+    frame2_ptr += 16;
+    LD2(frame1_ptr, stride, f4, f5); /* frame1 rows 2-3 */
+    frame1_ptr += (2 * stride);
+    LD2(frame2_ptr, 8, f6, f7); /* frame2 rows 2-3 */
+    frame2_ptr += 16;
+
+    LD_SW2(acc, 4, acc0, acc1); /* 16 accumulators for rows 0-1 */
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1); /* 16 counters for rows 0-1 */
+    INSERT_D2_SB(f0, f1, frame1); /* frame1: frame1 rows 0-1 */
+    INSERT_D2_SB(f2, f3, frame2); /* frame2: frame2 rows 0-1 */
+    INSERT_D2_SB(f4, f5, frame3); /* frame3: frame1 rows 2-3 */
+    INSERT_D2_SB(f6, f7, frame4); /* frame4: frame2 rows 2-3 */
+    ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
+    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); /* pixel differences */
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l); /* widen to 32-bit lanes */
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w); /* diff^2 */
+    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+         mod1_w, mod2_w, mod3_w); /* 3 * diff^2 */
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    diff0_r = (mod0_w < const16); /* lane masks: modifier < 16 */
+    diff0_l = (mod1_w < const16);
+    diff1_r = (mod2_w < const16);
+    diff1_l = (mod3_w < const16);
+    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w); /* 16 - modifier */
+    mod0_w = diff0_r & mod0_w; /* zero where modifier was >= 16 */
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); /* * filter weight */
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); /* cnt += modifier */
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
+    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); /* frame2 px, 32 bit */
+    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+         mod0_w, mod1_w, mod2_w, mod3_w); /* modifier * frame2 pixel */
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w); /* acc += modifier * pixel */
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+    acc += 16;
+    /* Rows 2-3 of this pass: same steps on frame3 / frame4. */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+    ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
+    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    diff0_r = (mod0_w < const16);
+    diff0_l = (mod1_w < const16);
+    diff1_r = (mod2_w < const16);
+    diff1_l = (mod3_w < const16);
+    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+         filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
+    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+    acc += 16;
+  }
+}
+
+void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
+                                   uint8_t *frame2, uint32_t block_size,
+                                   int32_t strength, int32_t filter_weight,
+                                   uint32_t *accumulator, uint16_t *count) {
+  /* 8x8 and 16x16 blocks use the MSA kernels; other sizes run in plain C. */
+  switch (block_size) {
+    case 8:
+      temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
+                                      filter_weight, accumulator, count);
+      break;
+    case 16:
+      temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
+                                       filter_weight, accumulator, count);
+      break;
+    default: {
+      const int32_t round_bias = strength > 0 ? 1 << (strength - 1) : 0;
+      int32_t src_off = 0; /* offset into frame1; frame2 is read densely */
+      uint32_t row, col, idx = 0;
+
+      for (row = 0; row < block_size; ++row) {
+        for (col = 0; col < block_size; ++col) {
+          const int cur = frame1[src_off++];
+          const int ref = *frame2++;
+          int32_t weight = cur - ref;
+
+          /* 3 * diff^2, plus the rounding term, shifted right by strength. */
+          weight = (weight * weight * 3 + round_bias) >> strength;
+          /* Cap at 16, flip to 16 - x, then scale by the filter weight. */
+          weight = (16 - (weight > 16 ? 16 : weight)) * filter_weight;
+
+          count[idx] += weight;
+          accumulator[idx] += weight * ref;
+          ++idx;
+        }
+
+        /* Skip the rest of the frame1 row (stride minus pixels consumed). */
+        src_off += stride - block_size;
+      }
+      break;
+    }
+  }
+}