1 files changed, 271 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
new file mode 100644
index 0000000000..be9614a358
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
+  int distortion;
+
+  const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+  const int16x8_t a1 =
+      unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
+  const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+  const int16x8_t b1 =
+      unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
+  const int16x8_t d0 = vec_sub(a0, b0);
+  const int16x8_t d1 = vec_sub(a1, b1);
+  const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
+  const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);
+
+  vec_ste(d, 0, &distortion);
+
+  return distortion;
+}
+
+// TODO(lu_zero): Unroll
+uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
+  unsigned int i, sum = 0;
+  int32x4_t s = vec_splat_s32(0);
+
+  for (i = 0; i < 256; i += 8) {
+    const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
+    s = vec_msum(v, v, s);
+  }
+
+  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+  vec_ste((uint32x4_t)s, 0, &sum);
+
+  return sum;
+}
+
+void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+  /* comp_pred and pred must be 16 byte aligned. */
+  assert(((intptr_t)comp_pred & 0xf) == 0);
+  assert(((intptr_t)pred & 0xf) == 0);
+  if (width >= 16) {
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
+        vec_vsx_st(v, j, comp_pred);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    // Process 2 lines at time
+    for (i = 0; i < height / 2; ++i) {
+      const uint8x16_t r0 = vec_vsx_ld(0, ref);
+      const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
+      const uint8x16_t r = xxpermdi(r0, r1, 0);
+      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+      vec_vsx_st(v, 0, comp_pred);
+      comp_pred += 16;  // width * 2;
+      pred += 16;       // width * 2;
+      ref += ref_stride * 2;
+    }
+  } else {
+    assert(width == 4);
+    // process 4 lines at time
+    for (i = 0; i < height / 4; ++i) {
+      const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
+      const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
+      const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
+      const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
+      const uint8x16_t r =
+          (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
+      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+      vec_vsx_st(v, 0, comp_pred);
+      comp_pred += 16;  // width * 4;
+      pred += 16;       // width * 4;
+      ref += ref_stride * 4;
+    }
+  }
+}
+
+static INLINE void variance_inner_32(const uint8_t *src_ptr,
+                                     const uint8_t *ref_ptr,
+                                     int32x4_t *sum_squared, int32x4_t *sum) {
+  int32x4_t s = *sum;
+  int32x4_t ss = *sum_squared;
+
+  const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
+  const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
+  const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
+  const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);
+
+  const int16x8_t a0 = unpack_to_s16_h(va0);
+  const int16x8_t b0 = unpack_to_s16_h(vb0);
+  const int16x8_t a1 = unpack_to_s16_l(va0);
+  const int16x8_t b1 = unpack_to_s16_l(vb0);
+  const int16x8_t a2 = unpack_to_s16_h(va1);
+  const int16x8_t b2 = unpack_to_s16_h(vb1);
+  const int16x8_t a3 = unpack_to_s16_l(va1);
+  const int16x8_t b3 = unpack_to_s16_l(vb1);
+  const int16x8_t d0 = vec_sub(a0, b0);
+  const int16x8_t d1 = vec_sub(a1, b1);
+  const int16x8_t d2 = vec_sub(a2, b2);
+  const int16x8_t d3 = vec_sub(a3, b3);
+
+  s = vec_sum4s(d0, s);
+  ss = vec_msum(d0, d0, ss);
+  s = vec_sum4s(d1, s);
+  ss = vec_msum(d1, d1, ss);
+  s = vec_sum4s(d2, s);
+  ss = vec_msum(d2, d2, ss);
+  s = vec_sum4s(d3, s);
+  ss = vec_msum(d3, d3, ss);
+  *sum = s;
+  *sum_squared = ss;
+}
+
+static INLINE void variance(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *ref_ptr, int ref_stride, int w,
+                            int h, uint32_t *sse, int *sum) {
+  int i;
+
+  int32x4_t s = vec_splat_s32(0);
+  int32x4_t ss = vec_splat_s32(0);
+
+  switch (w) {
+    case 4:
+      for (i = 0; i < h / 2; ++i) {
+        const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+        const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+        const int16x8_t d = vec_sub(a0, b0);
+        s = vec_sum4s(d, s);
+        ss = vec_msum(d, d, ss);
+        src_ptr += src_stride * 2;
+        ref_ptr += ref_stride * 2;
+      }
+      break;
+    case 8:
+      for (i = 0; i < h; ++i) {
+        const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr));
+        const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr));
+        const int16x8_t d = vec_sub(a0, b0);
+
+        s = vec_sum4s(d, s);
+        ss = vec_msum(d, d, ss);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+    case 16:
+      for (i = 0; i < h; ++i) {
+        const uint8x16_t va = vec_vsx_ld(0, src_ptr);
+        const uint8x16_t vb = vec_vsx_ld(0, ref_ptr);
+        const int16x8_t a0 = unpack_to_s16_h(va);
+        const int16x8_t b0 = unpack_to_s16_h(vb);
+        const int16x8_t a1 = unpack_to_s16_l(va);
+        const int16x8_t b1 = unpack_to_s16_l(vb);
+        const int16x8_t d0 = vec_sub(a0, b0);
+        const int16x8_t d1 = vec_sub(a1, b1);
+
+        s = vec_sum4s(d0, s);
+        ss = vec_msum(d0, d0, ss);
+        s = vec_sum4s(d1, s);
+        ss = vec_msum(d1, d1, ss);
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+    case 32:
+      for (i = 0; i < h; ++i) {
+        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+    case 64:
+      for (i = 0; i < h; ++i) {
+        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
+        variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s);
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+  }
+
+  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+  vec_ste(s, 0, sum);
+
+  ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);
+
+  vec_ste((uint32x4_t)ss, 0, sse);
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H)                                                    \
+  void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \
+                                 const uint8_t *ref_ptr, int ref_stride, \
+                                 uint32_t *sse, int *sum) {              \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum);  \
+  }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
+ * variable.
+ */
+#define MSE(W, H)                                                         \
+  uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
+                                  const uint8_t *ref_ptr, int ref_stride, \
+                                  uint32_t *sse) {                        \
+    int sum;                                                              \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);  \
+    return *sse;                                                          \
+  }
+
+#define VAR(W, H)                                                              \
+  uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
+                                       const uint8_t *ref_ptr, int ref_stride, \
+                                       uint32_t *sse) {                        \
+    int sum;                                                                   \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);       \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H)));              \
+  }
+
+#define VARIANCES(W, H) VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)