1 files changed, 190 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..531309b025
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  int i = height;
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        uint16x8_t avg = vrhaddq_u16(p, r);
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      uint16x8_t avg = vrhaddq_u16(p, r);
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      uint16x4_t avg = vrhadd_u16(p, r);
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+}
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  const uint16_t *src0 = invert_mask ? pred : ref;
+  const uint16_t *src1 = invert_mask ? ref : pred;
+  const int src_stride0 = invert_mask ? width : ref_stride;
+  const int src_stride1 = invert_mask ? ref_stride : width;
+
+  if (width >= 8) {
+    do {
+      int j = 0;
+
+      do {
+        const uint16x8_t s0 = vld1q_u16(src0 + j);
+        const uint16x8_t s1 = vld1q_u16(src1 + j);
+        const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+        uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+        vst1q_u16(comp_pred + j, blend_u16);
+
+        j += 8;
+      } while (j < width);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += width;
+    } while (--height != 0);
+  } else {
+    assert(width == 4);
+
+    do {
+      const uint16x4_t s0 = vld1_u16(src0);
+      const uint16x4_t s1 = vld1_u16(src1);
+      const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+      uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+      vst1_u16(comp_pred, blend_u16);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += 4;
+    } while (--height != 0);
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        const uint16x8_t avg =
+            dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      const uint16x8_t avg =
+          dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      const uint16x4_t avg = dist_wtd_avg_u16x4(
+          r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--height != 0);
+  }
+}