summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c190
1 files changed, 190 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..531309b025
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ int i = height;
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t avg = vrhadd_u16(p, r);
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+}
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
+ if (width >= 8) {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src0 + j);
+ const uint16x8_t s1 = vld1q_u16(src1 + j);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(comp_pred + j, blend_u16);
+
+ j += 8;
+ } while (j < width);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+
+ do {
+ const uint16x4_t s0 = vld1_u16(src0);
+ const uint16x4_t s1 = vld1_u16(src1);
+ const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ vst1_u16(comp_pred, blend_u16);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 4;
+ } while (--height != 0);
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ const uint16x4_t avg = dist_wtd_avg_u16x4(
+ r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ }
+}