Diffstat
-rw-r--r-- | third_party/aom/av1/common/arm/highbd_reconintra_neon.c | 241
1 file changed, 241 insertions, 0 deletions
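The patch below adds AArch64 NEON implementations of two high-bitdepth intra-prediction edge helpers. As a reading aid, here is a scalar sketch of what the two routines compute, built only from the kernel taps and shift identities named in the patch's own comments; the function names are hypothetical and this is not libaom's C reference code. The edge filter smooths p[1..sz-1] in place with a kernel whose taps sum to 16 ({4, 8, 4}, {5, 6, 5}, or {2, 4, 4, 4, 2} for strengths 1, 2, and 3 or more), clamping reads at both ends of the array; the upsampler doubles the edge, keeping the original samples and interpolating new ones with taps {-1, 9, 9, -1}, rounded and clamped to [0, 2^bd - 1].

/* Scalar sketch, for illustration only; names are hypothetical. */
#include <stdint.h>
#include <string.h>

#define MAX_UPSAMPLE_SZ 16

static void filter_intra_edge_scalar(uint16_t *p, int sz, int strength) {
  // Strengths 1 and 2 use 3-tap kernels (zero-padded to 5 taps here);
  // anything stronger falls through to the 5-tap kernel, as in the NEON code.
  static const int kernel[3][5] = { { 0, 4, 8, 4, 0 },
                                    { 0, 5, 6, 5, 0 },
                                    { 2, 4, 4, 4, 2 } };
  if (strength == 0) return;
  const int *k = kernel[strength >= 3 ? 2 : strength - 1];
  uint16_t in[160];  // sz <= 129, per the assert in the NEON version.
  memcpy(in, p, sz * sizeof(*p));
  for (int i = 1; i < sz; i++) {  // p[0] is never modified.
    int sum = 0;
    for (int j = 0; j < 5; j++) {
      int idx = i + j - 2;
      idx = idx < 0 ? 0 : (idx >= sz ? sz - 1 : idx);  // Clamped padding.
      sum += k[j] * in[idx];
    }
    // All kernels sum to 16, so round and shift by 4. This matches the
    // shift identities in the patch, e.g.
    // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2.
    p[i] = (uint16_t)((sum + 8) >> 4);
  }
}

static void upsample_intra_edge_scalar(uint16_t *p, int sz, int bd) {
  uint16_t in[MAX_UPSAMPLE_SZ + 3];
  in[0] = p[-1];  // Duplicate the left neighbor and the last sample.
  in[1] = p[-1];
  memcpy(in + 2, p, sz * sizeof(*p));
  in[sz + 2] = p[sz - 1];

  p[-2] = p[-1];
  const int max = (1 << bd) - 1;
  for (int i = 0; i < sz; i++) {
    // New sample between in[i + 1] and in[i + 2]: taps {-1, 9, 9, -1},
    // rounded and clamped to the bit-depth range.
    int s = -in[i] + 9 * in[i + 1] + 9 * in[i + 2] - in[i + 3];
    s = (s + 8) >> 4;
    p[2 * i - 1] = (uint16_t)(s < 0 ? 0 : (s > max ? max : s));
    p[2 * i] = in[i + 2];  // Original sample moves to an even position.
  }
}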
diff --git a/third_party/aom/av1/common/arm/highbd_reconintra_neon.c b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c
new file mode 100644
index 0000000000..170491b504
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+
+#define MAX_UPSAMPLE_SZ 16
+
+void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) {
+  if (!strength) return;
+  assert(sz >= 0 && sz <= 129);
+
+  DECLARE_ALIGNED(16, static const uint16_t,
+                  idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  const uint16x8_t index = vld1q_u16(idx);
+
+  uint16_t edge[160];  // Max value of sz + enough padding for vector accesses.
+  memcpy(edge + 1, p, sz * sizeof(*p));
+
+  // Populate extra space appropriately.
+  edge[0] = edge[1];
+  edge[sz + 1] = edge[sz];
+  edge[sz + 2] = edge[sz];
+
+  // Don't overwrite first pixel.
+  uint16_t *dst = p + 1;
+  sz--;
+
+  if (strength == 1) {  // Filter: {4, 8, 4}.
+    const uint16_t *src = edge + 1;
+
+    while (sz >= 8) {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      // Make use of the identity:
+      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+      uint16x8_t t0 = vaddq_u16(s0, s2);
+      uint16x8_t t1 = vaddq_u16(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+      vst1q_u16(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      // Make use of the identity:
+      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+      uint16x8_t t0 = vaddq_u16(s0, s2);
+      uint16x8_t t1 = vaddq_u16(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+      // Mask off out-of-bounds indices.
+      uint16x8_t current_dst = vld1q_u16(dst);
+      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+      res = vbslq_u16(mask, res, current_dst);
+
+      vst1q_u16(dst, res);
+    }
+  } else if (strength == 2) {  // Filter: {5, 6, 5}.
+    const uint16_t *src = edge + 1;
+
+    const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6),
+                                    vdupq_n_u16(5) } };
+    while (sz >= 8) {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+      accum = vmlaq_u16(accum, s1, filter.val[1]);
+      accum = vmlaq_u16(accum, s2, filter.val[2]);
+      uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+      vst1q_u16(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+
+      uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+      accum = vmlaq_u16(accum, s1, filter.val[1]);
+      accum = vmlaq_u16(accum, s2, filter.val[2]);
+      uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+      // Mask off out-of-bounds indices.
+      uint16x8_t current_dst = vld1q_u16(dst);
+      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+      res = vbslq_u16(mask, res, current_dst);
+
+      vst1q_u16(dst, res);
+    }
+  } else {  // Filter {2, 4, 4, 4, 2}.
+    const uint16_t *src = edge;
+
+    while (sz >= 8) {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+      uint16x8_t s4 = vld1q_u16(src + 4);
+
+      // Make use of the identity:
+      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+      uint16x8_t t0 = vaddq_u16(s0, s4);
+      uint16x8_t t1 = vaddq_u16(s1, s2);
+      t1 = vaddq_u16(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+      vst1q_u16(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+      uint16x8_t s4 = vld1q_u16(src + 4);
+
+      // Make use of the identity:
+      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+      uint16x8_t t0 = vaddq_u16(s0, s4);
+      uint16x8_t t1 = vaddq_u16(s1, s2);
+      t1 = vaddq_u16(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+      // Mask off out-of-bounds indices.
+      uint16x8_t current_dst = vld1q_u16(dst);
+      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+      res = vbslq_u16(mask, res, current_dst);
+
+      vst1q_u16(dst, res);
+    }
+  }
+}
+
+void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) {
+  if (!sz) return;
+
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  uint16_t edge[MAX_UPSAMPLE_SZ + 3];
+  const uint16_t *src = edge;
+
+  // Copy p[-1..(sz-1)] and pad out both ends.
+  edge[0] = p[-1];
+  edge[1] = p[-1];
+  memcpy(edge + 2, p, sz * 2);
+  edge[sz + 2] = p[sz - 1];
+  p[-2] = p[-1];
+
+  uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1);
+
+  uint16_t *dst = p - 1;
+
+  if (bd == 12) {
+    do {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+
+      uint16x8_t t0 = vaddq_u16(s1, s2);
+      uint16x8_t t1 = vaddq_u16(s0, s3);
+      uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9);
+      acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1)));
+      uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9);
+      acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1)));
+
+      uint16x8x2_t res;
+      res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4));
+      // Clamp pixel values at bitdepth maximum.
+      res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+      res.val[1] = s2;
+
+      vst2q_u16(dst, res);
+
+      src += 8;
+      dst += 16;
+      sz -= 8;
+    } while (sz > 0);
+  } else {  // Bit depth is 8 or 10.
+    do {
+      uint16x8_t s0 = vld1q_u16(src);
+      uint16x8_t s1 = vld1q_u16(src + 1);
+      uint16x8_t s2 = vld1q_u16(src + 2);
+      uint16x8_t s3 = vld1q_u16(src + 3);
+
+      uint16x8_t t0 = vaddq_u16(s0, s3);
+      uint16x8_t t1 = vaddq_u16(s1, s2);
+      t1 = vmulq_n_u16(t1, 9);
+      t1 = vqsubq_u16(t1, t0);
+
+      uint16x8x2_t res;
+      res.val[0] = vrshrq_n_u16(t1, 4);
+      // Clamp pixel values at bitdepth maximum.
+      res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+      res.val[1] = s2;
+
+      vst2q_u16(dst, res);
+
+      src += 8;
+      dst += 16;
+      sz -= 8;
+    } while (sz > 0);
+  }
+}
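Finally, a minimal usage sketch (hypothetical; assumes an AArch64 build linked against the file above). The main caller-side subtlety is headroom: the upsample routine reads p[-1] and writes starting at p[-2], and both routines store full 8- or 16-lane vectors, so the buffer must extend past the nominal output range.

#include <stdint.h>
#include <stdio.h>

void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength);
void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd);

int main(void) {
  // Two elements of headroom for p[-2]/p[-1], plus slack for the full-width
  // vector stores and the 2x upsampled output.
  uint16_t buf[2 + 40] = { 0 };
  uint16_t *p = buf + 2;
  p[-1] = 512;  // Left neighbor read by the upsampler.
  for (int i = 0; i < 16; i++) p[i] = (uint16_t)(512 + 17 * i);

  av1_highbd_filter_intra_edge_neon(p, 16, 2);    // {5, 6, 5} smoothing.
  av1_highbd_upsample_intra_edge_neon(p, 8, 10);  // sz <= MAX_UPSAMPLE_SZ.

  // After upsampling, p[-2..14] holds the doubled edge.
  for (int i = -2; i <= 14; i++) printf("%d ", p[i]);
  printf("\n");
  return 0;
}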