3 files changed, 129 insertions, 23 deletions
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
index 63aad0b785..52803a9838 100644
--- a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
@@ -14,7 +14,7 @@
 #include "config/aom_config.h"
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 
 int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
index 5a52e701a2..919521fec7 100644
--- a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -23,7 +23,15 @@
 #define SSE_STRIDE (BW + 4)
 
 // clang-format off
+// Table used to pad the first and last columns and apply the sliding window.
+DECLARE_ALIGNED(16, static const uint8_t, kLoadPad[4][16]) = {
+  {   2,   2, 2, 3, 4, 255, 255, 255, 255,   2,   2, 3, 4, 5, 255, 255 },
+  { 255, 255, 2, 3, 4,   5,   6, 255, 255, 255, 255, 3, 4, 5,   6,   7 },
+  {   0,   1, 2, 3, 4, 255, 255, 255, 255,   1,   2, 3, 4, 5, 255, 255 },
+  { 255, 255, 2, 3, 4,   5,   5, 255, 255, 255, 255, 3, 4, 5,   5,   5 }
+};
 
+// For columns that don't need to be padded it's just a simple mask.
 DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
   0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
@@ -56,22 +64,6 @@ static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
   } while (++i < block_height);
 }
 
-static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
-                                      const uint32_t block_width) {
-  uint8x8_t s = vld1_u8(src);
-
-  if (col == 0) {
-    const uint8_t lane2 = vget_lane_u8(s, 2);
-    s = vset_lane_u8(lane2, s, 0);
-    s = vset_lane_u8(lane2, s, 1);
-  } else if (col >= block_width - 4) {
-    const uint8_t lane5 = vget_lane_u8(s, 5);
-    s = vset_lane_u8(lane5, s, 6);
-    s = vset_lane_u8(lane5, s, 7);
-  }
-  return vcombine_u8(s, s);
-}
-
 static void apply_temporal_filter(
     const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
     const uint32_t block_height, const int *subblock_mses,
@@ -84,6 +76,10 @@ static void apply_temporal_filter(
 
   uint32_t acc_5x5_neon[BH][BW];
   const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+  const uint8x16_t pad_tbl0 = vld1q_u8(kLoadPad[0]);
+  const uint8x16_t pad_tbl1 = vld1q_u8(kLoadPad[1]);
+  const uint8x16_t pad_tbl2 = vld1q_u8(kLoadPad[2]);
+  const uint8x16_t pad_tbl3 = vld1q_u8(kLoadPad[3]);
 
   // Traverse 4 columns at a time - first and last two columns need padding.
   for (uint32_t col = 0; col < block_width; col += 4) {
@@ -92,9 +88,18 @@ static void apply_temporal_filter(
 
     // Load, pad (for first and last two columns) and mask 3 rows from the top.
     for (int i = 2; i < 5; i++) {
-      const uint8x16_t s = load_and_pad(src, col, block_width);
-      vsrc[i][0] = vandq_u8(s, vmask.val[0]);
-      vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+      uint8x8_t s = vld1_u8(src);
+      uint8x16_t s_dup = vcombine_u8(s, s);
+      if (col == 0) {
+        vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl0);
+        vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl1);
+      } else if (col >= block_width - 4) {
+        vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl2);
+        vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl3);
+      } else {
+        vsrc[i][0] = vandq_u8(s_dup, vmask.val[0]);
+        vsrc[i][1] = vandq_u8(s_dup, vmask.val[1]);
+      }
       src += SSE_STRIDE;
     }
 
@@ -130,9 +135,18 @@ static void apply_temporal_filter(
 
       if (row <= block_height - 4) {
         // Load next row into the bottom of the sliding window.
-        uint8x16_t s = load_and_pad(src, col, block_width);
-        vsrc[4][0] = vandq_u8(s, vmask.val[0]);
-        vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+        uint8x8_t s = vld1_u8(src);
+        uint8x16_t s_dup = vcombine_u8(s, s);
+        if (col == 0) {
+          vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl0);
+          vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl1);
+        } else if (col >= block_width - 4) {
+          vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl2);
+          vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl3);
+        } else {
+          vsrc[4][0] = vandq_u8(s_dup, vmask.val[0]);
+          vsrc[4][1] = vandq_u8(s_dup, vmask.val[1]);
+        }
         src += SSE_STRIDE;
       } else {
         // Pad the bottom 2 rows.
diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c
new file mode 100644
index 0000000000..521601a3f3
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+uint64_t av1_wedge_sse_from_residuals_sve(const int16_t *r1, const int16_t *d,
+                                          const uint8_t *m, int N) {
+  assert(N % 64 == 0);
+
+  // Predicate pattern with first 8 elements true.
+  const svbool_t pattern = svptrue_pat_b16(SV_VL8);
+  int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  int i = 0;
+  do {
+    int32x4_t sum[4];
+    int16x8_t sum_s16[2];
+
+    const int16x8_t r1_l = vld1q_s16(r1 + i);
+    const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+    const int16x8_t d_l = vld1q_s16(d + i);
+    const int16x8_t d_h = vld1q_s16(d + i + 8);
+
+    // Use a zero-extending load to widen the vector elements.
+    const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m + i));
+    const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + i + 8));
+
+    sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
+    sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+    sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+    sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+    sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
+    sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+    sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+    sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+    sum_s16[0] = vcombine_s16(vqmovn_s32(sum[0]), vqmovn_s32(sum[1]));
+    sum_s16[1] = vcombine_s16(vqmovn_s32(sum[2]), vqmovn_s32(sum[3]));
+
+    sse[0] = aom_sdotq_s16(sse[0], sum_s16[0], sum_s16[0]);
+    sse[1] = aom_sdotq_s16(sse[1], sum_s16[1], sum_s16[1]);
+
+    i += 16;
+  } while (i < N);
+
+  const uint64_t csse =
+      (uint64_t)horizontal_add_s64x2(vaddq_s64(sse[0], sse[1]));
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+int8_t av1_wedge_sign_from_residuals_sve(const int16_t *ds, const uint8_t *m,
+                                         int N, int64_t limit) {
+  assert(N % 16 == 0);
+
+  // Predicate pattern with first 8 elements true.
+  svbool_t pattern = svptrue_pat_b16(SV_VL8);
+  int64x2_t acc_l = vdupq_n_s64(0);
+  int64x2_t acc_h = vdupq_n_s64(0);
+
+  do {
+    const int16x8_t ds_l = vld1q_s16(ds);
+    const int16x8_t ds_h = vld1q_s16(ds + 8);
+
+    // Use a zero-extending load to widen the vector elements.
+    const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m));
+    const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + 8));
+
+    acc_l = aom_sdotq_s16(acc_l, ds_l, m_l);
+    acc_h = aom_sdotq_s16(acc_h, ds_h, m_h);
+
+    ds += 16;
+    m += 16;
+    N -= 16;
+  } while (N != 0);
+
+  const int64x2_t sum = vaddq_s64(acc_l, acc_h);
+  return horizontal_add_s64x2(sum) > limit;
+}