3 files changed, 431 insertions, 57 deletions
diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c b/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c
new file mode 100644
index 0000000000..7a14f21846
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <arm_neon_sve_bridge.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/scale.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+#include "highbd_warp_plane_neon.h"
+
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd,
+                                                        int sx, int alpha) {
+  int16x8_t f[4];  // f[N] is applied to window N below
+  load_filters_4(f, sx, alpha);
+  // Horizontal pass, 4 outputs: rvN = 8 samples of in starting at element N.
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  // Window N x f[N]; assumes aom_sdotq_s16 is a 16-bit dot product (confirm).
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]);
+  // Pairwise add: each 64-bit lane of m01/m23 sums both lanes of one mN.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  // bd == 12 uses a rounding shift two bits larger than ROUND0_BITS.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add 1 << offset_bits_horiz, rounding >> by round0.
+  int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));
+  res = vrshlq_s32(res, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));  // lanes 4-7 are zero
+}
+
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd,
+                                                        int sx, int alpha) {
+  int16x8_t f[8];  // f[N] is applied to window N below
+  load_filters_8(f, sx, alpha);
+  // Horizontal pass, 8 outputs: rvN = 8 samples of in starting at element N.
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 4);
+  int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 5);
+  int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 6);
+  int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 7);
+  // Window N x f[N]; assumes aom_sdotq_s16 is a 16-bit dot product (confirm).
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f[4]);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f[5]);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f[6]);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f[7]);
+  // Pairwise add: each 64-bit lane of m01..m67 sums both lanes of one mN.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+  // bd == 12 uses a rounding shift two bits larger than ROUND0_BITS.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add 1 << offset_bits_horiz, rounding >> by round0.
+  int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+  res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));
+  res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+  res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));
+  res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd,
+                                                        int sx) {
+  int16x8_t f = load_filters_1(sx);  // single filter shared by all windows
+  // Horizontal pass, 4 outputs: rvN = 8 samples of in starting at element N.
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  // Every window uses f; aom_sdotq_s16 assumed to be a dot product (confirm).
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f);
+  // Pairwise add: each 64-bit lane of m01/m23 sums both lanes of one mN.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  // bd == 12 uses a rounding shift two bits larger than ROUND0_BITS.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add 1 << offset_bits_horiz, rounding >> by round0.
+  int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));
+  res = vrshlq_s32(res, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));  // lanes 4-7 are zero
+}
+
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd,
+                                                        int sx) {
+  int16x8_t f = load_filters_1(sx);  // single filter shared by all windows
+  // Horizontal pass, 8 outputs: rvN = 8 samples of in starting at element N.
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 4);
+  int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 5);
+  int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 6);
+  int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 7);
+  // Every window uses f; aom_sdotq_s16 assumed to be a dot product (confirm).
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f);
+  // Pairwise add: each 64-bit lane of m01..m67 sums both lanes of one mN.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+  // bd == 12 uses a rounding shift two bits larger than ROUND0_BITS.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add 1 << offset_bits_horiz, rounding >> by round0.
+  int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+  res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));
+  res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+  res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));
+  res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+
+static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) {
+  const int16x8_t f = load_filters_1(sy);
+  const int16x4_t f0123 = vget_low_s16(f);
+  const int16x4_t f4567 = vget_high_s16(f);
+  // Vertical pass, 4 lanes: sum over k of low(tmp[k]) * lane k of f, k = 0..7.
+  // No benefit to using SDOT here, the cost of rearrangement is too high.
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+  return m0123;  // 32-bit sums, no rounding or narrowing applied here
+}
+
+static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) {
+  const int16x8_t f = load_filters_1(sy);
+  const int16x4_t f0123 = vget_low_s16(f);
+  const int16x4_t f4567 = vget_high_s16(f);
+  // Vertical pass, 8 lanes: sum over k of tmp[k] * lane k of f, for k = 0..7.
+  // No benefit to using SDOT here, the cost of rearrangement is too high.
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+  // Same accumulation for the high four lanes of each tmp vector.
+  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3);
+  return (int32x4x2_t){ { m0123, m4567 } };  // lanes 0-3, then lanes 4-7
+}
+
+static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy,
+                                               int gamma) {
+  int16x8_t s0, s1, s2, s3;  // vertical pass, 4 outputs: one sN per output
+  transpose_elems_s16_4x8(
+      vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]),
+      vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]),
+      vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3);
+  // sN presumably holds lane N of tmp[0..7] after the transpose - confirm.
+  int16x8_t f[4];
+  load_filters_4(f, sy, gamma);
+  // sN x f[N]; assumes aom_sdotq_s16 is a 16-bit dot product (confirm).
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+  // Pairwise add, then narrow each 64-bit sum to 32 bits (no rounding here).
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  return vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+}
+
+static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy,
+                                                 int gamma) {
+  int16x8_t s0 = tmp[0];  // local copies: the transpose below works in place
+  int16x8_t s1 = tmp[1];
+  int16x8_t s2 = tmp[2];
+  int16x8_t s3 = tmp[3];
+  int16x8_t s4 = tmp[4];
+  int16x8_t s5 = tmp[5];
+  int16x8_t s6 = tmp[6];
+  int16x8_t s7 = tmp[7];
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+  // After the in-place 8x8 transpose, sN presumably holds lane N of tmp[0..7].
+  int16x8_t f[8];
+  load_filters_8(f, sy, gamma);
+  // sN x f[N]; assumes aom_sdotq_s16 is a 16-bit dot product (confirm).
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]);
+  // Pairwise add: each 64-bit lane of m01..m67 sums both lanes of one mN.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+  // Narrow to 32 bits: val[0] = outputs 0-3, val[1] = outputs 4-7.
+  int32x4x2_t ret;
+  ret.val[0] = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  ret.val[1] = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+  return ret;
+}
+
+void av1_highbd_warp_affine_sve(const int32_t *mat, const uint16_t *ref,
+                                int width, int height, int stride,
+                                uint16_t *pred, int p_col, int p_row,
+                                int p_width, int p_height, int p_stride,
+                                int subsampling_x, int subsampling_y, int bd,
+                                ConvolveParams *conv_params, int16_t alpha,
+                                int16_t beta, int16_t gamma, int16_t delta) {
+  highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+                            p_width, p_height, p_stride, subsampling_x,
+                            subsampling_y, bd, conv_params, alpha, beta, gamma,
+                            delta);  // forwards every argument unchanged
+}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index c5fe389ba1..ef999fbba2 100644
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -541,7 +541,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
 # WARPED_MOTION / GLOBAL_MOTION functions
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-  specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/;
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
 }
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
index 20a1e12476..f68af18cb1 100644
--- a/third_party/aom/av1/common/reconintra.c
+++ b/third_party/aom/av1/common/reconintra.c
@@ -1368,7 +1368,7 @@ void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) {
   }
 }
 
-static void highbd_build_intra_predictors(
+static void highbd_build_directional_and_filter_intra_predictors(
     const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
     PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
     TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
@@ -1376,7 +1376,7 @@ static void highbd_build_intra_predictors(
     int bit_depth) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
   DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   uint16_t *const above_row = above_data + 16;
@@ -1390,7 +1390,8 @@ static void highbd_build_intra_predictors(
   const uint16_t *left_ref = ref - 1;
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
-  int base = 128 << (bit_depth - 8);
+  assert(use_filter_intra || is_dr_mode);
+  const int base = 128 << (bit_depth - 8);
   // The left_data, above_data buffers must be zeroed to fix some intermittent
   // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
   // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
@@ -1492,49 +1493,124 @@ static void highbd_build_intra_predictors(
     return;
   }
 
-  if (is_dr_mode) {
-    int upsample_above = 0;
-    int upsample_left = 0;
-    if (!disable_edge_filter) {
-      const int need_right = p_angle < 90;
-      const int need_bottom = p_angle > 180;
-      if (p_angle != 90 && p_angle != 180) {
-        const int ab_le = need_above_left ? 1 : 0;
-        if (need_above && need_left && (txwpx + txhpx >= 24)) {
-          highbd_filter_intra_edge_corner(above_row, left_col);
-        }
-        if (need_above && n_top_px > 0) {
-          const int strength = intra_edge_filter_strength(
-              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
-          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
-          av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
-        }
-        if (need_left && n_left_px > 0) {
-          const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
-          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
-          av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
-        }
+  assert(is_dr_mode);
+  int upsample_above = 0;
+  int upsample_left = 0;
+  if (!disable_edge_filter) {
+    const int need_right = p_angle < 90;
+    const int need_bottom = p_angle > 180;
+    if (p_angle != 90 && p_angle != 180) {
+      const int ab_le = need_above_left ? 1 : 0;
+      if (need_above && need_left && (txwpx + txhpx >= 24)) {
+        highbd_filter_intra_edge_corner(above_row, left_col);
       }
-      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
-                                                   intra_edge_filter_type);
-      if (need_above && upsample_above) {
-        const int n_px = txwpx + (need_right ? txhpx : 0);
-        av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
+      if (need_above && n_top_px > 0) {
+        const int strength = intra_edge_filter_strength(
+            txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
+        const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+        av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
       }
-      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
-                                                  intra_edge_filter_type);
-      if (need_left && upsample_left) {
-        const int n_px = txhpx + (need_bottom ? txwpx : 0);
-        av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
+      if (need_left && n_left_px > 0) {
+        const int strength = intra_edge_filter_strength(
+            txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
+        const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+        av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
       }
     }
-    highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                        upsample_above, upsample_left, p_angle, bit_depth);
+    upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                 intra_edge_filter_type);
+    if (need_above && upsample_above) {
+      const int n_px = txwpx + (need_right ? txhpx : 0);
+      av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
+    }
+    upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                intra_edge_filter_type);
+    if (need_left && upsample_left) {
+      const int n_px = txhpx + (need_bottom ? txwpx : 0);
+      av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
+    }
+  }
+  highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                      upsample_above, upsample_left, p_angle, bit_depth);
+}
+
+// For HBD encode/decode, this function generates the pred data of a given
+// block for non-directional intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H,
+// SMOOTH_V and PAETH).
+static void highbd_build_non_directional_intra_predictors(
+    const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+    PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px,
+    int bit_depth) {
+  int i = 0;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
+  const int txwpx = tx_size_wide[tx_size];
+  const int txhpx = tx_size_high[tx_size];
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
+  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+  const uint16_t *above_ref = ref - ref_stride;
+  const uint16_t *left_ref = ref - 1;
+  const int base = 128 << (bit_depth - 8);
+
+  assert(n_top_px >= 0);
+  assert(n_left_px >= 0);
+  assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
+         mode == SMOOTH_H_PRED || mode == PAETH_PRED);
+
+  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+    int val = 0;
+    if (need_left) {
+      val = (n_top_px > 0) ? above_ref[0] : base + 1;
+    } else {
+      val = (n_left_px > 0) ? left_ref[0] : base - 1;
+    }
+    for (i = 0; i < txhpx; ++i) {
+      aom_memset16(dst, val, txwpx);
+      dst += dst_stride;
+    }
     return;
   }
 
-  // predict
+  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  uint16_t *const above_row = above_data + 16;
+  uint16_t *const left_col = left_data + 16;
+
+  if (need_left) {
+    aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+    if (n_left_px > 0) {
+      for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+      if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i);
+    } else if (n_top_px > 0) {
+      aom_memset16(left_col, above_ref[0], txhpx);
+    }
+  }
+
+  if (need_above) {
+    aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+      i = n_top_px;
+      if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i));
+    } else if (n_left_px > 0) {
+      aom_memset16(above_row, left_ref[0], txwpx);
+    }
+  }
+
+  if (need_above_left) {
+    if (n_top_px > 0 && n_left_px > 0) {
+      above_row[-1] = above_ref[-1];
+    } else if (n_top_px > 0) {
+      above_row[-1] = above_ref[0];
+    } else if (n_left_px > 0) {
+      above_row[-1] = left_ref[0];
+    } else {
+      above_row[-1] = base;
+    }
+    left_col[-1] = above_row[-1];
+  }
+
   if (mode == DC_PRED) {
     dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
         dst, dst_stride, above_row, left_col, bit_depth);
@@ -1660,12 +1736,19 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
   // separate function build_non_directional_intra_predictors() is introduced
   // for these modes to avoid redundant computations while generating pred data.
 
-  // TODO(aomedia:3532): Enable this refactoring for high bd path as well.
-  if (!is_hbd && !use_filter_intra && !is_dr_mode) {
-    build_non_directional_intra_predictors(
-        ref, ref_stride, dst, dst_stride, mode, tx_size,
-        have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-        have_left ? AOMMIN(txhpx, yd + txhpx) : 0);
+  const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0;
+  const int n_left_px = have_left ? AOMMIN(txhpx, yd + txhpx) : 0;
+  if (!use_filter_intra && !is_dr_mode) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_hbd) {
+      highbd_build_non_directional_intra_predictors(
+          ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px,
+          xd->bd);
+      return;
+    }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride,
+                                           mode, tx_size, n_top_px, n_left_px);
     return;
   }
 
@@ -1717,25 +1800,23 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
 
   const int disable_edge_filter = !enable_intra_edge_filter;
   const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
+  const int n_topright_px =
+      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right;
+  const int n_bottomleft_px =
+      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left;
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_hbd) {
-    highbd_build_intra_predictors(
+    highbd_build_directional_and_filter_intra_predictors(
         ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
-        tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-        have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
-        have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-        have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
-        intra_edge_filter_type, xd->bd);
+        tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
+        n_bottomleft_px, intra_edge_filter_type, xd->bd);
     return;
   }
 #endif
   build_directional_and_filter_intra_predictors(
       ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
-      tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
-      have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
-      intra_edge_filter_type);
+      tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
+      n_bottomleft_px, intra_edge_filter_type);
 }
 
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,