From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Fri, 19 Apr 2024 03:14:29 +0200
Subject: Merging upstream version 125.0.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 third_party/aom/AUTHORS                            |   1 +
 third_party/aom/CHANGELOG                          |  36 +++
 third_party/aom/CMakeLists.txt                     |   2 +-
 third_party/aom/aom/src/aom_codec.c                |   1 +
 third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl       |  21 +-
 third_party/aom/aom_dsp/arm/highbd_variance_sve.c  |   9 -
 third_party/aom/aom_dsp/arm/intrapred_neon.c       |   1 +
 .../aom/aom_dsp/flow_estimation/arm/disflow_neon.c |   2 +-
 .../aom/aom_dsp/flow_estimation/corner_match.c     |   2 +-
 third_party/aom/aom_dsp/flow_estimation/disflow.c  |   4 +-
 .../aom/aom_dsp/flow_estimation/x86/disflow_sse4.c |   2 +-
 third_party/aom/av1/av1.cmake                      |   1 +
 .../aom/av1/common/arm/highbd_warp_plane_sve.c     | 293 +++++++++++++++++++++
 third_party/aom/av1/common/av1_rtcd_defs.pl        |   2 +-
 third_party/aom/av1/common/reconintra.c            | 193 ++++++++++----
 third_party/aom/av1/encoder/encoder.c              |  13 +-
 third_party/aom/av1/encoder/encoder.h              |   4 +-
 third_party/aom/av1/encoder/mcomp.c                |   1 -
 third_party/aom/av1/encoder/speed_features.c       |   8 +
 third_party/aom/common/tools_common.c              |  20 +-
 third_party/aom/test/av1_c_vs_simd_encode.sh       |  38 +--
 third_party/aom/test/dr_prediction_test.cc         |  68 +++--
 third_party/aom/test/encode_api_test.cc            |  48 +++-
 third_party/aom/test/resize_test.cc                |  69 ++++-
 third_party/aom/test/variance_test.cc              |   6 +-
 third_party/aom/test/warp_filter_test.cc           |   6 +
 26 files changed, 694 insertions(+), 157 deletions(-)
 create mode 100644 third_party/aom/av1/common/arm/highbd_warp_plane_sve.c

(limited to 'third_party/aom')

diff --git a/third_party/aom/AUTHORS b/third_party/aom/AUTHORS
index ade7a1a5d0..509c0d1c9d 100644
--- a/third_party/aom/AUTHORS
+++ b/third_party/aom/AUTHORS
@@ -235,6 +235,7 @@ Ronald S. Bultje <rsbultje@gmail.com>
 Rostislav Pehlivanov <rpehlivanov@mozilla.com>
 Ruiling Song <ruiling.song@intel.com>
 Rui Ueyama <ruiu@google.com>
+Ruoyu Zhong <zhongruoyu@outlook.com>
 Rupert Swarbrick <rupert.swarbrick@argondesign.com>
 Ryan Lei <ryanlei@fb.com>
 Ryan Overbeck <rover@google.com>
diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG
index b243837d3c..b5c1afbba2 100644
--- a/third_party/aom/CHANGELOG
+++ b/third_party/aom/CHANGELOG
@@ -1,3 +1,39 @@
+2024-01-17 v3.8.1
+  This release includes several bug fixes. This release is ABI
+  compatible with the last release. See
+  https://aomedia.googlesource.com/aom/+log/v3.8.0..v3.8.1 for all the
+  commits in this release.
+
+  - Bug Fixes
+    * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1'
+      failed.
+    * aomedia:3526: alloc_compressor_data() is called during every
+      aom_codec_control() call on the encoder.
+    * aomedia:3527: aom/av1/encoder/mcomp.c:1810: av1_full_pixel_search:
+      Assertion `ms_params->ms_buffers.ref->width ==
+      ms_params->ms_buffers.src->width' failed.
+    * aomedia:3534: libaom encoder crashed by AOM_USAGE_ALL_INTRA and
+      AOM_EFLAG_NO_REF_LAST flags.
+    * b/310455204: Recreate workers if necessary.
+    * b/310548198: Update frame size in actual encoding.
+    * b/314858909: Do not use adaptive error estimate.
+    * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later.
+
+2024-01-18 v3.7.2
+  This release includes three bug fixes. This release is ABI compatible
+  with the last release. See
+  https://aomedia.googlesource.com/aom/+log/v3.7.1..v3.7.2 for all the
+  commits in this release.
+
+  - Bug Fixes
+    * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1'
+      failed.
+    * aomedia:3526: alloc_compressor_data() is called during every
+      aom_codec_control() call on the encoder. Note that this partially
+      reverts the fix for bug aomedia:3349.
+    * b/310457427 and b/310766628: Only use rec_sse in CBR mode.
+    * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later.
+
 2023-11-30 v3.8.0
   This release includes new codec interfaces, compression efficiency and
   perceptual improvements, speedup and memory optimizations and many bug
diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
index 76944e6917..a02b220bdb 100644
--- a/third_party/aom/CMakeLists.txt
+++ b/third_party/aom/CMakeLists.txt
@@ -59,7 +59,7 @@ endif()
 #
 # We set SO_FILE_VERSION = [c-a].a.r
 set(LT_CURRENT 11)
-set(LT_REVISION 0)
+set(LT_REVISION 1)
 set(LT_AGE 8)
 math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
 set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c
index 512fd28196..316cc6fd23 100644
--- a/third_party/aom/aom/src/aom_codec.c
+++ b/third_party/aom/aom/src/aom_codec.c
@@ -170,6 +170,7 @@ void aom_internal_error(struct aom_internal_error_info *info,
 void aom_internal_error_copy(struct aom_internal_error_info *info,
                              const struct aom_internal_error_info *src) {
   assert(info != src);
+  assert(!src->setjmp);
 
   if (!src->has_detail) {
     aom_internal_error(info, src->error_code, NULL);
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
index 4b49605e53..7bb156ac59 100755
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1352,16 +1352,19 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
       add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
       add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-      specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/;
-      specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/;
-      specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/;
-      specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/;
-    }
+      if ($bd eq 8) {
+        specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon neon_dotprod/;
+        specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/;
+        specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/;
+        specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/;
+      } else {
+        specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/;
+        specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/;
+        specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/;
+        specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/;
+      }
 
-    specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/;
-    specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
-    specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
-    specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
+    }
   }
 
   #
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
index d0058bfa90..a2c30a1688 100644
--- a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
@@ -348,15 +348,6 @@ static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
 }
 
 #define HIGHBD_MSE_WXH_SVE(w, h)                                      \
-  uint32_t aom_highbd_8_mse##w##x##h##_sve(                           \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
-      int ref_stride, uint32_t *sse) {                                \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
-    highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse);  \
-    return *sse;                                                      \
-  }                                                                   \
-                                                                      \
   uint32_t aom_highbd_10_mse##w##x##h##_sve(                          \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
       int ref_stride, uint32_t *sse) {                                \
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
index d8dc60c1fe..c3716b3a78 100644
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -11,6 +11,7 @@
 
 #include <arm_neon.h>
 #include <assert.h>
+#include <stdint.h>
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
index ee42be7393..62729133e3 100644
--- a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
+++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -22,7 +22,7 @@
 static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
   // Check that the fractional position is in range.
   //
-  // Note: x is calculated from (eg.) `u_frac = u - floor(u)`.
+  // Note: x is calculated from, e.g., `u_frac = u - floor(u)`.
   // Mathematically, this implies that 0 <= x < 1. However, in practice it is
   // possible to have x == 1 due to floating point rounding. This is fine,
   // and we still interpolate correctly if we allow x = 1.
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c
index cef719b68d..dc7589a8c6 100644
--- a/third_party/aom/aom_dsp/flow_estimation/corner_match.c
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c
@@ -224,7 +224,7 @@ bool av1_compute_global_motion_feature_match(
     *mem_alloc_failed = true;
     return false;
   }
-  if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+  if (!av1_compute_corner_list(ref_pyramid, ref_corners)) {
     *mem_alloc_failed = true;
     return false;
   }
diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c
index 147a8ab3b3..82b531c729 100644
--- a/third_party/aom/aom_dsp/flow_estimation/disflow.c
+++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c
@@ -25,7 +25,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 // Amount to downsample the flow field by.
-// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate
+// e.g., DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate
 // one flow point for each 4x4 pixel region of the frame
 // Must be a power of 2
 #define DOWNSAMPLE_SHIFT 3
@@ -66,7 +66,7 @@ static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = {
 static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
   // Check that the fractional position is in range.
   //
-  // Note: x is calculated from (eg.) `u_frac = u - floor(u)`.
+  // Note: x is calculated from, e.g., `u_frac = u - floor(u)`.
   // Mathematically, this implies that 0 <= x < 1. However, in practice it is
   // possible to have x == 1 due to floating point rounding. This is fine,
   // and we still interpolate correctly if we allow x = 1.
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
index d2b04c1973..2c5effd638 100644
--- a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
@@ -30,7 +30,7 @@
 static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
   // Check that the fractional position is in range.
   //
-  // Note: x is calculated from (eg.) `u_frac = u - floor(u)`.
+  // Note: x is calculated from, e.g., `u_frac = u - floor(u)`.
   // Mathematically, this implies that 0 <= x < 1. However, in practice it is
   // possible to have x == 1 due to floating point rounding. This is fine,
   // and we still interpolate correctly if we allow x = 1.
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 15577d0c0e..c66a748d40 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -406,6 +406,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM
             "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c")
 
 list(APPEND AOM_AV1_COMMON_INTRIN_SVE
+            "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c"
             "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c b/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c
new file mode 100644
index 0000000000..7a14f21846
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <arm_neon_sve_bridge.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/scale.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+#include "highbd_warp_plane_neon.h"
+// Horizontal pass, 4 outputs: output i = window at lane i dotted with f[i].
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd,
+                                                        int sx, int alpha) {
+  int16x8_t f[4];
+  load_filters_4(f, sx, alpha);
+  // rvN: the 8 samples starting at lane N of the pair in.val[0]:in.val[1].
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  // Presumably a per-lane dot product (aom_sdotq_s16); zero accumulator.
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]);
+  // Pairwise add folds each result's two 64-bit lanes into a single sum.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  // bd == 12 uses a rounding shift two bits larger than the other depths.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add the offset, then rounding right shift by round0.
+  int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));
+  res = vrshlq_s32(res, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));  // high half = 0
+}
+// Horizontal pass, 8 outputs: output i = window at lane i dotted with f[i].
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd,
+                                                        int sx, int alpha) {
+  int16x8_t f[8];
+  load_filters_8(f, sx, alpha);
+  // rvN: the 8 samples starting at lane N of the pair in.val[0]:in.val[1].
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 4);
+  int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 5);
+  int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 6);
+  int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 7);
+  // Presumably a per-lane dot product (aom_sdotq_s16); zero accumulator.
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f[4]);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f[5]);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f[6]);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f[7]);
+  // Pairwise add folds each result's two 64-bit lanes into a single sum.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+  // bd == 12 uses a rounding shift two bits larger than the other depths.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add the offset, then rounding right shift by round0.
+  int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+  res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));
+  res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+  res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));
+  res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+// Horizontal pass, 4 outputs, all sharing the one filter from load_filters_1.
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd,
+                                                        int sx) {
+  int16x8_t f = load_filters_1(sx);
+  // rvN: the 8 samples starting at lane N of the pair in.val[0]:in.val[1].
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  // Presumably a per-lane dot product (aom_sdotq_s16); zero accumulator.
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f);
+  // Pairwise add folds each result's two 64-bit lanes into a single sum.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  // bd == 12 uses a rounding shift two bits larger than the other depths.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add the offset, then rounding right shift by round0.
+  int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));
+  res = vrshlq_s32(res, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));  // high half = 0
+}
+// Horizontal pass, 8 outputs, all sharing the one filter from load_filters_1.
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd,
+                                                        int sx) {
+  int16x8_t f = load_filters_1(sx);
+  // rvN: the 8 samples starting at lane N of the pair in.val[0]:in.val[1].
+  int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 0);
+  int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 1);
+  int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 2);
+  int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 3);
+  int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 4);
+  int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 5);
+  int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 6);
+  int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+                            vreinterpretq_s16_u16(in.val[1]), 7);
+  // Presumably a per-lane dot product (aom_sdotq_s16); zero accumulator.
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f);
+  // Pairwise add folds each result's two 64-bit lanes into a single sum.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+  // bd == 12 uses a rounding shift two bits larger than the other depths.
+  const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  // Narrow to 32 bits, add the offset, then rounding right shift by round0.
+  int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+  res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));
+  res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+  res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));
+  res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+  return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+// Vertical pass, 4 columns: sum over k of low(tmp[k]) * tap k of one filter.
+static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) {
+  const int16x8_t f = load_filters_1(sy);
+  const int16x4_t f0123 = vget_low_s16(f);
+  const int16x4_t f4567 = vget_high_s16(f);
+  // The 8 taps are split into two 4-lane halves so each can be lane-indexed.
+  // No benefit to using SDOT here, the cost of rearrangement is too high.
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+  return m0123;
+}
+// Vertical pass, 8 columns, one shared filter; returns { cols 0-3, cols 4-7 }.
+static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) {
+  const int16x8_t f = load_filters_1(sy);
+  const int16x4_t f0123 = vget_low_s16(f);
+  const int16x4_t f4567 = vget_high_s16(f);
+  // Lanes 0-3 of each tmp[k], scaled by tap k and accumulated at 32 bits.
+  // No benefit to using SDOT here, the cost of rearrangement is too high.
+  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+  m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+  // Same taps applied to lanes 4-7 of each tmp[k].
+  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2);
+  m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3);
+  return (int32x4x2_t){ { m0123, m4567 } };
+}
+// Vertical pass, 4 outputs: output i = s_i dotted with f[i]; no rounding here.
+static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy,
+                                               int gamma) {
+  int16x8_t s0, s1, s2, s3;
+  transpose_elems_s16_4x8(
+      vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]),
+      vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]),
+      vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3);
+  // s0..s3 presumably hold columns 0..3 of the low halves of tmp[0..7].
+  int16x8_t f[4];
+  load_filters_4(f, sy, gamma);
+  // Presumably a per-lane dot product (aom_sdotq_s16); zero accumulator.
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+  // Pairwise add folds each result's two 64-bit lanes into a single sum.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  return vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+}
+// Vertical pass, 8 outputs: output i = s_i dotted with f[i]; no rounding here.
+static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy,
+                                                 int gamma) {
+  int16x8_t s0 = tmp[0];
+  int16x8_t s1 = tmp[1];
+  int16x8_t s2 = tmp[2];
+  int16x8_t s3 = tmp[3];
+  int16x8_t s4 = tmp[4];
+  int16x8_t s5 = tmp[5];
+  int16x8_t s6 = tmp[6];
+  int16x8_t s7 = tmp[7];
+  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+  // The transpose works on local copies, so the caller's tmp[] is untouched.
+  int16x8_t f[8];
+  load_filters_8(f, sy, gamma);
+  // Presumably a per-lane dot product (aom_sdotq_s16); zero accumulator.
+  int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+  int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+  int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+  int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]);
+  int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]);
+  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]);
+  int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]);
+  // Pairwise add folds each result's two 64-bit lanes into a single sum.
+  int64x2_t m01 = vpaddq_s64(m0, m1);
+  int64x2_t m23 = vpaddq_s64(m2, m3);
+  int64x2_t m45 = vpaddq_s64(m4, m5);
+  int64x2_t m67 = vpaddq_s64(m6, m7);
+  // val[0] carries outputs 0-3 and val[1] outputs 4-7, narrowed to 32 bits.
+  int32x4x2_t ret;
+  ret.val[0] = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+  ret.val[1] = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+  return ret;
+}
+// SVE entry point: forwards every argument to highbd_warp_affine_common().
+void av1_highbd_warp_affine_sve(const int32_t *mat, const uint16_t *ref,
+                                int width, int height, int stride,
+                                uint16_t *pred, int p_col, int p_row,
+                                int p_width, int p_height, int p_stride,
+                                int subsampling_x, int subsampling_y, int bd,
+                                ConvolveParams *conv_params, int16_t alpha,
+                                int16_t beta, int16_t gamma, int16_t delta) {
+  highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+                            p_width, p_height, p_stride, subsampling_x,
+                            subsampling_y, bd, conv_params, alpha, beta, gamma,
+                            delta);
+}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index c5fe389ba1..ef999fbba2 100644
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -541,7 +541,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
 # WARPED_MOTION / GLOBAL_MOTION functions
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-  specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/;
+  specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
 }
 
 add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
index 20a1e12476..f68af18cb1 100644
--- a/third_party/aom/av1/common/reconintra.c
+++ b/third_party/aom/av1/common/reconintra.c
@@ -1368,7 +1368,7 @@ void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) {
   }
 }
 
-static void highbd_build_intra_predictors(
+static void highbd_build_directional_and_filter_intra_predictors(
     const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
     PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
     TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
@@ -1376,7 +1376,7 @@ static void highbd_build_intra_predictors(
     int bit_depth) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
   DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
   uint16_t *const above_row = above_data + 16;
@@ -1390,7 +1390,8 @@ static void highbd_build_intra_predictors(
   const uint16_t *left_ref = ref - 1;
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
-  int base = 128 << (bit_depth - 8);
+  assert(use_filter_intra || is_dr_mode);
+  const int base = 128 << (bit_depth - 8);
   // The left_data, above_data buffers must be zeroed to fix some intermittent
   // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
   // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
@@ -1492,49 +1493,124 @@ static void highbd_build_intra_predictors(
     return;
   }
 
-  if (is_dr_mode) {
-    int upsample_above = 0;
-    int upsample_left = 0;
-    if (!disable_edge_filter) {
-      const int need_right = p_angle < 90;
-      const int need_bottom = p_angle > 180;
-      if (p_angle != 90 && p_angle != 180) {
-        const int ab_le = need_above_left ? 1 : 0;
-        if (need_above && need_left && (txwpx + txhpx >= 24)) {
-          highbd_filter_intra_edge_corner(above_row, left_col);
-        }
-        if (need_above && n_top_px > 0) {
-          const int strength = intra_edge_filter_strength(
-              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
-          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
-          av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
-        }
-        if (need_left && n_left_px > 0) {
-          const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
-          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
-          av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
-        }
+  assert(is_dr_mode);
+  int upsample_above = 0;
+  int upsample_left = 0;
+  if (!disable_edge_filter) {
+    const int need_right = p_angle < 90;
+    const int need_bottom = p_angle > 180;
+    if (p_angle != 90 && p_angle != 180) {
+      const int ab_le = need_above_left ? 1 : 0;
+      if (need_above && need_left && (txwpx + txhpx >= 24)) {
+        highbd_filter_intra_edge_corner(above_row, left_col);
       }
-      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
-                                                   intra_edge_filter_type);
-      if (need_above && upsample_above) {
-        const int n_px = txwpx + (need_right ? txhpx : 0);
-        av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
+      if (need_above && n_top_px > 0) {
+        const int strength = intra_edge_filter_strength(
+            txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
+        const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+        av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
       }
-      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
-                                                  intra_edge_filter_type);
-      if (need_left && upsample_left) {
-        const int n_px = txhpx + (need_bottom ? txwpx : 0);
-        av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
+      if (need_left && n_left_px > 0) {
+        const int strength = intra_edge_filter_strength(
+            txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
+        const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+        av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
       }
     }
-    highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                        upsample_above, upsample_left, p_angle, bit_depth);
+    upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                 intra_edge_filter_type);
+    if (need_above && upsample_above) {
+      const int n_px = txwpx + (need_right ? txhpx : 0);
+      av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
+    }
+    upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                intra_edge_filter_type);
+    if (need_left && upsample_left) {
+      const int n_px = txhpx + (need_bottom ? txwpx : 0);
+      av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
+    }
+  }
+  highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                      upsample_above, upsample_left, p_angle, bit_depth);
+}
+
+// For HBD encode/decode, this function generates the pred data of a given
+// block for non-directional intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H,
+// SMOOTH_V and PAETH).
+static void highbd_build_non_directional_intra_predictors(
+    const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+    PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px,
+    int bit_depth) {
+  int i = 0;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
+  const int txwpx = tx_size_wide[tx_size];
+  const int txhpx = tx_size_high[tx_size];
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
+  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+  const uint16_t *above_ref = ref - ref_stride;
+  const uint16_t *left_ref = ref - 1;
+  const int base = 128 << (bit_depth - 8);
+
+  assert(n_top_px >= 0);
+  assert(n_left_px >= 0);
+  assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
+         mode == SMOOTH_H_PRED || mode == PAETH_PRED);
+
+  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+    int val = 0;
+    if (need_left) {
+      val = (n_top_px > 0) ? above_ref[0] : base + 1;
+    } else {
+      val = (n_left_px > 0) ? left_ref[0] : base - 1;
+    }
+    for (i = 0; i < txhpx; ++i) {
+      aom_memset16(dst, val, txwpx);
+      dst += dst_stride;
+    }
     return;
   }
 
-  // predict
+  DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+  uint16_t *const above_row = above_data + 16;
+  uint16_t *const left_col = left_data + 16;
+
+  if (need_left) {
+    aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+    if (n_left_px > 0) {
+      for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+      if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i);
+    } else if (n_top_px > 0) {
+      aom_memset16(left_col, above_ref[0], txhpx);
+    }
+  }
+
+  if (need_above) {
+    aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+      i = n_top_px;
+      if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i));
+    } else if (n_left_px > 0) {
+      aom_memset16(above_row, left_ref[0], txwpx);
+    }
+  }
+
+  if (need_above_left) {
+    if (n_top_px > 0 && n_left_px > 0) {
+      above_row[-1] = above_ref[-1];
+    } else if (n_top_px > 0) {
+      above_row[-1] = above_ref[0];
+    } else if (n_left_px > 0) {
+      above_row[-1] = left_ref[0];
+    } else {
+      above_row[-1] = base;
+    }
+    left_col[-1] = above_row[-1];
+  }
+
   if (mode == DC_PRED) {
     dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
         dst, dst_stride, above_row, left_col, bit_depth);
@@ -1660,12 +1736,19 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
   // separate function build_non_directional_intra_predictors() is introduced
   // for these modes to avoid redundant computations while generating pred data.
 
-  // TODO(aomedia:3532): Enable this refactoring for high bd path as well.
-  if (!is_hbd && !use_filter_intra && !is_dr_mode) {
-    build_non_directional_intra_predictors(
-        ref, ref_stride, dst, dst_stride, mode, tx_size,
-        have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-        have_left ? AOMMIN(txhpx, yd + txhpx) : 0);
+  const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0;
+  const int n_left_px = have_left ? AOMMIN(txhpx, yd + txhpx) : 0;
+  if (!use_filter_intra && !is_dr_mode) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_hbd) {
+      highbd_build_non_directional_intra_predictors(
+          ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px,
+          xd->bd);
+      return;
+    }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride,
+                                           mode, tx_size, n_top_px, n_left_px);
     return;
   }
 
@@ -1717,25 +1800,23 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
 
   const int disable_edge_filter = !enable_intra_edge_filter;
   const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
+  const int n_topright_px =
+      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right;
+  const int n_bottomleft_px =
+      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left;
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_hbd) {
-    highbd_build_intra_predictors(
+    highbd_build_directional_and_filter_intra_predictors(
         ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
-        tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-        have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
-        have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-        have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
-        intra_edge_filter_type, xd->bd);
+        tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
+        n_bottomleft_px, intra_edge_filter_type, xd->bd);
     return;
   }
 #endif
   build_directional_and_filter_intra_predictors(
       ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
-      tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-      have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
-      have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-      have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
-      intra_edge_filter_type);
+      tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px,
+      n_bottomleft_px, intra_edge_filter_type);
 }
 
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 4732ad435b..fe053af5cc 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -2594,15 +2594,19 @@ static int encode_without_recode(AV1_COMP *cpi) {
       if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
         const YV12_BUFFER_CONFIG *const ref =
             get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-        if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+        if (ref == NULL || ref->y_crop_width != cm->width ||
+            ref->y_crop_height != cm->height) {
           cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+        }
       }
     }
     if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
       const YV12_BUFFER_CONFIG *const ref =
           get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+      if (ref == NULL || ref->y_crop_width != cm->width ||
+          ref->y_crop_height != cm->height) {
         cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+      }
     }
   }
 
@@ -2700,10 +2704,13 @@ static int encode_without_recode(AV1_COMP *cpi) {
     update_motion_stat(cpi);
 
   // Adjust the refresh of the golden (longer-term) reference based on QP
-  // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+  // selected for this frame. This applies only to CBR real-time mode with
+  // a single layer and without set_ref_frame_config in use (so the
+  // reference structure for the single layer is set internally).
   if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
       cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
       svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl &&
+      !cpi->ppi->rtc_ref.set_ref_frame_config &&
       sf->rt_sf.gf_refresh_based_on_qp)
     av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
 
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 5f6f67eda8..e87ab9be1f 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -3156,14 +3156,14 @@ typedef struct AV1_COMP {
   FRAME_INDEX_SET frame_index_set;
 
   /*!
-   * Store the cm->width in the last call of alloc_compressor_data(). Help
+   * Stores the cm->width in the last call of alloc_compressor_data(). Helps
    * determine whether compressor data should be reallocated when cm->width
    * changes.
    */
   int data_alloc_width;
 
   /*!
-   * Store the cm->height in the last call of alloc_compressor_data(). Help
+   * Stores the cm->height in the last call of alloc_compressor_data(). Helps
    * determine whether compressor data should be reallocated when cm->height
    * changes.
    */
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
index 4e53447379..f3a9828cb3 100644
--- a/third_party/aom/av1/encoder/mcomp.c
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -1807,7 +1807,6 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv,
   }
 
   assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride);
-  assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width);
 
   switch (search_method) {
     case FAST_BIGDIA:
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index a6c0971096..63d69cadc5 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -1624,6 +1624,14 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
     sf->rt_sf.use_rtc_tf = 0;
     sf->rt_sf.nonrd_prune_ref_frame_search = 1;
   }
+  // rtc_tf feature allocates new source because of possible
+  // temporal filtering which may change the input source during encoding:
+  // this causes an issue on resized frames when psnr is calculated,
+  // so disable it here for frames that are resized (encoding width/height
+  // different from configured width/height).
+  if (is_psnr_calc_enabled(cpi) && (cpi->oxcf.frm_dim_cfg.width != cm->width ||
+                                    cpi->oxcf.frm_dim_cfg.height != cm->height))
+    sf->rt_sf.use_rtc_tf = 0;
 }
 
 // TODO(kyslov): now this is very similar to
diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c
index 4d77a1b427..db02ca6299 100644
--- a/third_party/aom/common/tools_common.c
+++ b/third_party/aom/common/tools_common.c
@@ -97,7 +97,7 @@ int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) {
     int w = aom_img_plane_width(yuv_frame, plane);
     const int h = aom_img_plane_height(yuv_frame, plane);
     int r;
-    // Assuming that for nv12 we read all chroma data at one time
+    // Assuming that for nv12 we read all chroma data at once
     if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break;
     if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2;
     /* Determine the correct plane based on the image format. The for-loop
@@ -245,17 +245,21 @@ uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) {
 
 void aom_img_write(const aom_image_t *img, FILE *file) {
   int plane;
+  const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = aom_img_plane_width(img, plane) *
-                  ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    int w = aom_img_plane_width(img, plane);
     const int h = aom_img_plane_height(img, plane);
     int y;
 
+    // Assuming that for nv12 we write all chroma data at once
+    if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break;
+    if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2;
+
     for (y = 0; y < h; ++y) {
-      fwrite(buf, 1, w, file);
+      fwrite(buf, bytespp, w, file);
       buf += stride;
     }
   }
@@ -268,12 +272,16 @@ bool aom_img_read(aom_image_t *img, FILE *file) {
   for (plane = 0; plane < 3; ++plane) {
     unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = aom_img_plane_width(img, plane) * bytespp;
+    int w = aom_img_plane_width(img, plane);
     const int h = aom_img_plane_height(img, plane);
     int y;
 
+    // Assuming that for nv12 we read all chroma data at once
+    if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break;
+    if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2;
+
     for (y = 0; y < h; ++y) {
-      if (fread(buf, 1, w, file) != (size_t)w) return false;
+      if (fread(buf, bytespp, w, file) != (size_t)w) return false;
       buf += stride;
     }
   }
diff --git a/third_party/aom/test/av1_c_vs_simd_encode.sh b/third_party/aom/test/av1_c_vs_simd_encode.sh
index 296204d118..897ac081c1 100755
--- a/third_party/aom/test/av1_c_vs_simd_encode.sh
+++ b/third_party/aom/test/av1_c_vs_simd_encode.sh
@@ -104,16 +104,16 @@ av1_c_vs_simd_enc_verify_environment () {
 # }
 
 # Echo AOM_SIMD_CAPS_MASK for different instruction set architecture.
-avx512f() {
+avx2() {
    echo "0x1FF"
 }
 
-avx2() {
-   echo "0x0FF"
+avx() {
+   echo "0x17F"
 }
 
-avx() {
-   echo "0x07F"
+sse4_2() {
+   echo "0x13F"
 }
 
 sse4_1() {
@@ -443,21 +443,21 @@ av1_test_generic() {
   done
 }
 
-# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, AVX, AVX2 as there are
-# no functions with MMX, SSE and AVX512 specialization.
+# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, AVX, AVX2 as
+# there are no functions with MMX, SSE and AVX512 specialization.
 # The value of environment variable 'AOM_SIMD_CAPS_MASK' controls enabling of different instruction
 # set extension optimizations. The value of the flag 'AOM_SIMD_CAPS_MASK' and the corresponding
 # instruction set extension optimization enabled are as follows:
-# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
-#   1     1    1    1      1    1    1    1   1  -> 0x1FF -> Enable AVX512 and lower variants
-#   0     1    1    1      1    1    1    1   1  -> 0x0FF -> Enable AVX2 and lower variants
-#   0     0    1    1      1    1    1    1   1  -> 0x07F -> Enable AVX and lower variants
-#   0     0    0    1      1    1    1    1   1  -> 0x03F  -> Enable SSE4_1 and lower variants
-#   0     0    0    0      1    1    1    1   1  -> 0x01F  -> Enable SSSE3 and lower variants
-#   0     0    0    0      0    1    1    1   1  -> 0x00F  -> Enable SSE3 and lower variants
-#   0     0    0    0      0    0    1    1   1  -> 0x007  -> Enable SSE2 and lower variants
-#   0     0    0    0      0    0    0    1   1  -> 0x003  -> Enable SSE and lower variants
-#   0     0    0    0      0    0    0    0   1  -> 0x001  -> Enable MMX
+# SSE4_2 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
+#   1     1    1    1      1    1    1    1   1  -> 0x1FF -> Enable AVX2 and lower variants
+#   1     0    1    1      1    1    1    1   1  -> 0x17F -> Enable AVX and lower variants
+#   1     0    0    1      1    1    1    1   1  -> 0x13F -> Enable SSE4_2 and lower variants
+#   0     0    0    1      1    1    1    1   1  -> 0x03F -> Enable SSE4_1 and lower variants
+#   0     0    0    0      1    1    1    1   1  -> 0x01F -> Enable SSSE3 and lower variants
+#   0     0    0    0      0    1    1    1   1  -> 0x00F -> Enable SSE3 and lower variants
+#   0     0    0    0      0    0    1    1   1  -> 0x007 -> Enable SSE2 and lower variants
+#   0     0    0    0      0    0    0    1   1  -> 0x003 -> Enable SSE and lower variants
+#   0     0    0    0      0    0    0    0   1  -> 0x001 -> Enable MMX
 ## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "AOM_SIMD_CAPS_MASK" as
 #  all x86_64 platforms implement sse2.
 av1_test_x86() {
@@ -478,8 +478,8 @@ av1_test_x86() {
     local cmake_command="cmake $LIBAOM_SOURCE_DIR"
   fi
 
-  # Available x86 isa variants: "avx2 avx sse4_1 ssse3 sse3 sse2"
-  local x86_isa_variants="avx2 sse4_1 sse2"
+  # Available x86 isa variants: "avx2 avx sse4_2 sse4_1 ssse3 sse3 sse2"
+  local x86_isa_variants="avx2 sse4_2 sse2"
 
   echo "Build for x86: ${target}"
   if ! av1_enc_build "${target}" "${cmake_command}"; then
diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc
index 3865810e9b..c23b08e481 100644
--- a/third_party/aom/test/dr_prediction_test.cc
+++ b/third_party/aom/test/dr_prediction_test.cc
@@ -10,6 +10,7 @@
  */
 
 #include <tuple>
+#include <vector>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -18,6 +19,7 @@
 
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_timer.h"
+#include "aom_ports/sanitizer.h"
 #include "av1/common/blockd.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/reconintra.h"
@@ -149,8 +151,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
  protected:
   static const int kMaxNumTests = 10000;
   static const int kIterations = 10;
-  static const int kDstStride = 64;
-  static const int kDstSize = kDstStride * kDstStride;
   static const int kOffset = 16;
   static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16;
 
@@ -161,9 +161,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     start_angle_ = params_.start_angle;
     stop_angle_ = start_angle_ + 90;
 
-    dst_ref_ = &dst_ref_data_[0];
-    dst_tst_ = &dst_tst_data_[0];
-    dst_stride_ = kDstStride;
     above_ = &above_data_[kOffset];
     left_ = &left_data_[kOffset];
 
@@ -171,16 +168,12 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
       above_data_[i] = rng_.Rand8();
       left_data_[i] = rng_.Rand8();
     }
-
-    for (int i = 0; i < kDstSize; ++i) {
-      dst_ref_[i] = 0;
-      dst_tst_[i] = 0;
-    }
   }
 
   ~DrPredTest() override = default;
 
-  void Predict(bool speedtest, int tx) {
+  void Predict(bool speedtest, int tx, Pixel *dst_ref, Pixel *dst_tst,
+               int dst_stride) {
     const int kNumTests = speedtest ? kMaxNumTests : 1;
     aom_usec_timer timer;
     int tst_time = 0;
@@ -189,7 +182,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
 
     aom_usec_timer_start(&timer);
     for (int k = 0; k < kNumTests; ++k) {
-      params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_,
+      params_.ref_fn(dst_ref, dst_stride, bw_, bh_, above_, left_,
                      upsample_above_, upsample_left_, dx_, dy_, bd_);
     }
     aom_usec_timer_mark(&timer);
@@ -198,15 +191,17 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     if (params_.tst_fn) {
       aom_usec_timer_start(&timer);
       for (int k = 0; k < kNumTests; ++k) {
-        API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
+        API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst, dst_stride, bw_, bh_,
                                                 above_, left_, upsample_above_,
                                                 upsample_left_, dx_, dy_, bd_));
       }
       aom_usec_timer_mark(&timer);
       tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
     } else {
-      for (int i = 0; i < kDstSize; ++i) {
-        dst_ref_[i] = dst_tst_[i];
+      for (int r = 0; r < bh_; ++r) {
+        for (int c = 0; c < bw_; ++c) {
+          dst_tst[r * dst_stride + c] = dst_ref[r * dst_stride + c];
+        }
       }
     }
 
@@ -222,18 +217,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
       }
     }
     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
-      if (params_.tst_fn == nullptr) {
-        for (int i = 0; i < kDstSize; ++i) {
-          dst_tst_[i] = (1 << bd_) - 1;
-          dst_ref_[i] = (1 << bd_) - 1;
-        }
-      } else {
-        for (int i = 0; i < kDstSize; ++i) {
-          dst_ref_[i] = 0;
-          dst_tst_[i] = 0;
-        }
-      }
-
       bw_ = tx_size_wide[kTxSize[tx]];
       bh_ = tx_size_high[kTxSize[tx]];
 
@@ -246,12 +229,31 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
         upsample_above_ = upsample_left_ = 0;
       }
 
-      Predict(speedtest, tx);
+      // Add additional padding to allow detection of over reads/writes when
+      // the transform width is equal to MAX_TX_SIZE.
+      const int dst_stride = MAX_TX_SIZE + 16;
+      std::vector<Pixel> dst_ref(dst_stride * bh_);
+      std::vector<Pixel> dst_tst(dst_stride * bh_);
+
+      for (int r = 0; r < bh_; ++r) {
+        ASAN_POISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_],
+                                  (dst_stride - bw_) * sizeof(Pixel));
+        ASAN_POISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_],
+                                  (dst_stride - bw_) * sizeof(Pixel));
+      }
+
+      Predict(speedtest, tx, dst_ref.data(), dst_tst.data(), dst_stride);
+
+      for (int r = 0; r < bh_; ++r) {
+        ASAN_UNPOISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_],
+                                    (dst_stride - bw_) * sizeof(Pixel));
+        ASAN_UNPOISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_],
+                                    (dst_stride - bw_) * sizeof(Pixel));
+      }
 
       for (int r = 0; r < bh_; ++r) {
         for (int c = 0; c < bw_; ++c) {
-          ASSERT_EQ(dst_ref_[r * dst_stride_ + c],
-                    dst_tst_[r * dst_stride_ + c])
+          ASSERT_EQ(dst_ref[r * dst_stride + c], dst_tst[r * dst_stride + c])
               << bw_ << "x" << bh_ << " r: " << r << " c: " << c
               << " dx: " << dx_ << " dy: " << dy_
               << " upsample_above: " << upsample_above_
@@ -292,18 +294,12 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
     }
   }
 
-  Pixel dst_ref_data_[kDstSize];
-  Pixel dst_tst_data_[kDstSize];
-
   Pixel left_data_[kBufSize];
   Pixel dummy_data_[kBufSize];
   Pixel above_data_[kBufSize];
 
-  Pixel *dst_ref_;
-  Pixel *dst_tst_;
   Pixel *above_;
   Pixel *left_;
-  int dst_stride_;
 
   int enable_upsample_;
   int upsample_above_;
diff --git a/third_party/aom/test/encode_api_test.cc b/third_party/aom/test/encode_api_test.cc
index aa4084f9e4..605743f9be 100644
--- a/third_party/aom/test/encode_api_test.cc
+++ b/third_party/aom/test/encode_api_test.cc
@@ -654,6 +654,52 @@ TEST(EncodeAPI, AllIntraMode) {
   cfg.kf_max_dist = 1;
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
 }
-#endif
+
+TEST(EncodeAPI, AllIntraAndUsePsnr) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA),
+            AOM_CODEC_OK);
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_PSNR),
+            AOM_CODEC_OK);
+
+  aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK);
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = nullptr;
+  while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) {
+      ASSERT_EQ(pkt->kind, AOM_CODEC_PSNR_PKT);
+    }
+  }
+
+  aom_img_free(image);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// A test that reproduces bug aomedia:3534.
+TEST(EncodeAPI, AllIntraAndNoRefLast) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA),
+            AOM_CODEC_OK);
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+  aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, AOM_EFLAG_NO_REF_LAST),
+            AOM_CODEC_OK);
+
+  aom_img_free(image);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+#endif  // !CONFIG_REALTIME_ONLY
 
 }  // namespace
diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
index 7bad45300a..755d4e3d02 100644
--- a/third_party/aom/test/resize_test.cc
+++ b/third_party/aom/test/resize_test.cc
@@ -11,15 +11,17 @@
 
 #include <climits>
 #include <vector>
+
+#include "aom/aomcx.h"
 #include "aom_dsp/aom_dsp_common.h"
-#include "common/tools_common.h"
 #include "av1/encoder/encoder.h"
+#include "common/tools_common.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
-#include "test/video_source.h"
 #include "test/util.h"
+#include "test/video_source.h"
 #include "test/y4m_video_source.h"
 
 // Enable(1) or Disable(0) writing of the compressed bitstream.
@@ -403,7 +405,7 @@ class ResizeRealtimeTest
   ResizeRealtimeTest()
       : EncoderTest(GET_PARAM(0)), num_threads_(GET_PARAM(3)),
         set_scale_mode_(false), set_scale_mode2_(false),
-        set_scale_mode3_(false) {}
+        set_scale_mode3_(false), is_screen_(false) {}
   ~ResizeRealtimeTest() override = default;
 
   void PreEncodeFrameHook(libaom_test::VideoSource *video,
@@ -415,6 +417,8 @@ class ResizeRealtimeTest
       encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      if (is_screen_)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
     }
     if (set_scale_mode_) {
       struct aom_scaling_mode mode;
@@ -508,6 +512,7 @@ class ResizeRealtimeTest
   bool set_scale_mode_;
   bool set_scale_mode2_;
   bool set_scale_mode3_;
+  bool is_screen_;
 };
 
 // Check the AOME_SET_SCALEMODE control by downsizing to
@@ -740,6 +745,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
 TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
+  init_flags_ = AOM_CODEC_USE_PSNR;
   cfg_.g_w = 640;
   cfg_.g_h = 480;
   change_bitrate_ = true;
@@ -795,6 +801,63 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
 #endif
 }
 
+// Verify the dynamic resizer behavior for real time, 1 pass CBR mode for
+// screen content mode. Start at low target bitrate, raise the bitrate in the
+// middle of the clip (at frame# = frame_change_bitrate_), scaling-up should
+// occur after bitrate is increased.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRateScreen) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+  change_bitrate_ = true;
+  frame_change_bitrate_ = 120;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  is_screen_ = true;
+  DefaultConfig();
+  // Disable dropped frames.
+  cfg_.rc_dropframe_thresh = 0;
+  // Starting bitrate low.
+  cfg_.rc_target_bitrate = 100;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;
+  unsigned int last_h = cfg_.g_h;
+  unsigned int frame_number = 0;
+  int resize_down_count = 0;
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    if (info->w != last_w || info->h != last_h) {
+      if (frame_number < frame_change_bitrate_) {
+        // Verify that resize down occurs, before bitrate is increased.
+        ASSERT_LT(info->w, last_w);
+        ASSERT_LT(info->h, last_h);
+        resize_down_count++;
+      }
+      last_w = info->w;
+      last_h = info->h;
+    }
+    frame_number++;
+  }
+
+#if CONFIG_AV1_DECODER
+  // Verify that we get at least 1 resize event in this test.
+  ASSERT_GE(resize_down_count, 1)
+      << "Resizing down should occur at least once.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
+}
+
 class ResizeCspTest : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc
index a493a1f4cb..e31f8f820c 100644
--- a/third_party/aom/test/variance_test.cc
+++ b/third_party/aom/test/variance_test.cc
@@ -2165,11 +2165,7 @@ INSTANTIATE_TEST_SUITE_P(
                       MseParams(4, 4, &aom_highbd_10_mse16x16_sve, 10),
                       MseParams(4, 3, &aom_highbd_10_mse16x8_sve, 10),
                       MseParams(3, 4, &aom_highbd_10_mse8x16_sve, 10),
-                      MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10),
-                      MseParams(4, 4, &aom_highbd_8_mse16x16_sve, 8),
-                      MseParams(4, 3, &aom_highbd_8_mse16x8_sve, 8),
-                      MseParams(3, 4, &aom_highbd_8_mse8x16_sve, 8),
-                      MseParams(3, 3, &aom_highbd_8_mse8x8_sve, 8)));
+                      MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10)));
 #endif  // HAVE_SVE
 
 const VarianceParams kArrayHBDVariance_c[] = {
diff --git a/third_party/aom/test/warp_filter_test.cc b/third_party/aom/test/warp_filter_test.cc
index f0be7d226b..8844ba77ca 100644
--- a/third_party/aom/test/warp_filter_test.cc
+++ b/third_party/aom/test/warp_filter_test.cc
@@ -88,6 +88,12 @@ INSTANTIATE_TEST_SUITE_P(
 INSTANTIATE_TEST_SUITE_P(
     SVE, AV1WarpFilterTest,
     libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    SVE, AV1HighbdWarpFilterTest,
+    libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_sve));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_SVE
 
 }  // namespace
-- 
cgit v1.2.3