author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:35:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:35:49 +0000
commit     d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree       eff41dc61d9f714852212739e6b3738b82a2af87 /media/libvpx/libvpx
parent     Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
download   firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz
           firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/libvpx/libvpx')
-rw-r--r--  media/libvpx/libvpx/.mailmap | 3
-rw-r--r--  media/libvpx/libvpx/AUTHORS | 11
-rw-r--r--  media/libvpx/libvpx/CHANGELOG | 76
-rw-r--r--  media/libvpx/libvpx/README | 40
-rw-r--r--  media/libvpx/libvpx/build/make/Android.mk | 13
-rw-r--r--  media/libvpx/libvpx/build/make/Makefile | 2
-rw-r--r--  media/libvpx/libvpx/build/make/configure.sh | 109
-rwxr-xr-x  media/libvpx/libvpx/build/make/rtcd.pl | 2
-rwxr-xr-x  media/libvpx/libvpx/configure | 7
-rw-r--r--  media/libvpx/libvpx/examples/resize_util.c | 2
-rw-r--r--  media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c | 9
-rw-r--r--  media/libvpx/libvpx/examples/vp9cx_set_ref.c | 2
-rw-r--r--  media/libvpx/libvpx/libs.doxy_template | 8
-rw-r--r--  media/libvpx/libvpx/libs.mk | 4
-rw-r--r--  media/libvpx/libvpx/test/android/get_files.py | 17
-rw-r--r--  media/libvpx/libvpx/test/avg_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/test/codec_factory.h | 8
-rw-r--r--  media/libvpx/libvpx/test/convolve_test.cc | 86
-rw-r--r--  media/libvpx/libvpx/test/encode_api_test.cc | 418
-rw-r--r--  media/libvpx/libvpx/test/frame_size_tests.cc | 2
-rw-r--r--  media/libvpx/libvpx/test/init_vpx_test.cc | 3
-rw-r--r--  media/libvpx/libvpx/test/resize_test.cc | 10
-rw-r--r--  media/libvpx/libvpx/test/sum_squares_test.cc | 7
-rw-r--r--  media/libvpx/libvpx/test/variance_test.cc | 261
-rw-r--r--  media/libvpx/libvpx/test/video_source.h | 2
-rw-r--r--  media/libvpx/libvpx/test/vp8_datarate_test.cc | 25
-rw-r--r--  media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc | 7
-rw-r--r--  media/libvpx/libvpx/test/vp9_block_error_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc | 987
-rw-r--r--  media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc | 3
-rw-r--r--  media/libvpx/libvpx/test/vp9_scale_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/tools_common.c | 36
-rw-r--r--  media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/entropy.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/generic/systemdependent.c | 41
-rw-r--r--  media/libvpx/libvpx/vp8/common/onyx.h | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/threading.h | 153
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/onyxd_if.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/onyxd_int.h | 5
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/threading.c | 33
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/encodeframe.c | 46
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/ethreading.c | 63
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/onyx_if.c | 48
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/onyx_int.h | 9
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/ratectrl.c | 29
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/tokenize.h | 2
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_cx_iface.c | 84
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_dx_iface.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc | 13
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h | 10
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl | 6
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_thread_common.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_thread_common.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decoder.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decoder.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c | 78
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_block.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c | 6
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c | 107
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 174
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 8
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.c | 13
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.h | 3
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c | 52
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h | 9
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.c | 39
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.h | 3
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c | 114
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c | 97
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_quantize.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c | 81
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h | 6
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 74
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 16
-rw-r--r--  media/libvpx/libvpx/vp9/ratectrl_rtc.cc | 2
-rw-r--r--  media/libvpx/libvpx/vp9/ratectrl_rtc.h | 35
-rw-r--r--  media/libvpx/libvpx/vp9/simple_encode.cc | 12
-rw-r--r--  media/libvpx/libvpx/vp9/vp9_cx_iface.c | 102
-rw-r--r--  media/libvpx/libvpx/vp9/vp9_dx_iface.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/vp9cx.mk | 1
-rw-r--r--  media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h | 8
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_encoder.c | 9
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_image.c | 4
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_tpl.c | 6
-rw-r--r--  media/libvpx/libvpx/vpx/vp8cx.h | 2
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_encoder.h | 18
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h | 24
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_tpl.h | 22
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c | 68
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c | 344
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 1905
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c | 351
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c | 452
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 58
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 201
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c | 73
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h | 72
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 897
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 449
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 1428
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 1250
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 58
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 66
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 66
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h | 32
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h | 51
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 445
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 8
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 124
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_filter.h | 1
-rw-r--r--  media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c | 21
-rw-r--r--  media/libvpx/libvpx/vpx_ports/arm.h | 2
-rw-r--r--  media/libvpx/libvpx/vpx_ports/emms_mmx.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_ports/mem.h | 8
-rw-r--r--  media/libvpx/libvpx/vpx_ports/vpx_once.h | 23
-rw-r--r--  media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_pthread.h | 157
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_thread.c | 93
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_thread.h | 366
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_util.mk | 1
132 files changed, 6973 insertions(+), 5994 deletions(-)
diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap
index bb0ddd95b2..7206b5ebec 100644
--- a/media/libvpx/libvpx/.mailmap
+++ b/media/libvpx/libvpx/.mailmap
@@ -20,6 +20,7 @@ Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
Johann <johann@duck.com> <johann.koenig@gmail.com>
@@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>
Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiwei Gu <guxiwei-hf@loongson.cn>
diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS
index 2db4a113e4..5515e26589 100644
--- a/media/libvpx/libvpx/AUTHORS
+++ b/media/libvpx/libvpx/AUTHORS
@@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
Anton Venema <anton.venema@liveswitch.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Birk Magnussen <birk.magnussen@googlemail.com>
@@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
Cheng Chen <chengchen@google.com>
+Chen Wang <wangchen20@iscas.ac.cn>
+Cherma Rajan A <cherma.rajan@ittiam.com>
Chi Yo Tsai <chiyotsai@google.com>
chm <chm@rock-chips.com>
Chris Cunningham <chcunningham@chromium.org>
@@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com>
Fyodor Kyslov <kyslov@google.com>
Gabriel Marin <gmx@chromium.org>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+George Steed <george.steed@arm.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
@@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn>
Jingning Han <jingning@google.com>
Joel Fernandes <joelaf@google.com>
Joey Parrish <joeyparrish@google.com>
+Johann <johann@duck.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
@@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net>
Konstantinos Margaritis <konma@vectorcamp.gr>
Kyle Siefring <kylesiefring@gmail.com>
Lawrence Velázquez <larryv@macports.org>
+L. E. Segovia <amy@amyspark.me>
Linfeng Zhang <linfengz@google.com>
Liu Peng <pengliu.mail@gmail.com>
Lou Quillio <louquillio@google.com>
@@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com>
Moriyoshi Koizumi <mozo@mozo.jp>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
Neil Birkbeck <neil.birkbeck@gmail.com>
Nico Weber <thakis@chromium.org>
Niveditha Rau <niveditha.rau@gmail.com>
@@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org>
Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Wan-Teh Chang <wtc@google.com>
Wonkap Jang <wonkap@google.com>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiahong Bao <xiahong.bao@nxp.com>
+Xiwei Gu <guxiwei-hf@loongson.cn>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG
index 21070785ed..87f0d7f708 100644
--- a/media/libvpx/libvpx/CHANGELOG
+++ b/media/libvpx/libvpx/CHANGELOG
@@ -1,7 +1,79 @@
-20yy-mm-dd v1.14.0 "V Duck"
+2024-01-02 v1.14.0 "Venetian Duck"
This release drops support for old C compilers, such as Visual Studio 2012
and older, that disallow mixing variable declarations and statements (a C99
- feature).
+ feature). It adds support for run-time CPU feature detection for Arm
+ platforms, as well as support for darwin23 (macOS 14).
+
+ - Upgrading:
+ This release is ABI incompatible with the previous release.
+
+ Various new features for rate control library for real-time: SVC parallel
+ encoding, loopfilter level, support for frame dropping, and screen content.
+
+ New callback function send_tpl_gop_stats for vp9 external rate control
+ library, which can be used to transmit TPL stats for a group of pictures. A
+ public header vpx_tpl.h is added for the definition of TPL stats used in
+ this callback.
+
+ libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c.
+
+ - Enhancement:
+ Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8,
+ 68%-151% speed up for high bitdepth.
+
+ Improvements on AVX2 and SSE optimizations.
+ Improvements on LSX optimizations for LoongArch.
+ 42-49% speedup on speed 0 VoD encoding.
+ Android API level predicates.
+
+ - Bug fixes:
+ Fix to missing prototypes from the rtcd header.
+ Fix to segfault when total size is enlarged but width is smaller.
+ Fix to the build for arm64ec using MSVC.
+ Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic.
+ Fix to -Wshadow warnings.
+ Fix to heap overflow in vpx_get4x4sse_cs_neon.
+ Fix to buffer overrun in highbd Neon subpel variance filters.
+ Added bitexact encode test script.
+ Fix to -Wl,-z,defs with Clang's sanitizers.
+ Fix to decoder stability after error & continued decoding.
+ Fix to mismatch of VP9 encode with NEON intrinsics with C only version.
+ Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon.
+ Fix to fragments count before use.
+ Fix to a case where target bandwidth is 0 for SVC.
+ Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob.
+ Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr.
+ Fix to integer overflow in vp8,ratectrl.c.
+ Fix to integer overflow in vp9 svc.
+ Fix to avg_frame_bandwidth overflow.
+ Fix to per frame qp for temporal layers.
+ Fix to unsigned integer overflow in sse computation.
+ Fix to uninitialized mesh feature for BEST mode.
+ Fix to overflow in highbd temporal_filter.
+ Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon.
+ Skip arm64_neon.h workaround w/VS >= 2019.
+ Fix to c vs avx mismatch of diamond_search_sad().
+ Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function.
+ Fix to a bug in vpx_hadamard_32x32_neon().
+ Fix to Clang -Wunreachable-code-aggressive warnings.
+ Fix to a bug in vpx_highbd_hadamard_32x32_neon().
+ Fix to -Wunreachable-code in mfqe_partition.
+ Force mode search on 64x64 if no mode is selected.
+ Fix to ubsan failure caused by left shift of negative.
+ Fix to integer overflow in calc_pframe_target_size.
+ Fix to float-cast-overflow in vp8_change_config().
+ Fix to a null ptr before use.
+ Conditionally skip using inter frames in speed features.
+ Remove invalid reference frames.
+ Disable intra mode search speed features conditionally.
+ Set nonrd keyframe under dynamic change of deadline for rtc.
+ Fix to scaled reference offsets.
+ Set skip_recode=0 in nonrd_pick_sb_modes.
+ Fix to an edge case when downsizing to one.
+ Fix to a bug in frame scaling.
+ Fix to pred buffer stride.
+ Fix to a bug in simple motion search.
+ Update frame size in actual encoding.
2023-09-29 v1.13.1 "Ugly Duckling"
This release contains two security related fixes. One each for VP8 and VP9.
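
The send_tpl_gop_stats callback introduced in the changelog above belongs to
the VP9 external rate control interface. Below is a minimal sketch of a
receiver, assuming the vpx_rc_funcs_t registration path via
VP9E_SET_EXTERNAL_RATE_CONTROL and the VpxTplGopStats layout from the new
vpx/vpx_tpl.h; member names are taken from the 1.14.0 headers and should be
treated as assumptions, not a definitive implementation:

  #include "vpx/vp8cx.h"
  #include "vpx/vpx_ext_ratectrl.h"
  #include "vpx/vpx_tpl.h"

  /* Callback invoked once per group of pictures with its TPL stats. */
  static vpx_rc_status_t my_send_tpl_gop_stats(
      vpx_rc_model_t model, const VpxTplGopStats *gop_stats) {
    (void)model;
    for (int i = 0; i < gop_stats->size; ++i) {
      /* Each entry carries per-block TPL stats for one frame. */
      const VpxTplFrameStats *fs = &gop_stats->frame_stats_list[i];
      (void)fs;
    }
    return VPX_RC_OK;
  }

  /* Register the callback alongside the other external-RC hooks. */
  static void register_ext_rc(vpx_codec_ctx_t *enc, vpx_rc_funcs_t *funcs) {
    funcs->send_tpl_gop_stats = my_send_tpl_gop_stats;
    vpx_codec_control(enc, VP9E_SET_EXTERNAL_RATE_CONTROL, funcs);
  }
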
diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README
index 4c25b15d81..6dbd164c34 100644
--- a/media/libvpx/libvpx/README
+++ b/media/libvpx/libvpx/README
@@ -1,5 +1,3 @@
-v1.13.1 Ugly Duckling
-
Welcome to the WebM VP8/VP9 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
@@ -183,6 +181,44 @@ CODE STYLE:
See also: http://clang.llvm.org/docs/ClangFormat.html
+PROFILE GUIDED OPTIMIZATION (PGO)
+ Profile Guided Optimization can be enabled for Clang builds using the
+ commands:
+
+ $ export CC=clang
+ $ export CXX=clang++
+ $ ../libvpx/configure --enable-profile
+ $ make
+
+ Generate one or multiple PGO profile files by running vpxdec or vpxenc. For
+ example:
+
+ $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \
+ -o - > /dev/null
+
+ To convert and merge the raw profile files, use the llvm-profdata tool:
+
+ $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw
+
+ Then, rebuild the project with the new profile file:
+
+ $ make clean
+ $ ../libvpx/configure --use-profile=perf.profdata
+ $ make
+
+ Note: Always use the llvm-profdata from the toolchain that is used for
+ compiling the PGO-enabled binary.
+
+ To observe the improvements from a PGO-enabled build, enable and compare the
+ list of failed optimizations by using the -Rpass-missed compiler flag. For
+ example, to list the failed loop vectorizations:
+
+ $ ../libvpx/configure --use-profile=perf.profdata \
+ --extra-cflags=-Rpass-missed=loop-vectorize
+
+ For guidance on utilizing PGO files to identify potential optimization
+ opportunities, see: tools/README.pgo.md
+
SUPPORT
This library is an open source project supported by its community. Please
email webm-discuss@webmproject.org for help.
diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk
index ba24f541b1..533f43c1c2 100644
--- a/media/libvpx/libvpx/build/make/Android.mk
+++ b/media/libvpx/libvpx/build/make/Android.mk
@@ -15,13 +15,9 @@ ifdef NDK_ROOT
# In an Android project place a libvpx checkout in the jni directory.
# Run the configure script from the jni directory. Base libvpx
# encoder/decoder configuration will look similar to:
-# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
+# ./libvpx/configure --target=arm64-android-gcc --disable-examples \
# --enable-external-build
#
-# When targeting Android, realtime-only is enabled by default. This can
-# be overridden by adding the command line flag:
-# --disable-realtime-only
-#
# This will create .mk files that contain variables that contain the
# source files to compile.
#
@@ -38,11 +34,14 @@ ifdef NDK_ROOT
# but the resulting library *must* be run on devices supporting all of the
# enabled extensions. They can be disabled individually with
# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512}
-# --disable-neon[-asm]
+# --disable-neon{, -asm, -neon-dotprod, -neon-i8mm}
+# --disable-sve
# --disable-{dspr2, msa}
#
-# Running ndk-build will build libvpx and include it in your project.
+# Running ndk-build will build libvpx and include it in your project. Set
+# APP_ABI to match the --target passed to configure:
+# https://developer.android.com/ndk/guides/application_mk#app_abi.
#
CONFIG_DIR := $(LOCAL_PATH)/
diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile
index 199ed78058..658b37617b 100644
--- a/media/libvpx/libvpx/build/make/Makefile
+++ b/media/libvpx/libvpx/build/make/Makefile
@@ -150,6 +150,8 @@ $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+sve2
+$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+sve2
# POWER
$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh
index 869793a296..009bf7db5c 100644
--- a/media/libvpx/libvpx/build/make/configure.sh
+++ b/media/libvpx/libvpx/build/make/configure.sh
@@ -74,6 +74,8 @@ Build options:
--cpu=CPU optimize for a specific cpu rather than a family
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]
--extra-cxxflags=ECXXFLAGS add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
+ --use-profile=PROFILE_FILE
+ Use PROFILE_FILE for PGO
${toggle_extra_warnings} emit harmless warnings (always non-fatal)
${toggle_werror} treat warnings as errors, if possible
(not available with all compilers)
@@ -81,6 +83,7 @@ Build options:
${toggle_pic} turn on/off Position Independent Code
${toggle_ccache} turn on/off compiler cache
${toggle_debug} enable/disable debug mode
+ ${toggle_profile} enable/disable profiling
${toggle_gprof} enable/disable gprof profiling instrumentation
${toggle_gcov} enable/disable gcov coverage instrumentation
${toggle_thumb} enable/disable building arm assembly in thumb mode
@@ -429,6 +432,26 @@ check_gcc_machine_options() {
fi
}
+check_neon_sve_bridge_compiles() {
+ if enabled sve; then
+ check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+EOF
+ compile_result=$?
+ if [ ${compile_result} -ne 0 ]; then
+ log_echo " disabling sve: arm_neon_sve_bridge.h not supported by compiler"
+ log_echo " disabling sve2: arm_neon_sve_bridge.h not supported by compiler"
+ disable_feature sve
+ disable_feature sve2
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 "
+ fi
+ fi
+}
+
check_gcc_avx512_compiles() {
if disabled gcc; then
return
@@ -611,6 +634,9 @@ process_common_cmdline() {
--extra-cxxflags=*)
extra_cxxflags="${optval}"
;;
+ --use-profile=*)
+ pgo_file=${optval}
+ ;;
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if is_in ${option} ${ARCH_EXT_LIST}; then
@@ -951,7 +977,7 @@ EOF
add_cflags "-mmacosx-version-min=10.15"
add_ldflags "-mmacosx-version-min=10.15"
;;
- *-darwin2[0-2]-*)
+ *-darwin2[0-3]-*)
add_cflags "-arch ${toolchain%%-*}"
add_ldflags "-arch ${toolchain%%-*}"
;;
@@ -980,36 +1006,18 @@ EOF
case ${toolchain} in
arm*)
soft_enable runtime_cpu_detect
- # Arm ISA extensions are treated as supersets.
- case ${tgt_isa} in
- arm64|armv8)
- for ext in ${ARCH_EXT_LIST_AARCH64}; do
- # Disable higher order extensions to simplify dependencies.
- if [ "$disable_exts" = "yes" ]; then
- if ! disabled $ext; then
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
- disable_feature $ext
- fi
- elif disabled $ext; then
- disable_exts="yes"
- else
- soft_enable $ext
- fi
- done
- ;;
- armv7|armv7s)
- soft_enable neon
- # Only enable neon_asm when neon is also enabled.
- enabled neon && soft_enable neon_asm
- # If someone tries to force it through, die.
- if disabled neon && enabled neon_asm; then
- die "Disabling neon while keeping neon-asm is not supported"
- fi
- ;;
- esac
- asm_conversion_cmd="cat"
+ if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+ soft_enable neon
+ # Only enable neon_asm when neon is also enabled.
+ enabled neon && soft_enable neon_asm
+ # If someone tries to force it through, die.
+ if disabled neon && enabled neon_asm; then
+ die "Disabling neon while keeping neon-asm is not supported"
+ fi
+ fi
+ asm_conversion_cmd="cat"
case ${tgt_cc} in
gcc)
link_with_cc=gcc
@@ -1228,6 +1236,38 @@ EOF
fi
;;
esac
+
+ # AArch64 ISA extensions are treated as supersets.
+ if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+ aarch64_arch_flag_neon="arch=armv8-a"
+ aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod"
+ aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm"
+ aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve"
+ aarch64_arch_flag_sve2="arch=armv9-a+sve2"
+ for ext in ${ARCH_EXT_LIST_AARCH64}; do
+ if [ "$disable_exts" = "yes" ]; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ soft_disable $ext
+ else
+ # Check the compiler supports the -march flag for the extension.
+ # This needs to happen after toolchain/OS inspection so we handle
+ # $CROSS etc correctly when checking for flags, else these will
+ # always fail.
+ flag="$(eval echo \$"aarch64_arch_flag_${ext}")"
+ check_gcc_machine_option "${flag}" "${ext}"
+ if ! enabled $ext; then
+ # Disable higher order extensions to simplify dependencies.
+ disable_exts="yes"
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ soft_disable $ext
+ fi
+ fi
+ done
+ if enabled sve; then
+ check_neon_sve_bridge_compiles
+ fi
+ fi
+
;;
mips*)
link_with_cc=gcc
@@ -1484,6 +1524,14 @@ EOF
;;
esac
+ # Enable PGO
+ if [ -n "${pgo_file}" ]; then
+ check_add_cflags -fprofile-use=${pgo_file} || \
+ die "-fprofile-use is not supported by compiler"
+ check_add_ldflags -fprofile-use=${pgo_file} || \
+ die "-fprofile-use is not supported by linker"
+ fi
+
# Try to enable CPU specific tuning
if [ -n "${tune_cpu}" ]; then
if [ -n "${tune_cflags}" ]; then
@@ -1504,6 +1552,9 @@ EOF
else
check_add_cflags -DNDEBUG
fi
+ enabled profile &&
+ check_add_cflags -fprofile-generate &&
+ check_add_ldflags -fprofile-generate
enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
enabled gcov &&
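
The check_neon_sve_bridge_compiles probe added above gates the new SVE and
SVE2 kernels on the availability of arm_neon_sve_bridge.h. Below is a minimal
sketch of the Neon/SVE mixing that header enables, assuming the ACLE bridge
intrinsics; libvpx's real helpers live in
vpx_dsp/arm/vpx_neon_sve_bridge.h, and this is an illustration rather than
that file:

  #include <arm_neon.h>
  #include <arm_sve.h>
  #include <arm_neon_sve_bridge.h>

  /* Accumulating dot product of two Neon int16x8_t vectors using the SVE
   * SDOT (16-bit to 64-bit) instruction, which has no Neon equivalent.
   * Build with -march=armv8.2-a+dotprod+i8mm+sve, matching the Makefile
   * rules added for *_sve.c files. */
  static int64x2_t dotq_s16(int64x2_t acc, int16x8_t a, int16x8_t b) {
    svint64_t sum = svdot_s64(svset_neonq_s64(svundef_s64(), acc),
                              svset_neonq_s16(svundef_s16(), a),
                              svset_neonq_s16(svundef_s16(), b));
    return svget_neonq_s64(sum);
  }

The probe keys on the __ARM_NEON_SVE_BRIDGE feature macro rather than on SVE
support alone, so toolchains whose arm_sve.h predates the bridge header
disable both sve and sve2 cleanly.
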
diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl
index 0b9e16738e..025238d678 100755
--- a/media/libvpx/libvpx/build/make/rtcd.pl
+++ b/media/libvpx/libvpx/build/make/rtcd.pl
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter(qw/neon_asm neon/);
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
- @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
+ @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/);
@REQUIRES = filter(qw/neon/);
&require(@REQUIRES);
arm;
diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure
index b212e0709d..97e78996e8 100755
--- a/media/libvpx/libvpx/configure
+++ b/media/libvpx/libvpx/configure
@@ -260,6 +260,7 @@ ARCH_EXT_LIST_AARCH64="
neon_dotprod
neon_i8mm
sve
+ sve2
"
ARCH_EXT_LIST_X86="
@@ -376,6 +377,7 @@ CMDLINE_SELECT="
install_libs
install_srcs
debug
+ profile
gprof
gcov
pic
@@ -659,6 +661,7 @@ process_toolchain() {
check_add_cflags -Wmissing-declarations
check_add_cflags -Wmissing-prototypes
check_add_cflags -Wshadow
+ check_add_cflags -Wstrict-prototypes
check_add_cflags -Wuninitialized
check_add_cflags -Wunreachable-code-aggressive
check_add_cflags -Wunused
@@ -677,6 +680,10 @@ process_toolchain() {
# would be needed to apply this only to test/*.cc.
check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
+ # Do not allow implicit vector type conversions on Clang builds (this
+ # is already the default on GCC builds).
+ check_add_cflags -flax-vector-conversions=none
+
# Quiet gcc 6 vs 7 abi warnings:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
if enabled arm; then
diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c
index 5fb63e1660..083bd2519d 100644
--- a/media/libvpx/libvpx/examples/resize_util.c
+++ b/media/libvpx/libvpx/examples/resize_util.c
@@ -20,7 +20,7 @@
static const char *exec_name = NULL;
-static void usage() {
+static void usage(void) {
printf("Usage:\n");
printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
exec_name);
diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
index 998e4fb20d..4050c093cd 100644
--- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -1156,12 +1156,13 @@ int main(int argc, const char **argv) {
#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id);
// Don't look for mismatch on top spatial and top temporal layers as they
- // are non reference frames.
+ // are non reference frames. Don't look at frames whose top spatial layer
+ // is dropped.
if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) &&
+ cx_pkt->data.frame
+ .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] &&
!(layer_id.temporal_layer_id > 0 &&
- layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 &&
- cx_pkt->data.frame
- .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) {
+ layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1)) {
test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen);
}
#endif
diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
index 1a0823153b..6e12d668b0 100644
--- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c
+++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
@@ -60,7 +60,7 @@
static const char *exec_name;
-void usage_exit() {
+void usage_exit(void) {
fprintf(stderr,
"Usage: %s <width> <height> <infile> <outfile> "
"<frame> <limit(optional)>\n",
diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template
index 1ee442af3e..6d05162d00 100644
--- a/media/libvpx/libvpx/libs.doxy_template
+++ b/media/libvpx/libvpx/libs.doxy_template
@@ -1223,14 +1223,6 @@ DOT_GRAPH_MAX_NODES = 50
MAX_DOT_GRAPH_DEPTH = 0
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-
-DOT_TRANSPARENT = YES
-
# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10)
diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk
index ff1c569c3b..5964386710 100644
--- a/media/libvpx/libvpx/libs.mk
+++ b/media/libvpx/libvpx/libs.mk
@@ -313,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to detemine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 8
+SO_VERSION_MAJOR := 9
SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 1
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
SHARED_LIB_SUF := .dylib
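
Working through the libtool rules referenced in the comment above (assuming
the standard current/age/revision update rules): the old values MAJOR=8,
MINOR=0, PATCH=1 decode to (c, a, r) = (8, 0, 1). Because this release is ABI
incompatible, interfaces changed, so c1 = c + 1 = 9, r1 = 0, and a1 = 0,
giving MAJOR = c1 - a1 = 9, MINOR = a1 = 0, PATCH = r1 = 0, exactly the values
set in the hunk.
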
diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py
index 1c69740d2b..98ce7b1947 100644
--- a/media/libvpx/libvpx/test/android/get_files.py
+++ b/media/libvpx/libvpx/test/android/get_files.py
@@ -38,7 +38,7 @@ def get_file_sha(filename):
buf = file.read(HASH_CHUNK)
return sha_hash.hexdigest()
except IOError:
- print "Error reading " + filename
+ print("Error reading " + filename)
# Downloads a file from a url, and then checks the sha against the passed
# in sha
@@ -67,7 +67,7 @@ try:
getopt.getopt(sys.argv[1:], \
"u:i:o:", ["url=", "input_csv=", "output_dir="])
except:
- print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+ print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
sys.exit(2)
for opt, arg in opts:
@@ -79,7 +79,7 @@ for opt, arg in opts:
local_resource_path = os.path.join(arg)
if len(sys.argv) != 7:
- print "Expects two paths and a url!"
+ print("Expects two paths and a url!")
exit(1)
if not os.path.isdir(local_resource_path):
@@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb")
# Our 'csv' file uses multiple spaces as a delimiter, python's
# csv class only uses single character delimiters, so we convert them below
-file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \
for line in file_list_csv), delimiter = ' ')
file_shas = []
@@ -104,15 +104,16 @@ for row in file_list_reader:
file_list_csv.close()
# Download files, only if they don't already exist and have correct shas
-for filename, sha in itertools.izip(file_names, file_shas):
+for filename, sha in zip(file_names, file_shas):
+ filename = filename.lstrip('*')
path = os.path.join(local_resource_path, filename)
if os.path.isfile(path) \
and get_file_sha(path) == sha:
- print path + ' exists, skipping'
+ print(path + ' exists, skipping')
continue
for retry in range(0, ftp_retries):
- print "Downloading " + path
+ print("Downloading " + path)
if not download_and_check_sha(url, filename, sha):
- print "Sha does not match, retrying..."
+ print("Sha does not match, retrying...")
else:
break
diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc
index ede9c0ba8c..7816912ff7 100644
--- a/media/libvpx/libvpx/test/avg_test.cc
+++ b/media/libvpx/libvpx/test/avg_test.cc
@@ -719,6 +719,15 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(1024, &vp9_block_error_fp_neon)));
#endif // HAVE_NEON
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, BlockErrorTestFP,
+ ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve),
+ make_tuple(64, &vp9_block_error_fp_sve),
+ make_tuple(256, &vp9_block_error_fp_sve),
+ make_tuple(1024, &vp9_block_error_fp_sve)));
+#endif // HAVE_SVE
+
#if HAVE_MSA
INSTANTIATE_TEST_SUITE_P(
MSA, AverageTest,
diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h
index c7e8f54847..179ccdf011 100644
--- a/media/libvpx/libvpx/test/codec_factory.h
+++ b/media/libvpx/libvpx/test/codec_factory.h
@@ -164,7 +164,9 @@ const libvpx_test::VP8CodecFactory kVP8;
&libvpx_test::kVP8)), \
__VA_ARGS__))
#else
-#define VP8_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "")
#endif // CONFIG_VP8
/*
@@ -259,7 +261,9 @@ const libvpx_test::VP9CodecFactory kVP9;
&libvpx_test::kVP9)), \
__VA_ARGS__))
#else
-#define VP9_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "")
#endif // CONFIG_VP9
} // namespace libvpx_test
diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc
index ffd5c41c63..11f7625137 100644
--- a/media/libvpx/libvpx/test/convolve_test.cc
+++ b/media/libvpx/libvpx/test/convolve_test.cc
@@ -1218,6 +1218,24 @@ WRAP(convolve8_neon, 12)
WRAP(convolve8_avg_neon, 12)
#endif // HAVE_NEON
+#if HAVE_SVE
+WRAP(convolve8_horiz_sve, 8)
+WRAP(convolve8_avg_horiz_sve, 8)
+WRAP(convolve8_horiz_sve, 10)
+WRAP(convolve8_avg_horiz_sve, 10)
+WRAP(convolve8_horiz_sve, 12)
+WRAP(convolve8_avg_horiz_sve, 12)
+#endif // HAVE_SVE
+
+#if HAVE_SVE2
+WRAP(convolve8_vert_sve2, 8)
+WRAP(convolve8_avg_vert_sve2, 8)
+WRAP(convolve8_vert_sve2, 10)
+WRAP(convolve8_avg_vert_sve2, 10)
+WRAP(convolve8_vert_sve2, 12)
+WRAP(convolve8_avg_vert_sve2, 12)
+#endif // HAVE_SVE2
+
WRAP(convolve_copy_c, 8)
WRAP(convolve_avg_c, 8)
WRAP(convolve8_horiz_c, 8)
@@ -1438,6 +1456,74 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
::testing::ValuesIn(kArrayConvolve_neon_dotprod));
#endif // HAVE_NEON_DOTPROD
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8,
+ wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8,
+ wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+ wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+ wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+ wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+ wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10,
+ wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+ wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+ wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+ 10);
+const ConvolveFunctions convolve12_sve(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+ wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12,
+ wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+ wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+ wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+ 12);
+
+const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve),
+ ALL_SIZES(convolve10_sve),
+ ALL_SIZES(convolve12_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sve));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE
+
+#if HAVE_SVE2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve2(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
+ wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8,
+ wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+ wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+ wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+ wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve2(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10,
+ wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_c_10,
+ wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+ wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+ 10);
+const ConvolveFunctions convolve12_sve2(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12,
+ wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_c_12,
+ wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+ wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+ 12);
+
+const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2),
+ ALL_SIZES(convolve10_sve2),
+ ALL_SIZES(convolve12_sve2) };
+INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sve2));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE2
+
#if HAVE_NEON_I8MM
const ConvolveFunctions convolve8_neon_i8mm(
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc
index 508083673a..ca3b17a5d5 100644
--- a/media/libvpx/libvpx/test/encode_api_test.cc
+++ b/media/libvpx/libvpx/test/encode_api_test.cc
@@ -8,7 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <cassert>
#include <climits>
+#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <new>
@@ -44,6 +46,49 @@ bool IsVP9(vpx_codec_iface_t *iface) {
0;
}
+void *Memset16(void *dest, int val, size_t length) {
+ uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest);
+ for (size_t i = 0; i < length; i++) {
+ *dest16++ = val;
+ }
+ return dest;
+}
+
+vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt,
+ unsigned int width, unsigned int height) {
+ assert(fmt != VPX_IMG_FMT_NV12);
+ if (bit_depth > VPX_BITS_8) {
+ fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+ }
+ vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1);
+ if (!image) return image;
+
+ const int val = 1 << (bit_depth - 1);
+ const unsigned int uv_h =
+ (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift;
+ const unsigned int uv_w =
+ (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift;
+ if (bit_depth > VPX_BITS_8) {
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ Memset16(image->planes[0] + i * image->stride[0], val, image->d_w);
+ }
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ Memset16(image->planes[1] + i * image->stride[1], val, uv_w);
+ Memset16(image->planes[2] + i * image->stride[2], val, uv_w);
+ }
+ } else {
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ memset(image->planes[0] + i * image->stride[0], val, image->d_w);
+ }
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ memset(image->planes[1] + i * image->stride[1], val, uv_w);
+ memset(image->planes[2] + i * image->stride[2], val, uv_w);
+ }
+ }
+
+ return image;
+}
+
TEST(EncodeAPI, InvalidParams) {
uint8_t buf[1] = { 0 };
vpx_image_t img;
@@ -198,7 +243,51 @@ TEST(EncodeAPI, RandomPixelsVp8) {
ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
// Generate random frame data and encode
- uint8_t img[1280 * 720 * 3 / 2];
+ libvpx_test::RandomVideoSource video;
+ video.SetSize(cfg.g_w, cfg.g_h);
+ video.SetImageFormat(VPX_IMG_FMT_I420);
+ video.Begin();
+ ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+ /*flags=*/0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+
+ // Destroy libvpx encoder
+ vpx_codec_destroy(&enc);
+}
+
+TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) {
+ // Initialize libvpx encoder
+ vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+ vpx_codec_enc_cfg_t cfg;
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ cfg.g_threads = 1;
+ cfg.g_profile = 0;
+ cfg.g_w = 1;
+ cfg.g_h = 64;
+ cfg.g_bit_depth = VPX_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = 1000000;
+ cfg.g_pass = VPX_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_dropframe_thresh = 0; // Don't drop frames
+ cfg.rc_resize_allowed = 0;
+ cfg.rc_end_usage = VPX_VBR;
+ cfg.rc_target_bitrate = 10;
+ cfg.rc_min_quantizer = 2;
+ cfg.rc_max_quantizer = 58;
+ cfg.kf_mode = VPX_KF_AUTO;
+ cfg.kf_min_dist = 0;
+ cfg.kf_max_dist = 10000;
+
+ vpx_codec_ctx_t enc;
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK);
+
+ // Generate random frame data and encode
+ uint8_t img[1 * 64 * 3 / 2];
libvpx_test::ACMRandom rng;
for (size_t i = 0; i < sizeof(img); ++i) {
img[i] = rng.Rand8();
@@ -207,13 +296,142 @@ TEST(EncodeAPI, RandomPixelsVp8) {
ASSERT_EQ(
vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img),
&img_wrapper);
- ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY),
+ vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF;
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+ VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK);
+
+ cfg.rc_target_bitrate = 4294967;
+ // Set the scalability mode to L1T3.
+ cfg.ts_number_layers = 3;
+ cfg.ts_periodicity = 4;
+ cfg.ts_layer_id[0] = 0;
+ cfg.ts_layer_id[1] = 2;
+ cfg.ts_layer_id[2] = 1;
+ cfg.ts_layer_id[3] = 2;
+ cfg.ts_rate_decimator[0] = 4;
+ cfg.ts_rate_decimator[1] = 2;
+ cfg.ts_rate_decimator[2] = 1;
+ // Bitrate allocation L0: 50% L1: 20% L2: 30%
+ cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] =
+ 50 * cfg.rc_target_bitrate / 100;
+ cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] =
+ 70 * cfg.rc_target_bitrate / 100;
+ cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] =
+ cfg.rc_target_bitrate;
+ cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+ cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+ ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2),
VPX_CODEC_OK);
+ constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING =
+ VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+ // Layer 2: only reference last frame, no updates
+ // It only depends on layer 0
+ flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF;
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+ VPX_CODEC_OK);
+
// Destroy libvpx encoder
vpx_codec_destroy(&enc);
}
-#endif
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP8Encoder {
+ public:
+ explicit VP8Encoder(int speed) : speed_(speed) {}
+ ~VP8Encoder();
+
+ void Configure(unsigned int threads, unsigned int width, unsigned int height,
+ vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+ void Encode(bool key_frame);
+
+ private:
+ const int speed_;
+ bool initialized_ = false;
+ vpx_codec_enc_cfg_t cfg_;
+ vpx_codec_ctx_t enc_;
+ int frame_index_ = 0;
+ vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP8Encoder::~VP8Encoder() {
+ if (initialized_) {
+ EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+ }
+}
+
+void VP8Encoder::Configure(unsigned int threads, unsigned int width,
+ unsigned int height, vpx_rc_mode end_usage,
+ vpx_enc_deadline_t deadline) {
+ deadline_ = deadline;
+
+ if (!initialized_) {
+ vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+ VPX_CODEC_OK);
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.g_timebase.num = 1;
+ cfg_.g_timebase.den = 1000 * 1000; // microseconds
+ cfg_.g_pass = VPX_RC_ONE_PASS;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = end_usage;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 58;
+ ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+ initialized_ = true;
+ return;
+ }
+
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.rc_end_usage = end_usage;
+ ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+ << vpx_codec_error_detail(&enc_);
+}
+
+void VP8Encoder::Encode(bool key_frame) {
+ const vpx_codec_cx_pkt_t *pkt;
+ vpx_image_t *image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+ ASSERT_NE(image, nullptr);
+ const vpx_enc_frame_flags_t flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+ ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_),
+ VPX_CODEC_OK);
+ ++frame_index_;
+ vpx_codec_iter_t iter = nullptr;
+ while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+ if (key_frame) {
+ ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+ }
+ }
+ vpx_img_free(image);
+}
+
+// This is the reproducer testcase for crbug.com/324459561. However,
+// just running this test is not enough to reproduce the bug. We also
+// need to send signals to the test.
+TEST(EncodeAPI, Chromium324459561) {
+ VP8Encoder encoder(-12);
+
+ encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME);
+
+ encoder.Encode(true);
+ encoder.Encode(true);
+ encoder.Encode(true);
+
+ encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME);
+}
+#endif // CONFIG_VP8_ENCODER
// Set up 2 spatial streams with 2 temporal layers per stream, and generate
// invalid configuration by setting the temporal layer rate allocation
@@ -499,6 +717,131 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
}
}
+TEST(EncodeAPI, ConfigResizeBiggerAfterInit) {
+ for (const auto *iface : kCodecIfaces) {
+ SCOPED_TRACE(vpx_codec_iface_name(iface));
+ vpx_codec_enc_cfg_t cfg;
+ vpx_codec_ctx_t enc;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+ EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) {
+ for (const auto *iface : kCodecIfaces) {
+ SCOPED_TRACE(vpx_codec_iface_name(iface));
+ vpx_codec_enc_cfg_t cfg;
+ vpx_codec_ctx_t enc;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+ EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+ EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc));
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1080;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, PtsSmallerThanInitialPts) {
+ for (const auto *iface : kCodecIfaces) {
+ // Initialize libvpx encoder.
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ // Create input image.
+ vpx_image_t *const image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ // Encode frame.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+ // pts (10) is smaller than the initial pts (12).
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+
+ // Free resources.
+ vpx_img_free(image);
+ ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, PtsOrDurationTooBig) {
+ for (const auto *iface : kCodecIfaces) {
+ // Initialize libvpx encoder.
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ // Create input image.
+ vpx_image_t *const image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ // Encode frame.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+#if ULONG_MAX > INT64_MAX
+ // duration is too big.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ // pts, when converted to ticks, is too big.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#if ULONG_MAX > INT64_MAX
+ // duration is too big.
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+ // pts + duration is too big.
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ // pts + duration, when converted to ticks, is too big.
+#if ULONG_MAX > INT64_MAX
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+
+ // Free resources.
+ vpx_img_free(image);
+ ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
#if CONFIG_VP9_ENCODER
// Frame size needed to trigger the overflow exceeds the max buffer allowed on
// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY
@@ -528,28 +871,16 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
}
#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
-vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) {
- vpx_image_t *image =
- vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1);
- if (!image) return image;
-
- for (unsigned int i = 0; i < image->d_h; ++i) {
- memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
- }
- const unsigned int uv_h = (image->d_h + 1) / 2;
- const unsigned int uv_w = (image->d_w + 1) / 2;
- for (unsigned int i = 0; i < uv_h; ++i) {
- memset(image->planes[1] + i * image->stride[1], 128, uv_w);
- memset(image->planes[2] + i * image->stride[2], 128, uv_w);
- }
-
- return image;
-}
-
// Emulates the WebCodecs VideoEncoder interface.
class VP9Encoder {
public:
- explicit VP9Encoder(int speed) : speed_(speed) {}
+ explicit VP9Encoder(int speed)
+ : speed_(speed), bit_depth_(VPX_BITS_8), fmt_(VPX_IMG_FMT_I420) {}
+ // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set.
+ // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before
+ // passing the image format to vpx_img_alloc().
+ VP9Encoder(int speed, vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt)
+ : speed_(speed), bit_depth_(bit_depth), fmt_(fmt) {}
~VP9Encoder();
void Configure(unsigned int threads, unsigned int width, unsigned int height,
@@ -558,6 +889,8 @@ class VP9Encoder {
private:
const int speed_;
+ const vpx_bit_depth_t bit_depth_;
+ const vpx_img_fmt_t fmt_;
bool initialized_ = false;
vpx_codec_enc_cfg_t cfg_;
vpx_codec_ctx_t enc_;
@@ -577,12 +910,22 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
deadline_ = deadline;
if (!initialized_) {
+ ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0);
+ const bool high_bit_depth = bit_depth_ > VPX_BITS_8;
+ const bool is_420 = fmt_ == VPX_IMG_FMT_I420;
vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
VPX_CODEC_OK);
cfg_.g_threads = threads;
+ // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3,
+ // all other subsampling formats are allowed. In profiles 0 and 1, only bit
+ // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are
+ // allowed.
+ cfg_.g_profile = 2 * high_bit_depth + !is_420;
cfg_.g_w = width;
cfg_.g_h = height;
+ cfg_.g_bit_depth = bit_depth_;
+ cfg_.g_input_bit_depth = bit_depth_;
cfg_.g_timebase.num = 1;
cfg_.g_timebase.den = 1000 * 1000; // microseconds
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -590,7 +933,10 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
cfg_.rc_end_usage = end_usage;
cfg_.rc_min_quantizer = 2;
cfg_.rc_max_quantizer = 58;
- ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+ ASSERT_EQ(
+ vpx_codec_enc_init(&enc_, iface, &cfg_,
+ high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0),
+ VPX_CODEC_OK);
ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
initialized_ = true;
return;
@@ -606,13 +952,13 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
void VP9Encoder::Encode(bool key_frame) {
const vpx_codec_cx_pkt_t *pkt;
- vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h);
+ vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h);
ASSERT_NE(image, nullptr);
const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
ASSERT_EQ(
vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_),
VPX_CODEC_OK);
- frame_index_++;
+ ++frame_index_;
vpx_codec_iter_t iter = nullptr;
while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
@@ -944,6 +1290,28 @@ TEST(EncodeAPI, Buganizer311294795) {
encoder.Encode(false);
encoder.Encode(false);
}
+
+TEST(EncodeAPI, Buganizer317105128) {
+ VP9Encoder encoder(-9);
+ encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY);
+ encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer319964497) {
+ VP9Encoder encoder(7);
+ encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/true);
+ encoder.Encode(/*key_frame=*/true);
+ encoder.Encode(/*key_frame=*/false);
+ encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/false);
+ encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/false);
+}
+
#endif // CONFIG_VP9_ENCODER
} // namespace
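
For reference, the profile formula in VP9Encoder::Configure above,
cfg_.g_profile = 2 * high_bit_depth + !is_420, enumerates the four VP9
profiles exactly as the accompanying comment describes: (8-bit, 4:2:0) gives
profile 0, (8-bit, other subsampling) gives 1, (10/12-bit, 4:2:0) gives 2,
and (10/12-bit, other subsampling) gives 3.
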
diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc
index eea5647a78..6306e4f2ca 100644
--- a/media/libvpx/libvpx/test/frame_size_tests.cc
+++ b/media/libvpx/libvpx/test/frame_size_tests.cc
@@ -193,7 +193,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
// size or almost 1 gig of memory.
// In total the allocations will exceed 2GiB which may cause a failure with
// mingw + wine, use a smaller size in that case.
-#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
+#if defined(_WIN32) && !defined(_WIN64)
video.SetSize(4096, 3072);
#else
video.SetSize(4096, 4096);
diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc
index f66f00b5c1..353c5043eb 100644
--- a/media/libvpx/libvpx/test/init_vpx_test.cc
+++ b/media/libvpx/libvpx/test/init_vpx_test.cc
@@ -57,6 +57,9 @@ void init_vpx_test() {
if (!(caps & HAS_SVE)) {
append_negative_gtest_filter(":SVE.*:SVE/*");
}
+ if (!(caps & HAS_SVE2)) {
+ append_negative_gtest_filter(":SVE2.*:SVE2/*");
+ }
#elif VPX_ARCH_ARM
const int caps = arm_cpu_caps();
if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc
index 20ad2229b4..f27bd7ebbc 100644
--- a/media/libvpx/libvpx/test/resize_test.cc
+++ b/media/libvpx/libvpx/test/resize_test.cc
@@ -7,8 +7,6 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
-
#include <climits>
#include <vector>
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -598,6 +596,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
@@ -613,12 +612,12 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
}
}
-#if CONFIG_VP9_DECODER
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
#else
- printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+ GTEST_SKIP()
+ << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
#endif
}
@@ -669,7 +668,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
  ASSERT_EQ(resize_count, 4) << "Resizing should occur four times.";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
#else
- printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+ GTEST_SKIP()
+ << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
#endif
}
diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc
index d3c76a34d2..57037f1e30 100644
--- a/media/libvpx/libvpx/test/sum_squares_test.cc
+++ b/media/libvpx/libvpx/test/sum_squares_test.cc
@@ -119,6 +119,13 @@ INSTANTIATE_TEST_SUITE_P(
&vpx_sum_squares_2d_i16_neon)));
#endif // HAVE_NEON
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, SumSquaresTest,
+ ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+ &vpx_sum_squares_2d_i16_sve)));
+#endif // HAVE_SVE
+
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
SSE2, SumSquaresTest,
diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc
index b8320e9ceb..5cf6a5fb8e 100644
--- a/media/libvpx/libvpx/test/variance_test.cc
+++ b/media/libvpx/libvpx/test/variance_test.cc
@@ -29,6 +29,9 @@ namespace {
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
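+// Signature shared by the vpx_get<W>x<H>var functions under test, which
+// report both the sum of squared differences (sse) and the sum of
+// differences (sum).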
+typedef void (*GetVarianceFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int *sum);
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
using libvpx_test::ACMRandom;
@@ -63,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) {
* Our codebase calculates the "diff" value in the variance algorithm by
* (src - ref).
*/
-static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
- int l2h, int src_stride, int ref_stride,
- uint32_t *sse_ptr, bool use_high_bit_depth_,
- vpx_bit_depth_t bit_depth) {
- int64_t se = 0;
- uint64_t sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
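+// Shared helper: accumulates the sum of differences (se) and the sum of
+// squared differences (sse) over a w x h block, then rounds both for the
+// given bit depth.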
+static void variance(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int w, int h, bool use_high_bit_depth_,
+ uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) {
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
- int diff;
+ int diff = 0;
if (!use_high_bit_depth_) {
diff = src[y * src_stride + x] - ref[y * ref_stride + x];
- se += diff;
- sse += diff * diff;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
- se += diff;
- sse += diff * diff;
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+ se_long += diff;
+ sse_long += diff * diff;
}
}
- RoundHighBitDepth(bit_depth, &se, &sse);
- *sse_ptr = static_cast<uint32_t>(sse);
+
+ RoundHighBitDepth(bit_depth, &se_long, &sse_long);
+
+ *sse = sse_long;
+ *se = se_long;
+}
+
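+// Reference implementation for the GetVariance functions under test: reports
+// sse and sum separately instead of combining them into a variance.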
+static void get_variance_ref(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int l2w,
+ int l2h, bool use_high_bit_depth_, uint32_t *sse,
+ int *se, vpx_bit_depth_t bit_depth) {
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
+ variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+ &sse_long, &se_long, bit_depth);
+
+ *sse = static_cast<uint32_t>(sse_long);
+ *se = static_cast<int>(se_long);
+}
+
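+// Reference variance: sse - (se * se) / (w * h), built on the shared helper.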
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+ int l2h, int src_stride, int ref_stride,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
+ vpx_bit_depth_t bit_depth) {
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
+ variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+ &sse_long, &se_long, bit_depth);
+
+ *sse_ptr = static_cast<uint32_t>(sse_long);
return static_cast<uint32_t>(
- sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+ sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h)));
}
/* The subpel reference functions differ from the codec version in one aspect:
@@ -337,6 +370,9 @@ class MainTestClass
void OneQuarterTest();
void SpeedTest();
+ // GetVariance tests
+ void RefTestGetVar();
+
// MSE/SSE tests
void RefTestMse();
void RefTestSse();
@@ -493,6 +529,35 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
}
////////////////////////////////////////////////////////////////////////////////
+// Tests related to GetVariance.
+template <typename GetVarianceFunctionType>
+void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() {
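+  // Fill src/ref with random pixels and compare the function under test
+  // against the C reference for both sse and sum.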
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+ unsigned int sse1, sse2;
+ int sum1, sum2;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(
+ params_.func(src_, stride, ref_, stride, &sse1, &sum1));
+ get_variance_ref(src_, stride, ref_, stride, params_.log2width,
+ params_.log2height, use_high_bit_depth(), &sse2, &sum2,
+ params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(sum1, sum2) << "Error at test index: " << i;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
// Tests related to MSE / SSE.
template <typename FunctionType>
@@ -766,6 +831,7 @@ void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
typedef MainTestClass<vpx_variance_fn_t> VpxMseTest;
typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxGetVarianceTest;
typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest;
typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest;
@@ -779,6 +845,7 @@ TEST_P(VpxVarianceTest, Ref) { RefTest(); }
TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -818,6 +885,16 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 3, &vpx_variance4x8_c),
VarianceParams(2, 2, &vpx_variance4x4_c)));
+typedef TestParams<GetVarianceFunc> GetVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+ C, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c),
+ GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c),
+ GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c)));
+
typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams;
INSTANTIATE_TEST_SUITE_P(
C, VpxSubpelVarianceTest,
@@ -856,6 +933,7 @@ INSTANTIATE_TEST_SUITE_P(
#if CONFIG_VP9_HIGHBITDEPTH
typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxHBDGetVarianceTest;
typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
VpxHBDSubpelAvgVarianceTest;
@@ -865,6 +943,7 @@ TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); }
TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -933,6 +1012,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
INSTANTIATE_TEST_SUITE_P(
+ C, VpxHBDGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
C, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
@@ -1119,6 +1207,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_variance4x4_sse2)));
INSTANTIATE_TEST_SUITE_P(
+ SSE2, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+ GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+ GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
SSE2, VpxSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
@@ -1198,6 +1295,16 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
INSTANTIATE_TEST_SUITE_P(
+ SSE2, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
@@ -1475,6 +1582,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 3, &vpx_variance4x8_neon),
VarianceParams(2, 2, &vpx_variance4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon)));
+
#if HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
NEON_DOTPROD, VpxSseTest,
@@ -1502,6 +1618,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod)));
#endif // HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
@@ -1555,9 +1680,6 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
-// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can
-// be used again.
-#if 0
#if HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
NEON_DOTPROD, VpxHBDMseTest,
@@ -1567,7 +1689,19 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
#endif // HAVE_NEON_DOTPROD
-#endif // 0
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12),
+ MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12),
+ MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10),
+ MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10),
+ MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10)));
+#endif // HAVE_SVE
INSTANTIATE_TEST_SUITE_P(
NEON, VpxHBDVarianceTest,
@@ -1613,6 +1747,28 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8)));
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8)));
+#endif // HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(
NEON, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
@@ -1815,6 +1971,53 @@ INSTANTIATE_TEST_SUITE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12),
+ VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12),
+ VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12),
+ VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12),
+ VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12),
+ VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12),
+ VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12),
+ VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12),
+ VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12),
+ VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12),
+ VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12),
+ VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12),
+ VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12),
+ VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10),
+ VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10),
+ VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10),
+ VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10),
+ VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10),
+ VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10),
+ VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10),
+ VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10),
+ VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10),
+ VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10),
+ VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10),
+ VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10),
+ VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10),
+ VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8),
+ VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8),
+ VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8),
+ VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8),
+ VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8),
+ VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8),
+ VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8),
+ VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8),
+ VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8),
+ VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8),
+ VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8),
+ VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8),
+ VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE
+
#if HAVE_MSA
INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_msa));
@@ -1846,6 +2049,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_variance4x4_msa)));
INSTANTIATE_TEST_SUITE_P(
+ MSA, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+ GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+ GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa)));
+
+INSTANTIATE_TEST_SUITE_P(
MSA, VpxSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
@@ -1908,6 +2120,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 2, &vpx_variance8x4_vsx),
VarianceParams(2, 3, &vpx_variance4x8_vsx),
VarianceParams(2, 2, &vpx_variance4x4_vsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+ VSX, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+ GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+ GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx)));
#endif // HAVE_VSX
#if HAVE_MMI
diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h
index 2194126f1f..2c035910db 100644
--- a/media/libvpx/libvpx/test/video_source.h
+++ b/media/libvpx/libvpx/test/video_source.h
@@ -236,7 +236,6 @@ class RandomVideoSource : public DummyVideoSource {
RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
: rnd_(seed), seed_(seed) {}
- protected:
// Reset the RNG to get a matching stream for the second pass
void Begin() override {
frame_ = 0;
@@ -244,6 +243,7 @@ class RandomVideoSource : public DummyVideoSource {
FillFrame();
}
+ protected:
  // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
  // than holding previous frames to encourage keyframes to be inserted.
void FillFrame() override {
diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc
index aee27af66e..d47ed298fe 100644
--- a/media/libvpx/libvpx/test/vp8_datarate_test.cc
+++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc
@@ -14,7 +14,7 @@
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
namespace {
@@ -260,6 +260,27 @@ class DatarateTestLarge
<< " The datarate for the file missed the target!";
}
+ virtual void MultiThreadsPSNRTest() {
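+    // Four-thread CBR encode with PSNR packet generation enabled; verify the
+    // resulting datarate stays within a factor of two of the target.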
+ denoiser_on_ = 0;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_threads = 4;
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, 30);
+ cfg_.rc_target_bitrate = 1000;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5)
+ << " The datarate for the file exceeds the target!";
+
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0)
+ << " The datarate for the file missed the target!";
+ }
+
vpx_codec_pts_t last_pts_;
int64_t bits_in_buffer_model_;
double timebase_;
@@ -324,6 +345,8 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
DropFramesMultiThreadsTest();
}
+TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); }
+
TEST_P(DatarateTestRealTime, RegionOfInterest) {
denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
index 50478f7635..d87fef5a46 100644
--- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -149,9 +149,16 @@ class Vp8RcInterfaceTest
return;
}
int qp;
+ libvpx::UVDeltaQP uv_delta_qp;
encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
ASSERT_EQ(rc_api_->GetQP(), qp);
+ uv_delta_qp = rc_api_->GetUVDeltaQP();
+      // The UV delta QPs are only set for screen content.
+ if (!rc_cfg_.is_screen) {
+ ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0);
+ ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0);
+ }
} else {
num_drops_++;
}
diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc
index 0645341ac1..c5ddcd58ab 100644
--- a/media/libvpx/libvpx/test/vp9_block_error_test.cc
+++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc
@@ -215,4 +215,13 @@ const BlockErrorParam neon_block_error_tests[] = {
INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
::testing::ValuesIn(neon_block_error_tests));
#endif // HAVE_NEON
+
+#if HAVE_SVE
+const BlockErrorParam sve_block_error_tests[] = { make_tuple(
+ &BlockError8BitWrapper<vp9_block_error_sve>,
+ &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) };
+
+INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest,
+ ::testing::ValuesIn(sve_block_error_tests));
+#endif // HAVE_SVE
} // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
index 33fa05c65c..5c23a5b0d5 100644
--- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
@@ -10,115 +10,78 @@
#include <cstdint>
#include <new>
+#include <memory>
+
+#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/util.h"
#include "test/yuv_video_source.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#if CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
#include "vp9/simple_encode.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_image.h"
#include "vpx/vpx_tpl.h"
#include "vpx_dsp/vpx_dsp_common.h"
namespace {
-constexpr int kModelMagicNumber = 51396;
-constexpr uintptr_t PrivMagicNumber = 5566;
-constexpr int kFrameNum = 5;
-constexpr int kFrameNumGOP = 30;
-constexpr int kFrameNumGOPShort = 4;
-constexpr int kLosslessCodingIndex = 2;
-constexpr int kFixedGOPSize = 9;
-// The range check in vp9_cx_iface.c shows that the max
-// lag in buffer is MAX_LAG_BUFFERS (25):
-// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
-constexpr int kMaxLagInFrames = 25;
-constexpr int kDefaultMinGfInterval = 4;
-constexpr int kDefaultMaxGfInterval = 16;
-// The active gf interval might change for each GOP
-// See function "get_active_gf_inverval_range".
-// The numbers below are from manual inspection.
-constexpr int kReadMinGfInterval = 5;
-constexpr int kReadMaxGfInterval = 13;
-const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
-const double kPsnrThreshold = 30.4;
-
-struct ToyRateCtrl {
- int magic_number;
- int coding_index;
-
- int gop_global_index;
- int frames_since_key;
- int show_index;
+constexpr int kFrameNum = 10;
+constexpr int kFixedGOPSize = 10;
+constexpr int kKeyframeQp = 10;
+constexpr int kLeafQp = 40;
+constexpr int kArfQp = 15;
+
+// Simple external rate controller for testing.
+class RateControllerForTest {
+ public:
+ RateControllerForTest() : current_gop_(-1) {}
+ ~RateControllerForTest() {}
+
+ void StartNextGop() { ++current_gop_; }
+
+ vpx_rc_gop_decision_t GetCurrentGop() const {
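+    // Only the first GOP starts with a key frame; every GOP uses an alt-ref
+    // and has a fixed length of kFixedGOPSize coding frames.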
+ vpx_rc_gop_decision_t gop_decision;
+ gop_decision.use_key_frame = current_gop_ == 0 ? 1 : 0;
+ gop_decision.use_alt_ref = 1;
+ gop_decision.gop_coding_frames = kFixedGOPSize;
+ return gop_decision;
+ }
+
+ int CalculateFrameDecision(int frame_index) {
+ EXPECT_LE(frame_index, kFixedGOPSize);
+ if (current_gop_ == 0 && frame_index == 0) {
+ // Key frame, first frame in the first GOP.
+ return kKeyframeQp;
+ } else if (frame_index == 1) {
+      // ARF: this test always codes an alt-ref frame.
+ return kArfQp;
+ } else {
+ return kLeafQp;
+ }
+ }
+ int current_gop_;
};
-vpx_rc_status_t rc_create_model(void *priv,
- const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->coding_index = -1;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 352);
- EXPECT_EQ(ratectrl_config->frame_height, 288);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop(void *priv,
- const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- toy_rate_ctrl->show_index = 0;
- toy_rate_ctrl->coding_index = 0;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 640);
- EXPECT_EQ(ratectrl_config->frame_height, 360);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop_short(
- void *priv, const vpx_rc_config_t *ratectrl_config,
+// Callbacks used in this test.
+vpx_rc_status_t rc_test_create_model(
+ void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/,
vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- toy_rate_ctrl->show_index = 0;
- toy_rate_ctrl->coding_index = 0;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 352);
- EXPECT_EQ(ratectrl_config->frame_height, 288);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
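+  // Ownership of the model passes to the encoder; it is freed again in
+  // rc_delete_model().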
+ std::unique_ptr<RateControllerForTest> test_controller(
+ new RateControllerForTest());
+ *rate_ctrl_model_ptr = test_controller.release();
return VPX_RC_OK;
}
-vpx_rc_status_t rc_send_firstpass_stats(
- vpx_rc_model_t rate_ctrl_model,
+vpx_rc_status_t rc_test_send_firstpass_stats(
+ vpx_rc_model_t /*rate_ctrl_model*/,
const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
EXPECT_EQ(first_pass_stats->num_frames, kFrameNum);
for (int i = 0; i < first_pass_stats->num_frames; ++i) {
EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
@@ -126,37 +89,8 @@ vpx_rc_status_t rc_send_firstpass_stats(
return VPX_RC_OK;
}
-vpx_rc_status_t rc_send_firstpass_stats_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
- for (int i = 0; i < first_pass_stats->num_frames; ++i) {
- EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_firstpass_stats_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
- for (int i = 0; i < first_pass_stats->num_frames; ++i) {
- EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
- const VpxTplGopStats *tpl_gop_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+vpx_rc_status_t rc_test_send_tpl_gop_stats(
+ vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) {
EXPECT_GT(tpl_gop_stats->size, 0);
for (int i = 0; i < tpl_gop_stats->size; ++i) {
@@ -165,522 +99,38 @@ vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
return VPX_RC_OK;
}
-vpx_rc_status_t rc_get_encodeframe_decision(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
+vpx_rc_status_t rc_test_get_encodeframe_decision(
+ vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- toy_rate_ctrl->coding_index += 1;
-
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- EXPECT_LT(encode_frame_info->show_index, kFrameNum);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 4);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- } else if (encode_frame_info->coding_index >= 2 &&
- encode_frame_info->coding_index < 5) {
- // In the first group of pictures, coding_index and gop_index are equal.
- EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- } else if (encode_frame_info->coding_index == 5) {
- EXPECT_EQ(encode_frame_info->show_index, 4);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 1); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 1); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 4); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2],
- 1); // kRefFrameTypeFuture
- }
- if (encode_frame_info->coding_index == kLosslessCodingIndex) {
- // We should get sse == 0 at rc_update_encodeframe_result()
- frame_decision->q_index = 0;
- } else {
- frame_decision->q_index = 100;
- }
- frame_decision->max_frame_size = 0;
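+  // Map the frame's position within the GOP to one of the fixed test QPs.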
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ frame_decision->q_index =
+ test_controller->CalculateFrameDecision(frame_gop_index);
return VPX_RC_OK;
}
-vpx_rc_status_t rc_get_encodeframe_decision_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 3 ||
- encode_frame_info->coding_index == 12 ||
- encode_frame_info->coding_index == 21) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- } else if (encode_frame_info->coding_index == 11 ||
- encode_frame_info->coding_index == 20 ||
- encode_frame_info->coding_index == 29) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- } else if (encode_frame_info->coding_index >= 30) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 3);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 4) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 3);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
- EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames =
- VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
- gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 3 coding frames, no alt ref.
-// The second GOP has 1 coding frame, no alt ref.
-vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
- gop_decision->use_alt_ref = 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 4 coding frames. Use alt ref.
-// The second GOP only contains the overlay frame of the first GOP's alt ref
-// frame.
-vpx_rc_status_t rc_get_gop_decision_short_overlay(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
- gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 1 GOP.
-// The GOP has 4 coding frames. Do not use alt ref.
-vpx_rc_status_t rc_get_gop_decision_short_no_arf(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
- gop_decision->use_alt_ref = 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
- EXPECT_EQ(encode_frame_result->sse, 0);
- }
- if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
- EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
- } else {
- EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_default_frame_rdmult(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- *rdmult = VPX_DEFAULT_RDMULT;
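+// Advances the test controller to the next GOP and reports its structure.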
+vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+ vpx_rc_gop_decision_t *gop_decision) {
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ test_controller->StartNextGop();
+ *gop_decision = test_controller->GetCurrentGop();
return VPX_RC_OK;
}
vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- delete toy_rate_ctrl;
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ delete test_controller;
return VPX_RC_OK;
}
class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
public ::testing::Test {
protected:
- ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {}
+ ExtRateCtrlTest()
+ : EncoderTest(&::libvpx_test::kVP9), frame_number_(0),
+ current_frame_qp_(0) {}
~ExtRateCtrlTest() override = default;
@@ -693,287 +143,62 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
::libvpx_test::Encoder *encoder) override {
if (video->frame() == 0) {
vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_QP;
- rc_funcs.create_model = rc_create_model;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
- rc_funcs.update_encodeframe_result = rc_update_encodeframe_result;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTest, EncodeTest) {
- cfg_.rc_target_bitrate = 24000;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
- kFrameNum));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOP() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
- rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
- rc_funcs.get_gop_decision = rc_get_gop_decision;
- rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
- cfg_.rc_target_bitrate = 4000;
- cfg_.g_lag_in_frames = kMaxLagInFrames;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
- kFrameNumGOP));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShort() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortOverlay
- : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShortOverlay() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision =
- rc_get_encodeframe_decision_gop_short_overlay;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
+ rc_funcs.create_model = rc_test_create_model;
+ rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats;
+ rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats;
+ rc_funcs.get_gop_decision = rc_test_get_gop_decision;
+ rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision;
rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
}
}
-};
-
-TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortNoARF
- : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShortNoARF() override = default;
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
+#if CONFIG_VP9_DECODER
+ bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+ const ::libvpx_test::VideoSource & /*video*/,
+ ::libvpx_test::Decoder *decoder) override {
+ EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+ decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_);
+ return VPX_CODEC_OK == res_dec;
}
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision =
- rc_get_encodeframe_decision_gop_short_no_arf;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+ if (frame_number_ == 0) {
+ // This must be a key frame
+ EXPECT_TRUE((pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0);
+ EXPECT_EQ(current_frame_qp_, kKeyframeQp);
+ ++frame_number_;
+ return;
}
- }
-};
-
-TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
- public ::testing::Test {
- protected:
- ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestRdmult() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void BeginPassHook(unsigned int) override {
- psnr_ = 0.0;
- nframes_ = 0;
- }
-
- void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
- psnr_ += pkt->data.psnr.psnr[0];
- nframes_++;
- }
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ if ((pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) != 0) {
+ // This is an ARF (alt-ref) frame
+ EXPECT_EQ(current_frame_qp_, kArfQp);
+ ++frame_number_;
+ return;
}
- }
- double GetAveragePsnr() const {
- if (nframes_) return psnr_ / nframes_;
- return 0.0;
+ EXPECT_EQ(current_frame_qp_, kLeafQp);
+ ++frame_number_;
}
+#endif // CONFIG_VP9_DECODER
- private:
- double psnr_;
- unsigned int nframes_;
+ int frame_number_;
+ int current_frame_qp_;
};
-TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
- init_flags_ = VPX_CODEC_USE_PSNR;
+TEST_F(ExtRateCtrlTest, EncodeTest) {
+ cfg_.rc_target_bitrate = 4000;
+ cfg_.g_lag_in_frames = 25;
std::unique_ptr<libvpx_test::VideoSource> video;
video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+ "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
+ kFrameNum));
ASSERT_NE(video, nullptr);
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-
- const double psnr = GetAveragePsnr();
- EXPECT_GT(psnr, kPsnrThreshold);
}
} // namespace
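
For orientation, the hook these tests exercise registers an external rate controller with the encoder through a table of callbacks. A minimal registration sketch follows; the my_rc_* callbacks are hypothetical and must implement the signatures declared in vpx/vpx_ext_ratectrl.h, while the field names, the rc_type value, and the control ID are the ones used in the test code above:

    /* Sketch: wiring an external rate controller into a VP9 encoder
     * instance `codec` (error handling elided). */
    vpx_rc_funcs_t rc_funcs = {};
    rc_funcs.rc_type = VPX_RC_GOP_QP; /* external GOP + QP decisions */
    rc_funcs.create_model = my_rc_create_model;
    rc_funcs.send_firstpass_stats = my_rc_send_firstpass_stats;
    rc_funcs.send_tpl_gop_stats = my_rc_send_tpl_gop_stats;
    rc_funcs.get_gop_decision = my_rc_get_gop_decision;
    rc_funcs.get_encodeframe_decision = my_rc_get_encodeframe_decision;
    rc_funcs.delete_model = my_rc_delete_model;
    vpx_codec_control(&codec, VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
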
diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
index f7be47542c..a6c7563348 100644
--- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -9,6 +9,7 @@
*/
#include "vp9/ratectrl_rtc.h"
+#include <climits>
#include <fstream> // NOLINT
#include <string>
@@ -19,6 +20,8 @@
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/video_source.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/bitops.h"
diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc
index 049a10a617..a5a18a7e9d 100644
--- a/media/libvpx/libvpx/test/vp9_scale_test.cc
+++ b/media/libvpx/libvpx/test/vp9_scale_test.cc
@@ -48,12 +48,11 @@ class ScaleTest : public VpxScaleBase,
}
void RunTest(INTERP_FILTER filter_type) {
- static const int kNumSizesToTest = 20;
+ static const int kNumSizesToTest = 22;
static const int kNumScaleFactorsToTest = 4;
- static const int kSizesToTest[] = {
- 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
- 22, 24, 26, 28, 30, 32, 34, 68, 128, 134
- };
+ static const int kSizesToTest[] = { 1, 2, 3, 4, 6, 8, 10, 12,
+ 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 68, 128, 134 };
static const int kScaleFactors[] = { 1, 2, 3, 4 };
for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
for (int h = 0; h < kNumSizesToTest; ++h) {
diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c
index 5c13781513..5af971f720 100644
--- a/media/libvpx/libvpx/tools_common.c
+++ b/media/libvpx/libvpx/tools_common.c
@@ -26,15 +26,9 @@
#include "vpx/vpx_codec.h"
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
#include <io.h>
#include <fcntl.h>
-
-#ifdef __OS2__
-#define _setmode setmode
-#define _fileno fileno
-#define _O_BINARY O_BINARY
-#endif
#endif
#define LOG_ERROR(label) \
@@ -58,7 +52,7 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
FILE *set_binary_mode(FILE *stream) {
(void)stream;
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
_setmode(_fileno(stream), _O_BINARY);
#endif
return stream;
@@ -96,9 +90,9 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
int w = vpx_img_plane_width(yuv_frame, plane);
const int h = vpx_img_plane_height(yuv_frame, plane);
int r;
- // Assuming that for nv12 we read all chroma data at one time
+ // Assuming that for nv12 we read all chroma data at once
if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
- // Fixing NV12 chroma width it is odd
+ // Fixing NV12 chroma width if it is odd
if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
/* Determine the correct plane based on the image format. The for-loop
* always counts in Y,U,V order, but this may not match the order of
@@ -229,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) {
void vpx_img_write(const vpx_image_t *img, FILE *file) {
int plane;
+ const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane) *
- ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ int w = vpx_img_plane_width(img, plane);
const int h = vpx_img_plane_height(img, plane);
int y;
+ // Assuming that for nv12 we write all chroma data at once
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+ // Fixing NV12 chroma width if it is odd
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
for (y = 0; y < h; ++y) {
- fwrite(buf, 1, w, file);
+ fwrite(buf, bytespp, w, file);
buf += stride;
}
}
@@ -247,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) {
int vpx_img_read(vpx_image_t *img, FILE *file) {
int plane;
+ const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane) *
- ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ int w = vpx_img_plane_width(img, plane);
const int h = vpx_img_plane_height(img, plane);
int y;
+ // Assuming that for nv12 we read all chroma data at once
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+ // Fixing NV12 chroma width if it is odd
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
for (y = 0; y < h; ++y) {
- if (fread(buf, 1, w, file) != (size_t)w) return 0;
+ if (fread(buf, bytespp, w, file) != (size_t)w) return 0;
buf += stride;
}
}
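
For reference, the NV12 guards added above reflect two layout facts: chroma is stored interleaved (U, V, U, V, ...) in plane 1 and there is no plane 2, and a chroma row is rounded up to an even width so the final U/V pair of an odd-width image survives. A small copy helper under the same assumptions (the helper name is illustrative; vpx_img_plane_width()/vpx_img_plane_height() are the helpers from this file):

    #include <string.h>
    #include "tools_common.h" /* vpx_img_plane_width/height */

    /* Illustrative: copy an image's stored planes, honoring the NV12 rules
     * used by vpx_img_read()/vpx_img_write() above. */
    static void copy_img_planes(const vpx_image_t *img, unsigned char *dst) {
      const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
      int plane;
      for (plane = 0; plane < 3; ++plane) {
        const unsigned char *src = img->planes[plane];
        int w = vpx_img_plane_width(img, plane);
        const int h = vpx_img_plane_height(img, plane);
        int y;
        /* NV12: U and V are interleaved in plane 1; plane 2 does not exist. */
        if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
        /* NV12: round an odd chroma width up to keep the last U/V pair. */
        if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
        for (y = 0; y < h; ++y) {
          memcpy(dst, src, (size_t)w * bytespp);
          dst += (size_t)w * bytespp;
          src += img->stride[plane];
        }
      }
    }
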
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
index ee3c281f0f..a54e81084b 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -16,7 +16,7 @@
#include "vpx_ports/mem.h"
static const int8_t vp8_sub_pel_filters[8][8] = {
- { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
+ { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
{ 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
{ 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0, 0, 0 },
diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c
index fc4a3539fd..b9efc0cc1f 100644
--- a/media/libvpx/libvpx/vp8/common/entropy.c
+++ b/media/libvpx/libvpx/vp8/common/entropy.c
@@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177,
p[0] = p[1] = 0;
}
- void init_bit_trees() {
+ void init_bit_trees(void) {
init_bit_tree(cat1, 1);
init_bit_tree(cat2, 2);
init_bit_tree(cat3, 3);
diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
index 71529bdfd8..7c8e083f4f 100644
--- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
+++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
@@ -25,23 +25,19 @@
#include "vp8/common/systemdependent.h"
#if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
#include <unistd.h>
#elif defined(_WIN32)
#include <windows.h>
typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO);
-#elif defined(__OS2__)
-#define INCL_DOS
-#define INCL_DOSSPINLOCK
-#include <os2.h>
#endif
#endif
#if CONFIG_MULTITHREAD
-static int get_cpu_count() {
+static int get_cpu_count(void) {
int core_count = 16;
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
#if defined(_SC_NPROCESSORS_ONLN)
core_count = (int)sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
@@ -49,38 +45,13 @@ static int get_cpu_count() {
#endif
#elif defined(_WIN32)
{
-#if _WIN32_WINNT >= 0x0501
+#if _WIN32_WINNT < 0x0501
+#error _WIN32_WINNT must target Windows XP or newer.
+#endif
SYSTEM_INFO sysinfo;
GetNativeSystemInfo(&sysinfo);
-#else
- PGNSI pGNSI;
- SYSTEM_INFO sysinfo;
-
- /* Call GetNativeSystemInfo if supported or
- * GetSystemInfo otherwise. */
-
- pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
- "GetNativeSystemInfo");
- if (pGNSI != NULL)
- pGNSI(&sysinfo);
- else
- GetSystemInfo(&sysinfo);
-#endif
-
core_count = (int)sysinfo.dwNumberOfProcessors;
}
-#elif defined(__OS2__)
- {
- ULONG proc_id;
- ULONG status;
-
- core_count = 0;
- for (proc_id = 1;; ++proc_id) {
- if (DosGetProcessorStatus(proc_id, &status)) break;
-
- if (status == PROC_ONLINE) core_count++;
- }
- }
#else
/* other platforms */
#endif
diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h
index 1b70ea5dba..2038c000b0 100644
--- a/media/libvpx/libvpx/vp8/common/onyx.h
+++ b/media/libvpx/libvpx/vp8/common/onyx.h
@@ -242,7 +242,7 @@ typedef struct {
#endif
} VP8_CONFIG;
-void vp8_initialize();
+void vp8_initialize(void);
struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf);
void vp8_remove_compressor(struct VP8_COMP **comp);
diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c
index 09a0e2b4b3..102b7ccd54 100644
--- a/media/libvpx/libvpx/vp8/common/rtcd.c
+++ b/media/libvpx/libvpx/vp8/common/rtcd.c
@@ -12,4 +12,4 @@
#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vp8_rtcd() { once(setup_rtcd_internal); }
+void vp8_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h
index 1cfb9fec51..0de75cfde3 100644
--- a/media/libvpx/libvpx/vp8/common/threading.h
+++ b/media/libvpx/libvpx/vp8/common/threading.h
@@ -19,161 +19,57 @@ extern "C" {
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
-/* Thread management macros */
#if defined(_WIN32) && !HAVE_PTHREAD_H
/* Win32 */
-#include <process.h>
#include <windows.h>
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREAD_FUNCTION \
- __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREAD_FUNCTION unsigned int __stdcall
-#endif
-#define THREAD_FUNCTION_RETURN DWORD
-#define THREAD_SPECIFIC_INDEX DWORD
-#define pthread_t HANDLE
-#define pthread_attr_t DWORD
-#define pthread_detach(thread) \
- if (thread != NULL) CloseHandle(thread)
-#define thread_sleep(nms) Sleep(nms)
-#define pthread_cancel(thread) terminate_thread(thread, 0)
-#define ts_key_create(ts_key, destructor) \
- { ts_key = TlsAlloc(); };
-#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
-#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
-#define pthread_self() GetCurrentThreadId()
-
-#elif defined(__OS2__)
-/* OS/2 */
-#define INCL_DOS
-#include <os2.h>
-
-#include <stdlib.h>
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX PULONG
-#define pthread_t TID
-#define pthread_attr_t ULONG
-#define pthread_detach(thread) 0
-#define thread_sleep(nms) DosSleep(nms)
-#define pthread_cancel(thread) DosKillThread(thread)
-#define ts_key_create(ts_key, destructor) \
- DosAllocThreadLocalMemory(1, &(ts_key));
-#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
-#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value))
-#define pthread_self() _gettid()
#else
+/* pthreads */
#ifdef __APPLE__
#include <mach/mach_init.h>
#include <mach/semaphore.h>
#include <mach/task.h>
#include <time.h>
#include <unistd.h>
-
#else
#include <semaphore.h>
#endif
-
-#include <pthread.h>
-/* pthreads */
-/* Nearly everything is already defined */
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX pthread_key_t
-#define ts_key_create(ts_key, destructor) \
- pthread_key_create(&(ts_key), destructor);
#endif
/* Synchronization macros: Win32 and Pthreads */
#if defined(_WIN32) && !HAVE_PTHREAD_H
-#define sem_t HANDLE
-#define pause(voidpara) __asm PAUSE
-#define sem_init(sem, sem_attr1, sem_init_value) \
- (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL)
-#define sem_wait(sem) \
+#define vp8_sem_t HANDLE
+#define vp8_sem_init(sem, pshared, value) \
+ (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL)
+#define vp8_sem_wait(sem) \
(int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE))
-#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
-#define sem_destroy(sem) \
+#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
+#define vp8_sem_destroy(sem) \
if (*sem) ((int)(CloseHandle(*sem)) == TRUE)
#define thread_sleep(nms) Sleep(nms)
-#elif defined(__OS2__)
-typedef struct {
- HEV event;
- HMTX wait_mutex;
- HMTX count_mutex;
- int count;
-} sem_t;
-
-static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
- DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
- value > 0 ? TRUE : FALSE);
- DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
- DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
-
- sem->count = value;
-
- return 0;
-}
-
-static inline int sem_wait(sem_t *sem) {
- DosRequestMutexSem(sem->wait_mutex, -1);
-
- DosWaitEventSem(sem->event, -1);
-
- DosRequestMutexSem(sem->count_mutex, -1);
-
- sem->count--;
- if (sem->count == 0) {
- ULONG post_count;
-
- DosResetEventSem(sem->event, &post_count);
- }
-
- DosReleaseMutexSem(sem->count_mutex);
-
- DosReleaseMutexSem(sem->wait_mutex);
-
- return 0;
-}
-
-static inline int sem_post(sem_t *sem) {
- DosRequestMutexSem(sem->count_mutex, -1);
-
- if (sem->count < 32768) {
- sem->count++;
- DosPostEventSem(sem->event);
- }
-
- DosReleaseMutexSem(sem->count_mutex);
-
- return 0;
-}
-
-static inline int sem_destroy(sem_t *sem) {
- DosCloseEventSem(sem->event);
- DosCloseMutexSem(sem->wait_mutex);
- DosCloseMutexSem(sem->count_mutex);
-
- return 0;
-}
-
-#define thread_sleep(nms) DosSleep(nms)
-
#else
#ifdef __APPLE__
-#define sem_t semaphore_t
-#define sem_init(X, Y, Z) \
- semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
-#define sem_wait(sem) (semaphore_wait(*sem))
-#define sem_post(sem) semaphore_signal(*sem)
-#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
+#define vp8_sem_t semaphore_t
+#define vp8_sem_init(sem, pshared, value) \
+ semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value)
+#define vp8_sem_wait(sem) semaphore_wait(*sem)
+#define vp8_sem_post(sem) semaphore_signal(*sem)
+#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
#else
+#include <errno.h>
#include <unistd.h>
#include <sched.h>
+#define vp8_sem_t sem_t
+#define vp8_sem_init sem_init
+static INLINE int vp8_sem_wait(vp8_sem_t *sem) {
+ int ret;
+ while ((ret = sem_wait(sem)) == -1 && errno == EINTR) {
+ }
+ return ret;
+}
+#define vp8_sem_post sem_post
+#define vp8_sem_destroy sem_destroy
#endif /* __APPLE__ */
/* Not Windows. Assume pthreads */
@@ -194,7 +90,6 @@ static inline int sem_destroy(sem_t *sem) {
#define x86_pause_hint()
#endif
-#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_atomics.h"
static INLINE void vp8_atomic_spin_wait(
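
The rewrite above collapses the old per-platform sem_t shims (and drops the OS/2 branch) into a single vp8_sem_* spelling: Win32 semaphores, Mach semaphores on Apple, and POSIX sem_* elsewhere, where vp8_sem_wait() now restarts automatically if sem_wait() is interrupted by a signal (EINTR). The typical lifecycle, as used by the encoder and decoder threads later in this patch (a sketch; return values unchecked):

    vp8_sem_t work_ready;
    vp8_sem_init(&work_ready, 0 /* not shared */, 0 /* initial count */);
    /* producer thread: */
    vp8_sem_post(&work_ready); /* wake one waiter */
    /* worker thread: */
    vp8_sem_wait(&work_ready); /* blocks; retries on EINTR on POSIX */
    /* teardown: */
    vp8_sem_destroy(&work_ready);
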
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
index 2248345ba2..88f2de024b 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
@@ -428,6 +428,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) {
#if CONFIG_MULTITHREAD
if (setjmp(fb->pbi[0]->common.error.jmp)) {
+ fb->pbi[0]->common.error.setjmp = 0;
vp8_remove_decoder_instances(fb);
vp8_zero(fb->pbi);
vpx_clear_system_state();
@@ -452,6 +453,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) {
/* decoder instance for single thread mode */
remove_decompressor(pbi);
+ fb->pbi[0] = NULL;
return VPX_CODEC_OK;
}
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
index 1070849620..08a60b31b9 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
@@ -14,6 +14,7 @@
#include <assert.h>
#include "vpx_config.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/onyxd.h"
#include "treereader.h"
#include "vp8/common/onyxc_int.h"
@@ -94,8 +95,8 @@ typedef struct VP8D_COMP {
DECODETHREAD_DATA *de_thread_data;
pthread_t *h_decoding_thread;
- sem_t *h_event_start_decoding;
- sem_t h_event_end_decoding;
+ vp8_sem_t *h_event_start_decoding;
+ vp8_sem_t h_event_end_decoding;
/* end of threading data */
#endif
diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c
index 6ccb080cf9..d16284d134 100644
--- a/media/libvpx/libvpx/vp8/decoder/threading.c
+++ b/media/libvpx/libvpx/vp8/decoder/threading.c
@@ -15,6 +15,7 @@
#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/common.h"
#include "vp8/common/threading.h"
#include "vp8/common/loopfilter.h"
@@ -577,10 +578,10 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* signal end of decoding of current thread for current frame */
if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows)
- sem_post(&pbi->h_event_end_decoding);
+ vp8_sem_post(&pbi->h_event_end_decoding);
}
-static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
+static THREADFN thread_decoding_proc(void *p_data) {
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -589,7 +590,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
while (1) {
if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
- if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
+ if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
break;
} else {
@@ -598,16 +599,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
if (setjmp(xd->error_info.jmp)) {
xd->error_info.setjmp = 0;
// Signal the end of decoding for current thread.
- sem_post(&pbi->h_event_end_decoding);
+ vp8_sem_post(&pbi->h_event_end_decoding);
continue;
}
xd->error_info.setjmp = 1;
mt_decode_mb_rows(pbi, xd, ithread + 1);
+ xd->error_info.setjmp = 0;
}
}
}
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
void vp8_decoder_create_threads(VP8D_COMP *pbi) {
@@ -634,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
- if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
+ if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) {
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to initialize semaphore");
}
for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
- if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
+ if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
@@ -650,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
thread_decoding_proc, &pbi->de_thread_data[ithread])) {
- sem_destroy(&pbi->h_event_start_decoding[ithread]);
+ vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]);
break;
}
}
@@ -661,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
/* the remainder of cleanup cases will be handled in
* vp8_decoder_remove_threads(). */
if (pbi->allocated_decoding_thread_count == 0) {
- sem_destroy(&pbi->h_event_end_decoding);
+ vp8_sem_destroy(&pbi->h_event_end_decoding);
}
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to create threads");
@@ -812,16 +814,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
/* allow all threads to exit */
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
- sem_post(&pbi->h_event_start_decoding[i]);
+ vp8_sem_post(&pbi->h_event_start_decoding[i]);
pthread_join(pbi->h_decoding_thread[i], NULL);
}
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
- sem_destroy(&pbi->h_event_start_decoding[i]);
+ vp8_sem_destroy(&pbi->h_event_start_decoding[i]);
}
if (pbi->allocated_decoding_thread_count) {
- sem_destroy(&pbi->h_event_end_decoding);
+ vp8_sem_destroy(&pbi->h_event_end_decoding);
}
vpx_free(pbi->h_decoding_thread);
@@ -883,7 +885,7 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
pbi->decoding_thread_count);
for (i = 0; i < pbi->decoding_thread_count; ++i) {
- sem_post(&pbi->h_event_start_decoding[i]);
+ vp8_sem_post(&pbi->h_event_start_decoding[i]);
}
if (setjmp(xd->error_info.jmp)) {
@@ -893,15 +895,16 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
// the current frame while the main thread starts decoding the next frame,
// which causes a data race.
for (i = 0; i < pbi->decoding_thread_count; ++i)
- sem_wait(&pbi->h_event_end_decoding);
+ vp8_sem_wait(&pbi->h_event_end_decoding);
return -1;
}
xd->error_info.setjmp = 1;
mt_decode_mb_rows(pbi, xd, 0);
+ xd->error_info.setjmp = 0;
for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
- sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+ vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
return 0;
}
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
index 82c48b13a7..d0117897db 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
@@ -7,38 +7,38 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
#include <limits.h>
+#include <stdio.h>
#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "bitstream.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#if CONFIG_MULTITHREAD
-#include "ethreading.h"
-#endif
+
#include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "rdopt.h"
-#include "pickinter.h"
+#include "vp8/common/extend.h"
#include "vp8/common/findnearmv.h"
#include "vp8/common/invtrans.h"
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/threading.h"
+#include "vp8/encoder/bitstream.h"
+#include "vp8/encoder/encodeframe.h"
+#include "vp8/encoder/encodeintra.h"
+#include "vp8/encoder/encodemb.h"
+#include "vp8/encoder/encodemv.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/pickinter.h"
+#include "vp8/encoder/rdopt.h"
+#include "vp8/encoder/segmentation.h"
+#include "vp8_rtcd.h"
#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_timer.h"
-#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-#include "bitstream.h"
+
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
#endif
-#include "encodeframe.h"
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
@@ -773,7 +773,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_post(&cpi->h_event_start_encoding[i]);
+ vp8_sem_post(&cpi->h_event_start_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows;
@@ -806,7 +806,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
}
/* Wait for all the threads to finish. */
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_wait(&cpi->h_event_end_encoding[i]);
+ vp8_sem_wait(&cpi->h_event_end_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c
index e2f8b89d46..98c87d3cbc 100644
--- a/media/libvpx/libvpx/vp8/encoder/ethreading.c
+++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c
@@ -10,6 +10,7 @@
#include <stddef.h>
#include "onyx_int.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/threading.h"
#include "vp8/common/common.h"
#include "vp8/common/extend.h"
@@ -22,27 +23,27 @@
extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x,
int ok_to_skip);
-static THREAD_FUNCTION thread_loopfilter(void *p_data) {
+static THREADFN thread_loopfilter(void *p_data) {
VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
VP8_COMMON *cm = &cpi->common;
while (1) {
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
- if (sem_wait(&cpi->h_event_start_lpf) == 0) {
+ if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) {
/* we're shutting down */
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
vp8_loopfilter_frame(cpi, cm);
- sem_post(&cpi->h_event_end_lpf);
+ vp8_sem_post(&cpi->h_event_end_lpf);
}
}
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
-static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
+static THREADFN thread_encoding_proc(void *p_data) {
int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
@@ -51,7 +52,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
while (1) {
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
- if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
+ if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
const int nsync = cpi->mt_sync_range;
VP8_COMMON *cm = &cpi->common;
int mb_row;
@@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
}
/* Signal that this thread has completed processing its rows. */
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
}
}
/* printf("exit thread %d\n", ithread); */
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
@@ -514,9 +515,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
vpx_malloc(sizeof(pthread_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
- vpx_malloc(sizeof(sem_t) * th_count));
+ vpx_malloc(sizeof(vp8_sem_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
- vpx_malloc(sizeof(sem_t) * th_count));
+ vpx_malloc(sizeof(vp8_sem_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
@@ -538,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
- sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
- sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
+ vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+ vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
ethd->ithread = ithread;
ethd->ptr1 = (void *)cpi;
@@ -554,11 +555,11 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
/* shutdown other threads */
vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
- sem_post(&cpi->h_event_start_encoding[ithread]);
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
pthread_join(cpi->h_encoding_thread[ithread], 0);
- sem_destroy(&cpi->h_event_start_encoding[ithread]);
- sem_destroy(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
}
/* free thread related resources */
@@ -580,8 +581,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
{
LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data;
- sem_init(&cpi->h_event_start_lpf, 0, 0);
- sem_init(&cpi->h_event_end_lpf, 0, 0);
+ vp8_sem_init(&cpi->h_event_start_lpf, 0, 0);
+ vp8_sem_init(&cpi->h_event_end_lpf, 0, 0);
lpfthd->ptr1 = (void *)cpi;
rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd);
@@ -590,14 +591,14 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
/* shutdown other threads */
vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
- sem_post(&cpi->h_event_start_encoding[ithread]);
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
pthread_join(cpi->h_encoding_thread[ithread], 0);
- sem_destroy(&cpi->h_event_start_encoding[ithread]);
- sem_destroy(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
}
- sem_destroy(&cpi->h_event_end_lpf);
- sem_destroy(&cpi->h_event_start_lpf);
+ vp8_sem_destroy(&cpi->h_event_end_lpf);
+ vp8_sem_destroy(&cpi->h_event_start_lpf);
/* free thread related resources */
vpx_free(cpi->h_event_start_encoding);
@@ -627,21 +628,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
int i;
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_post(&cpi->h_event_start_encoding[i]);
- sem_post(&cpi->h_event_end_encoding[i]);
+ vp8_sem_post(&cpi->h_event_start_encoding[i]);
+ vp8_sem_post(&cpi->h_event_end_encoding[i]);
pthread_join(cpi->h_encoding_thread[i], 0);
- sem_destroy(&cpi->h_event_start_encoding[i]);
- sem_destroy(&cpi->h_event_end_encoding[i]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[i]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[i]);
}
- sem_post(&cpi->h_event_start_lpf);
+ vp8_sem_post(&cpi->h_event_start_lpf);
pthread_join(cpi->h_filter_thread, 0);
}
- sem_destroy(&cpi->h_event_end_lpf);
- sem_destroy(&cpi->h_event_start_lpf);
+ vp8_sem_destroy(&cpi->h_event_end_lpf);
+ vp8_sem_destroy(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 0;
/* free thread related resources */
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
index 4e128e3c49..ad01c6fc86 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
@@ -63,7 +63,7 @@
extern int vp8_update_coef_context(VP8_COMP *cpi);
#endif
-extern unsigned int vp8_get_processor_freq();
+extern unsigned int vp8_get_processor_freq(void);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
@@ -267,7 +267,11 @@ static int rescale(int val, int num, int denom) {
int64_t llden = denom;
int64_t llval = val;
- return (int)(llval * llnum / llden);
+ int64_t result = (llval * llnum / llden);
+ if (result <= INT_MAX)
+ return (int)result;
+ else
+ return INT_MAX;
}
void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
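
A worked example (illustrative values) of why the new clamp in rescale() matters, using the starting_buffer_level conversion shown below in this file:

    /* rescale(60000 ms, INT_MAX bits/s, 1000):
     *   64-bit product: 60000 * 2147483647 = 128849018820000  (fits int64_t)
     *   after / 1000:   128849018820                          (> INT_MAX)
     * so the function now returns INT_MAX instead of truncating the cast. */
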
@@ -276,7 +280,10 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
LAYER_CONTEXT *lc = &cpi->layer_context[layer];
lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
- lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
+ if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000)
+ lc->target_bandwidth = INT_MAX;
+ else
+ lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
@@ -1381,7 +1388,10 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) {
LAYER_CONTEXT *lc = &cpi->layer_context[i];
lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i];
- lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+ if (oxcf->target_bitrate[i] > INT_MAX / 1000)
+ lc->target_bandwidth = INT_MAX;
+ else
+ lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
lc->starting_buffer_level = rescale(
(int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000);
@@ -1995,6 +2005,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
#if CONFIG_MULTITHREAD
if (vp8cx_create_encoder_threads(cpi)) {
+ cpi->common.error.setjmp = 0;
vp8_remove_compressor(&cpi);
return 0;
}
@@ -2048,8 +2059,6 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
vp8_loop_filter_init(cm);
- cpi->common.error.setjmp = 0;
-
#if CONFIG_MULTI_RES_ENCODING
/* Calculate # of MBs in a row in lower-resolution level image. */
@@ -2076,6 +2085,8 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
vp8_setup_block_ptrs(&cpi->mb);
vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+ cpi->common.error.setjmp = 0;
+
return cpi;
}
@@ -3172,7 +3183,8 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
#if CONFIG_MULTITHREAD
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
- sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+ /* signal that we have set filter_level */
+ vp8_sem_post(&cpi->h_event_end_lpf);
}
#endif
@@ -4387,11 +4399,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#if CONFIG_MULTITHREAD
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
/* start loopfilter in separate thread */
- sem_post(&cpi->h_event_start_lpf);
+ vp8_sem_post(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 1;
/* wait for the filter_level to be picked so that we can continue with
* stream packing */
- sem_wait(&cpi->h_event_end_lpf);
+ vp8_sem_wait(&cpi->h_event_end_lpf);
} else
#endif
{
@@ -5120,6 +5132,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+#if CONFIG_MULTITHREAD
+ /* wait for the lpf thread to finish */
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
+ vp8_sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
generate_psnr_packet(cpi);
}
@@ -5247,16 +5267,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
#endif
#endif
- cpi->common.error.setjmp = 0;
-
-#if CONFIG_MULTITHREAD
- /* wait for the lpf thread done */
- if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
- sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
- }
-#endif
-
return 0;
}
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
index 1451a27812..bb1518ed7f 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
@@ -20,6 +20,7 @@
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
#include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_pthread.h"
#include "encodemb.h"
#include "vp8/encoder/quantize.h"
#include "vp8/common/entropy.h"
@@ -540,10 +541,10 @@ typedef struct VP8_COMP {
LPFTHREAD_DATA lpf_thread_data;
/* events */
- sem_t *h_event_start_encoding;
- sem_t *h_event_end_encoding;
- sem_t h_event_start_lpf;
- sem_t h_event_end_lpf;
+ vp8_sem_t *h_event_start_encoding;
+ vp8_sem_t *h_event_end_encoding;
+ vp8_sem_t h_event_start_lpf;
+ vp8_sem_t h_event_end_lpf;
#endif
TOKENLIST *tplist;
diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
index fcd4eb04eb..7ba7a308ab 100644
--- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c
+++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
@@ -791,8 +791,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
(int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
one_percent_bits);
} else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) {
- percent_high =
- (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8));
+ if (cpi->total_byte_count > 0) {
+ percent_high = (int)((100 * cpi->bits_off_target) /
+ (cpi->total_byte_count * 8));
+ } else {
+ percent_high = cpi->oxcf.over_shoot_pct;
+ }
}
if (percent_high > cpi->oxcf.over_shoot_pct) {
@@ -1190,10 +1194,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
/* Calculate required scaling factor based on target frame size and
* size of frame produced using previous Q
*/
- if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) {
- /* Case where we would overflow int */
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs)
- << BPER_MB_NORMBITS;
+ if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) {
+ int temp = target_bits_per_frame / cpi->common.MBs;
+ if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+ target_bits_per_mb = INT_MAX;
+ } else {
+ target_bits_per_mb = temp << BPER_MB_NORMBITS;
+ }
} else {
target_bits_per_mb =
(target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
@@ -1534,9 +1541,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
// undershoots significantly, and then we end up dropping every other
// frame because the QP/rate_correction_factor may have been too low
// before the drop and then takes too long to come up.
- if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) {
- target_bits_per_mb = (target_size / cpi->common.MBs)
- << BPER_MB_NORMBITS;
+ if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) {
+ int temp = target_size / cpi->common.MBs;
+ if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+ target_bits_per_mb = INT_MAX;
+ } else {
+ target_bits_per_mb = temp << BPER_MB_NORMBITS;
+ }
} else {
target_bits_per_mb =
(target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
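
Both hunks above apply the same saturating pattern when converting a frame's bit budget into the fixed-point bits-per-macroblock domain. Condensed into one helper, the idiom reads as follows (a sketch; BPER_MB_NORMBITS is the shift constant used above, and num_mbs is assumed positive):

    #include <limits.h>

    /* Sketch: saturating conversion of a frame bit budget to per-MB fixed
     * point; never left-shifts a value that would overflow int. */
    static int saturating_bits_per_mb(int frame_bits, int num_mbs) {
      if (frame_bits > (INT_MAX >> BPER_MB_NORMBITS)) {
        const int per_mb = frame_bits / num_mbs;
        if (per_mb > (INT_MAX >> BPER_MB_NORMBITS)) return INT_MAX;
        return per_mb << BPER_MB_NORMBITS;
      }
      return (frame_bits << BPER_MB_NORMBITS) / num_mbs;
    }
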
diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h
index 47b5be17f1..5223aa2d86 100644
--- a/media/libvpx/libvpx/vp8/encoder/tokenize.h
+++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h
@@ -18,8 +18,6 @@
extern "C" {
#endif
-void vp8_tokenize_initialize();
-
typedef struct {
short Token;
short Extra;
diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
index 1f16cc53d3..2b238c1a97 100644
--- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
@@ -8,6 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
@@ -18,6 +23,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/static_assert.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
#if CONFIG_MULTITHREAD
#include "vp8/encoder/ethreading.h"
@@ -27,8 +33,6 @@
#include "vp8/encoder/firstpass.h"
#include "vp8/common/onyx.h"
#include "vp8/common/common.h"
-#include <stdlib.h>
-#include <string.h>
struct vp8_extracfg {
struct vpx_codec_pkt_list *pkt_list;
@@ -148,7 +152,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, g_profile, 3);
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
- RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
#if CONFIG_REALTIME_ONLY
RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
#elif CONFIG_MULTI_RES_ENCODING
@@ -495,7 +499,10 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
vp8_change_config(ctx->cpi, &ctx->oxcf);
#if CONFIG_MULTITHREAD
- if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR;
+ if (vp8cx_create_encoder_threads(ctx->cpi)) {
+ ctx->cpi->common.error.setjmp = 0;
+ return VPX_CODEC_ERROR;
+ }
#endif
ctx->cpi->common.error.setjmp = 0;
return VPX_CODEC_OK;
@@ -777,9 +784,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
return res;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ vpx_enc_deadline_t deadline) {
int new_qc;
#if !(CONFIG_REALTIME_ONLY)
@@ -788,13 +795,15 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
if (deadline) {
/* Convert duration parameter from stream timebase to microseconds */
- uint64_t duration_us;
-
VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
(TICKS_PER_SEC % 1000000) == 0);
- duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
- (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+ if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) {
+ ERROR("duration is too big");
+ }
+ uint64_t duration_us =
+ duration * (uint64_t)ctx->timestamp_ratio.num /
+ ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
 /* If the deadline is more than the duration this frame is to be shown,
* use good quality mode. Otherwise use realtime mode.
@@ -820,6 +829,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
ctx->oxcf.Mode = new_qc;
vp8_change_config(ctx->cpi, &ctx->oxcf);
}
+ return VPX_CODEC_OK;
}
static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
@@ -894,13 +904,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
- if (!ctx->pts_offset_initialized) {
- ctx->pts_offset = pts_val;
- ctx->pts_offset_initialized = 1;
- }
- pts_val -= ctx->pts_offset;
-
- pick_quickcompress_mode(ctx, duration, deadline);
+ if (!res) res = pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
// If no flags are set in the encode call, then use the frame flags as
@@ -924,7 +928,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
/* Initialize the encoder instance on the first frame*/
if (!res && ctx->cpi) {
unsigned int lib_flags;
- YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp, dst_end_time_stamp;
size_t size, cx_data_sz;
unsigned char *cx_data;
@@ -951,12 +954,44 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
/* Convert API flags to internal codec lib flags */
lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
- dst_time_stamp =
- pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
- dst_end_time_stamp = (pts_val + (int64_t)duration) *
- ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
-
if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = pts_val;
+ ctx->pts_offset_initialized = 1;
+ }
+ if (pts_val < ctx->pts_offset) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "pts is smaller than initial pts");
+ }
+ pts_val -= ctx->pts_offset;
+ if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) {
+ vpx_internal_error(
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts to ticks would overflow");
+ }
+ dst_time_stamp =
+ pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+#if ULONG_MAX > INT64_MAX
+ if (duration > INT64_MAX) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "duration is too big");
+ }
+#endif
+ if (pts_val > INT64_MAX - (int64_t)duration) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "relative pts + duration is too big");
+ }
+ vpx_codec_pts_t pts_end = pts_val + (int64_t)duration;
+ if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) {
+ vpx_internal_error(
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
+ dst_end_time_stamp =
+ pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -989,6 +1024,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
&dst_end_time_stamp, !img);
if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) {
+ ctx->cpi->common.error.setjmp = 0;
return VPX_CODEC_CORRUPT_FRAME;
} else if (comp_data_state == -1) {
break;
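
All of the new timestamp checks above guard the same conversion, ticks = relative_pts * timestamp_ratio.num / timestamp_ratio.den, by validating each multiplication against INT64_MAX first. A stand-alone sketch of that shape (hypothetical helper; it returns -1 where the code above raises VPX_CODEC_INVALID_PARAM):

    #include <stdint.h>

    /* Hypothetical: convert a non-negative relative pts to ticks, or
     * return -1 if the multiply would overflow int64_t. */
    static int64_t checked_pts_to_ticks(int64_t rel_pts, int64_t num,
                                        int64_t den) {
      if (rel_pts < 0 || num <= 0 || den <= 0) return -1;
      if (rel_pts > INT64_MAX / num) return -1;
      return rel_pts * num / den;
    }
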
diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
index e81deaf4ea..fa7d7be403 100644
--- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
@@ -488,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
}
- pc->error.setjmp = 0;
+ pbi->common.error.setjmp = 0;
#if CONFIG_MULTITHREAD
if (pbi->restart_threads) {
ctx->si.w = 0;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
index 261c316fd1..312092f190 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -8,10 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "vp8/vp8_ratectrl_rtc.h"
+
#include <math.h>
+
#include <new>
+
#include "vp8/common/common.h"
-#include "vp8/vp8_ratectrl_rtc.h"
#include "vp8/encoder/onyx_int.h"
#include "vp8/encoder/ratectrl.h"
#include "vpx_ports/system_state.h"
@@ -311,6 +314,14 @@ FrameDropDecision VP8RateControlRTC::ComputeQP(
int VP8RateControlRTC::GetQP() const { return q_; }
+UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const {
+ VP8_COMMON *cm = &cpi_->common;
+ UVDeltaQP uv_delta_q;
+ uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q;
+ uv_delta_q.uvac_delta_q = cm->uvac_delta_q;
+ return uv_delta_q;
+}
+
int VP8RateControlRTC::GetLoopfilterLevel() const {
VP8_COMMON *cm = &cpi_->common;
const double qp = q_;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
index 59fb607526..b458b5ce65 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
@@ -21,7 +21,6 @@ struct VP8_COMP;
namespace libvpx {
struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
VP8RateControlRtcConfig() {
memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
@@ -42,6 +41,9 @@ class VP8RateControlRTC {
bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
int GetQP() const;
+ // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest
+ // delta QP for UV.
+ UVDeltaQP GetUVDeltaQP() const;
// GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
// level is calculated from frame qp.
int GetLoopfilterLevel() const;
@@ -53,10 +55,10 @@ class VP8RateControlRTC {
void PostEncodeUpdate(uint64_t encoded_frame_size);
private:
- VP8RateControlRTC() {}
+ VP8RateControlRTC() = default;
bool InitRateControl(const VP8RateControlRtcConfig &cfg);
- struct VP8_COMP *cpi_;
- int q_;
+ struct VP8_COMP *cpi_ = nullptr;
+ int q_ = -1;
};
} // namespace libvpx
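
The header comments above pin down a per-frame call order: ComputeQP() first, then the getters, then PostEncodeUpdate(). A sketch of one frame through the controller, assuming the Create() factory, the VP8FrameParamsQpRTC parameter type, and the FrameDropDecision enum from the surrounding rtc headers:

    #include <cstdint>
    #include "vp8/vp8_ratectrl_rtc.h"

    // Sketch: per-frame use of VP8RateControlRTC; `rc` comes from
    // libvpx::VP8RateControlRTC::Create(rc_cfg). Returns the QP used,
    // or -1 if the controller asked for the frame to be dropped.
    int RunOneFrame(libvpx::VP8RateControlRTC *rc,
                    const libvpx::VP8FrameParamsQpRTC &frame_params,
                    uint64_t encoded_frame_size) {
      if (rc->ComputeQP(frame_params) == libvpx::FrameDropDecision::kDrop)
        return -1;                                   // do not encode
      const int qp = rc->GetQP();                    // valid after ComputeQP()
      const libvpx::UVDeltaQP uv_dq = rc->GetUVDeltaQP();  // new getter above
      const int lf_level = rc->GetLoopfilterLevel();
      // ... encode the frame using qp, uv_dq and lf_level, then report size:
      rc->PostEncodeUpdate(encoded_frame_size);
      (void)uv_dq;
      (void)lf_level;
      return qp;
    }
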
diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
index 1cfc12f6fa..4c8fcf6989 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
@@ -13,7 +13,6 @@
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_util/vpx_thread.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_loopfilter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
index 37762ca15a..1a93b97e56 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
@@ -12,4 +12,4 @@
#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vp9_rtcd() { once(setup_rtcd_internal); }
+void vp9_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
index 3ecbd5417f..af3ff0e980 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -129,7 +129,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
-specialize qw/vp9_block_error_fp neon avx2 sse2/;
+specialize qw/vp9_block_error_fp neon sve avx2 sse2/;
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
@@ -138,12 +138,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t
specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- specialize qw/vp9_block_error neon avx2 sse2/;
+ specialize qw/vp9_block_error neon sve avx2 sse2/;
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error neon sse2/;
} else {
- specialize qw/vp9_block_error neon avx2 msa sse2/;
+ specialize qw/vp9_block_error neon sve avx2 msa sse2/;
}
# fdct functions
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
index 8df18af3b8..24adbcbff0 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_reconinter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
index 5df0117f12..96c705d0d5 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
@@ -12,6 +12,7 @@
#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#ifdef __cplusplus
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
index c5892156f4..4fe680cefc 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -22,6 +22,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
@@ -2292,6 +2293,7 @@ static INLINE void init_mt(VP9Decoder *pbi) {
++pbi->num_tile_workers;
winterface->init(worker);
+ worker->thread_name = "vpx tile worker";
if (n < num_threads - 1 && !winterface->reset(worker)) {
do {
winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
index 5a7e9f9ab3..5c77df5002 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
@@ -21,6 +21,7 @@
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
@@ -210,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
cm->error.setjmp = 0;
vpx_get_worker_interface()->init(&pbi->lf_worker);
+ pbi->lf_worker.thread_name = "vpx lf worker";
return pbi;
}
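
The thread_name fields set here and in init_mt() above are presumably consumed by the thread-creation path in vpx_util, so that profilers and top can label the workers. A hedged sketch of how such a name typically reaches the OS on Linux (a hypothetical helper, not libvpx's actual code):

#define _GNU_SOURCE /* pthread_setname_np is a GNU extension on glibc */
#include <pthread.h>

/* Linux caps thread names at 16 bytes including the terminator, so
   "vpx tile worker" (15 characters + NUL) just fits. */
static void sketch_name_current_thread(const char *name) {
#if defined(__linux__)
  pthread_setname_np(pthread_self(), name);
#else
  (void)name; /* other platforms expose different, non-portable calls */
#endif
}
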
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
index 2e198d552e..b3ee4eab5f 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
@@ -16,6 +16,7 @@
#include "vpx/vpx_codec.h"
#include "vpx_dsp/bitreader.h"
#include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_thread_common.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
index 9a31f5a6d0..926ae87739 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
@@ -12,6 +12,7 @@
#include <string.h>
#include "vpx/vpx_integer.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp9/decoder/vp9_job_queue.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
index bc23bf9c2c..59f71fb9ba 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
@@ -11,7 +11,7 @@
#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
-#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_pthread.h"
typedef struct {
// Pointer to buffer base which contains the jobs
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
new file mode 100644
index 0000000000..78e7361d85
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int64x2_t err_v = vdupq_n_s64(0);
+ int64x2_t ssz_v = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vabdq_s16(c0, d0);
+ const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+ err_v = vpx_dotq_s16(err_v, diff0, diff0);
+ err_v = vpx_dotq_s16(err_v, diff1, diff1);
+
+ ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
+ ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_int64x2(ssz_v);
+ return horizontal_add_int64x2(err_v);
+}
+
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ int64x2_t err = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vabdq_s16(c0, d0);
+ const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+ err = vpx_dotq_s16(err, diff0, diff0);
+ err = vpx_dotq_s16(err, diff1, diff1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return horizontal_add_int64x2(err);
+}
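
Both kernels above lean on the identity |a - b|^2 = (a - b)^2: vabdq_s16 computes absolute differences and vpx_dotq_s16 accumulates their squares into 64-bit lanes, which cannot overflow for 16-bit inputs. A hedged scalar model of vp9_block_error_sve for comparison, treating tran_low_t as int16_t the way the load_tran_low_to_s16q() narrowing loads do:

#include <assert.h>
#include <stdint.h>

/* Returns sum((coeff - dqcoeff)^2) and writes sum(coeff^2) to *ssz. */
static int64_t block_error_model(const int16_t *coeff, const int16_t *dqcoeff,
                                 intptr_t block_size, int64_t *ssz) {
  int64_t err = 0, sq = 0;
  intptr_t i;
  assert(block_size >= 16 && (block_size % 16) == 0);
  for (i = 0; i < block_size; ++i) {
    const int64_t abs_diff = coeff[i] > dqcoeff[i] ? coeff[i] - dqcoeff[i]
                                                   : dqcoeff[i] - coeff[i];
    err += abs_diff * abs_diff;          /* |a - b|^2 == (a - b)^2 */
    sq += (int64_t)coeff[i] * coeff[i];  /* matches the ssz_v dot products */
  }
  *ssz = sq;
  return err;
}
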
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
index 7fa00cd194..6542794667 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
@@ -11,8 +11,6 @@
#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
#define VPX_VP9_ENCODER_VP9_BLOCK_H_
-#include "vpx_util/vpx_thread.h"
-
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
index 42073f756c..ee0fcd8729 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
@@ -119,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
tree->block_size = square[0];
alloc_tree_contexts(cm, tree, 4);
- tree->leaf_split[0] = this_leaf++;
- for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+ tree->u.leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0];
}
// Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -130,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
tree->block_size = square[square_index];
- for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++;
++pc_tree_index;
}
++square_index;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
index 4e301cc17d..51e13ba654 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
@@ -90,7 +90,7 @@ typedef struct PC_TREE {
union {
struct PC_TREE *split[4];
PICK_MODE_CONTEXT *leaf_split[4];
- };
+ } u;
// Obtained from a simple motion search. Used by the ML based partition search
// speed feature.
MV mv;
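
Naming the union here (and the pc_tree->u.split spelling adopted throughout the rest of this patch) avoids anonymous unions, a C11 feature that older or stricter C modes accept only as a vendor extension. A small illustrative comparison:

/* union_sketch.c -- illustrative only */
struct node_anon {
  union {     /* anonymous: members behave like direct struct fields */
    int split;
    int leaf;
  };          /* C11 feature; pre-C11 compilers allow it only as an extension */
};

struct node_named {
  union {
    int split;
    int leaf;
  } u;        /* named member: plain C90/C99 */
};

int read_both(struct node_anon *a, struct node_named *n) {
  return a->split + n->u.split; /* access changes from p->m to p->u.m */
}
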
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
index 46291f4868..b24c85f406 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -21,7 +21,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_ports/system_state.h"
-
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
#endif // CONFIG_MISMATCH_DEBUG
@@ -2303,16 +2303,16 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->leaf_split[0]);
+ pc_tree->u.leaf_split[0]);
} else {
encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
+ subsize, pc_tree->u.split[1]);
encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
+ subsize, pc_tree->u.split[2]);
encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ subsize, pc_tree->u.split[3]);
}
break;
}
@@ -2645,13 +2645,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
assert(partition == PARTITION_SPLIT);
subsize = get_subsize(bsize, PARTITION_SPLIT);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
+ subsize, pc_tree->u.split[1]);
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
+ subsize, pc_tree->u.split[2]);
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
- output_enabled, subsize, pc_tree->split[3]);
+ output_enabled, subsize, pc_tree->u.split[3]);
break;
}
@@ -2801,7 +2801,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
- subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX);
+ subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX);
break;
}
last_part_rdc.rate = 0;
@@ -2819,7 +2819,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
tp, mi_row + y_idx, mi_col + x_idx, subsize,
&tmp_rdc.rate, &tmp_rdc.dist, i != 3,
- pc_tree->split[i]);
+ pc_tree->u.split[i]);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&last_part_rdc);
break;
@@ -2860,9 +2860,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
continue;
save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
- pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pc_tree->u.split[i]->partitioning = PARTITION_NONE;
rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
- &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
+ &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none,
INT_MAX, INT64_MAX);
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2877,7 +2877,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
if (i != 3)
encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
- split_subsize, pc_tree->split[i]);
+ split_subsize, pc_tree->u.split[i]);
pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
split_subsize);
@@ -3391,7 +3391,7 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
features[feature_index++] = VPXMIN(rd_ratio, 2.0f);
for (i = 0; i < 4; ++i) {
- const int64_t this_rd = pc_tree->split[i]->none.rdcost;
+ const int64_t this_rd = pc_tree->u.split[i]->none.rdcost;
const int rd_valid = this_rd > 0 && this_rd < 1000000000;
// Ratio between sub-block RD and whole block RD.
features[feature_index++] =
@@ -3958,19 +3958,19 @@ static void store_superblock_info(
}
// recursively traverse partition tree when partition is split.
assert(pc_tree->partitioning == PARTITION_SPLIT);
- store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[0], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4, col_start_4x4, partition_info,
motion_vector_info);
- store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[1], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4, col_start_4x4 + subblock_square_size_4x4,
partition_info, motion_vector_info);
- store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[2], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4 + subblock_square_size_4x4, col_start_4x4,
partition_info, motion_vector_info);
- store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[3], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4 + subblock_square_size_4x4,
col_start_4x4 + subblock_square_size_4x4,
@@ -4114,7 +4114,7 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
vp9_zero(pc_tree->mv);
}
if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks.
- for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv;
+ for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv;
}
}
@@ -4199,25 +4199,25 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
// PARTITION_SPLIT
// TODO(jingning): use the motion vectors given by the above search as
// the starting point of motion search in the following partition type check.
- pc_tree->split[0]->none.rdcost = 0;
- pc_tree->split[1]->none.rdcost = 0;
- pc_tree->split[2]->none.rdcost = 0;
- pc_tree->split[3]->none.rdcost = 0;
+ pc_tree->u.split[0]->none.rdcost = 0;
+ pc_tree->u.split[1]->none.rdcost = 0;
+ pc_tree->u.split[2]->none.rdcost = 0;
+ pc_tree->u.split[3]->none.rdcost = 0;
if (do_split || must_split) {
subsize = get_subsize(bsize, PARTITION_SPLIT);
load_pred_mv(x, ctx);
if (bsize == BLOCK_8X8) {
i = 4;
if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
- pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
+ pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
- pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist);
+ pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist);
if (sum_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0];
- const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1];
+ const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0];
+ const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1];
for (i = 0; i < 4; ++i) {
ref_frames_used[i] |= (1 << ref1);
if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
@@ -4250,21 +4250,21 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
- pc_tree->split[i]->index = i;
+ pc_tree->u.split[i]->index = i;
if (cpi->sf.prune_ref_frame_for_rect_partitions)
- pc_tree->split[i]->none.rate = INT_MAX;
+ pc_tree->u.split[i]->none.rate = INT_MAX;
found_best_rd = rd_pick_partition(
cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
- &this_rdc, best_rdc_split, pc_tree->split[i]);
+ &this_rdc, best_rdc_split, pc_tree->u.split[i]);
if (found_best_rd == 0) {
sum_rdc.rdcost = INT64_MAX;
break;
} else {
if (cpi->sf.prune_ref_frame_for_rect_partitions &&
- pc_tree->split[i]->none.rate != INT_MAX) {
- const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0];
- const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1];
+ pc_tree->u.split[i]->none.rate != INT_MAX) {
+ const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0];
+ const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1];
ref_frames_used[i] |= (1 << ref1);
if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
}
@@ -4821,13 +4821,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
}
break;
case PARTITION_SPLIT: {
- fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]);
fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
- pc_tree->split[2]);
+ pc_tree->u.split[2]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
- pc_tree->split[3]);
+ pc_tree->u.split[3]);
break;
}
default: break;
@@ -4845,7 +4845,8 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
if (bsize > BLOCK_8X8) {
BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
int i;
- for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize);
+ for (i = 0; i < 4; ++i)
+ pred_pixel_ready_reset(pc_tree->u.split[i], subsize);
}
}
@@ -5046,9 +5047,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
load_pred_mv(x, ctx);
- nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
- mi_col + x_idx, subsize, &this_rdc, 0,
- best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+ nonrd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]);
if (this_rdc.rate == INT_MAX) {
vp9_rd_cost_reset(&sum_rdc);
@@ -5281,10 +5282,10 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
subsize = get_subsize(bsize, PARTITION_SPLIT);
nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
subsize, output_enabled, rd_cost,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
mi_col + hbs, subsize, output_enabled, &this_rdc,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5292,7 +5293,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
- &this_rdc, pc_tree->split[2]);
+ &this_rdc, pc_tree->u.split[2]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5300,7 +5301,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize,
- output_enabled, &this_rdc, pc_tree->split[3]);
+ output_enabled, &this_rdc, pc_tree->u.split[3]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5400,21 +5401,21 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
subsize = get_subsize(bsize, PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
- subsize, pc_tree->leaf_split[0]);
+ subsize, pc_tree->u.leaf_split[0]);
encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
- subsize, pc_tree->leaf_split[0]);
+ subsize, pc_tree->u.leaf_split[0]);
} else {
nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize,
- output_enabled, dummy_cost, pc_tree->split[0]);
+ output_enabled, dummy_cost, pc_tree->u.split[0]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
mi_col + hbs, subsize, output_enabled, dummy_cost,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
- dummy_cost, pc_tree->split[2]);
+ dummy_cost, pc_tree->u.split[2]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize, output_enabled,
- dummy_cost, pc_tree->split[3]);
+ dummy_cost, pc_tree->u.split[3]);
}
break;
}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
index fd213f1e6b..3b8b5345f1 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -31,12 +31,14 @@
#include "vpx_ports/system_state.h"
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_idct.h"
#if CONFIG_VP9_POSTPROC
@@ -2135,24 +2137,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->external_resize = 1;
}
- if (cpi->initial_width) {
- int new_mi_size = 0;
- vp9_set_mb_mi(cm, cm->width, cm->height);
- new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
- if (cm->mi_alloc_size < new_mi_size) {
- vp9_free_context_buffers(cm);
- vp9_free_pc_tree(&cpi->td);
- vpx_free(cpi->mbmi_ext_base);
- alloc_compressor_data(cpi);
- realloc_segmentation_maps(cpi);
- cpi->initial_width = cpi->initial_height = 0;
- cpi->external_resize = 0;
- } else if (cm->mi_alloc_size == new_mi_size &&
- (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
- if (vp9_alloc_loop_filter(cm)) {
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate loop filter data");
- }
+ int new_mi_size = 0;
+ vp9_set_mb_mi(cm, cm->width, cm->height);
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ vp9_free_context_buffers(cm);
+ vp9_free_pc_tree(&cpi->td);
+ vpx_free(cpi->mbmi_ext_base);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ cpi->external_resize = 0;
+ } else if (cm->mi_alloc_size == new_mi_size &&
+ (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+ if (vp9_alloc_loop_filter(cm)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate loop filter data");
}
}
@@ -3472,7 +3472,6 @@ void vp9_scale_references(VP9_COMP *cpi) {
continue;
}
-#if CONFIG_VP9_HIGHBITDEPTH
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
RefCntBuffer *new_fb_ptr = NULL;
int force_scaling = 0;
@@ -3485,6 +3484,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
new_fb_ptr = &pool->frame_bufs[new_fb];
if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
new_fb_ptr->buf.y_crop_height != cm->height) {
+#if CONFIG_VP9_HIGHBITDEPTH
if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
cm->use_highbitdepth,
@@ -3494,22 +3494,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
"Failed to allocate frame buffer");
scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
EIGHTTAP, 0);
- cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
- alloc_frame_mvs(cm, new_fb);
- }
#else
- if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
- RefCntBuffer *new_fb_ptr = NULL;
- int force_scaling = 0;
- int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
- if (new_fb == INVALID_IDX) {
- new_fb = get_free_fb(cm);
- force_scaling = 1;
- }
- if (new_fb == INVALID_IDX) return;
- new_fb_ptr = &pool->frame_bufs[new_fb];
- if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
- new_fb_ptr->buf.y_crop_height != cm->height) {
if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9_ENC_BORDER_IN_PIXELS,
@@ -3517,10 +3502,10 @@ void vp9_scale_references(VP9_COMP *cpi) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
-#endif // CONFIG_VP9_HIGHBITDEPTH
} else {
int buf_idx;
RefCntBuffer *buf = NULL;
@@ -3958,6 +3943,35 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
#endif
}
+static YV12_BUFFER_CONFIG *svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->bit_depth == VPX_BITS_8) {
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+ phase_scaler);
+ } else {
+ scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+ filter_type2, phase_scaler2);
+ scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
+ }
+#else
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
@@ -4000,7 +4014,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// result will be saved in scaled_temp and might be used later.
const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
const int phase_scaler2 = svc->downsample_filter_phase[1];
- cpi->Source = vp9_svc_twostage_scale(
+ cpi->Source = svc_twostage_scale(
cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
svc->scaled_one_half = 1;
@@ -4486,21 +4500,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// external rate control model.
// This flag doesn't have any impact when external rate control is not used.
int ext_rc_recode = 0;
- // Maximal frame size allowed by the external rate control.
- // case: 0, we ignore the max frame size limit, and encode with the qindex
- // passed in by the external rate control model.
- // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
- // and may recode if undershoot/overshoot is seen.
- // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
- // case: -1, we take libvpx's decision for the max frame size, as well as
- // the recode decision.
- // Otherwise: if a specific size is given, libvpx's recode decision
- // will respect the given size.
- int ext_rc_max_frame_size = 0;
- // Use VP9's decision of qindex. This flag is in use only in external rate
- // control model to help determine whether to recode when
- // |ext_rc_max_frame_size| is 0.
- int ext_rc_use_default_q = 1;
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
@@ -4616,27 +4615,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
}
#endif // CONFIG_RATE_CTRL
if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+ !cpi->tpl_with_external_rc &&
(cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
vpx_rc_encodeframe_decision_t encode_frame_decision;
- FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
- const int ref_frame_flags = get_ref_frame_flags(cpi);
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
- const RefCntBuffer *curr_frame_buf =
- get_ref_cnt_buffer(cm, cm->new_fb_idx);
- // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
- // index 1 refers to the first encoding frame in a gf group.
- // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
- // See function define_gf_group_structure().
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
- get_ref_frame_bufs(cpi, ref_frame_bufs);
codec_status = vp9_extrc_get_encodeframe_decision(
- &cpi->ext_ratectrl, curr_frame_buf->frame_index,
- cm->current_frame_coding_index, gf_group->index, update_type,
- gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
- &encode_frame_decision);
+ &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision);
if (codec_status != VPX_CODEC_OK) {
vpx_internal_error(&cm->error, codec_status,
"vp9_extrc_get_encodeframe_decision() failed");
@@ -4645,9 +4631,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// libvpx's default q.
if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
q = encode_frame_decision.q_index;
- ext_rc_use_default_q = 0;
}
- ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
}
vp9_set_quantizer(cpi, q);
@@ -4690,21 +4674,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
if (cpi->ext_ratectrl.ready &&
(cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
- // In general, for the external rate control, we take the qindex provided
- // as input and encode the frame with this qindex faithfully. However,
- // in some extreme scenarios, the provided qindex leads to a massive
- // overshoot of frame size. In this case, we fall back to VP9's decision
- // to pick a new qindex and recode the frame. We return the new qindex
- // through the API to the external model.
- if (ext_rc_max_frame_size == 0) {
- if (!ext_rc_use_default_q) break;
- } else if (ext_rc_max_frame_size == -1) {
- // Do nothing, fall back to libvpx's recode decision.
- } else {
- // Change the max frame size, used in libvpx's recode decision.
- rc->max_frame_bandwidth = ext_rc_max_frame_size;
- }
- ext_rc_recode = 1;
+ break;
}
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
@@ -4974,35 +4944,6 @@ static void set_ext_overrides(VP9_COMP *cpi) {
}
}
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
- VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
- int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
- if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
- cm->mi_rows * MI_SIZE != unscaled->y_height) {
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->bit_depth == VPX_BITS_8) {
- vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
- phase_scaler2);
- vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
- phase_scaler);
- } else {
- scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
- filter_type2, phase_scaler2);
- scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
- filter_type, phase_scaler);
- }
-#else
- vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
- phase_scaler2);
- vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- return scaled;
- } else {
- return unscaled;
- }
-}
-
YV12_BUFFER_CONFIG *vp9_scale_if_required(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
@@ -6429,7 +6370,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
if (arf_src_index) {
- assert(arf_src_index <= rc->frames_to_key);
+ if (!(cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) {
+ // This assert only makes sense when not using external RC.
+ assert(arf_src_index <= rc->frames_to_key);
+ }
if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
cpi->alt_ref_source = source;
@@ -6617,7 +6563,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
cpi->sf.enable_tpl_model) {
vp9_init_tpl_buffer(cpi);
- vp9_estimate_qp_gop(cpi);
+ vp9_estimate_tpl_qp_gop(cpi);
vp9_setup_tpl_stats(cpi);
}
#if CONFIG_COLLECT_COMPONENT_TIMING
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
index 91df538821..898855d10d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -25,6 +25,7 @@
#include "vpx_dsp/variance.h"
#include "vpx_dsp/psnr.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
@@ -1062,7 +1063,7 @@ typedef struct VP9_COMP {
*/
uint64_t frame_component_time[kTimingComponents];
#endif
- // Flag to indicate if QP and GOP for TPL is controlled by external RC.
+ // Flag to indicate if QP and GOP for TPL are controlled by external RC.
int tpl_with_external_rc;
} VP9_COMP;
@@ -1395,11 +1396,6 @@ void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
#endif // CONFIG_VP9_HIGHBITDEPTH
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
- VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
- int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
-
YV12_BUFFER_CONFIG *vp9_scale_if_required(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
index a8d1cb7a7a..c3b79507e6 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
@@ -17,6 +17,7 @@
#include "vp9/encoder/vp9_multi_thread.h"
#include "vp9/encoder/vp9_temporal_filter.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_pthread.h"
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
int i, j, k, l, m, n;
@@ -55,7 +56,7 @@ static int enc_worker_hook(void *arg1, void *unused) {
vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
- return 0;
+ return 1;
}
static int get_max_tile_cols(VP9_COMP *cpi) {
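
The return-value changes in this file follow the VPxWorker hook contract: the framework treats a zero return as failure, so these hooks returning 0 made every successful pass register as an error. A hedged sketch of the contract, simplified from vpx_util/vpx_thread.h:

#include <stddef.h>

typedef int (*worker_hook)(void *arg1, void *arg2); /* nonzero == success */

struct sketch_worker {
  worker_hook hook;
  void *data1, *data2;
  int had_error;
};

static void sketch_execute(struct sketch_worker *worker) {
  if (worker->hook != NULL) {
    /* A hook that returns 0 marks the worker as failed. */
    worker->had_error |= !worker->hook(worker->data1, worker->data2);
  }
}
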
@@ -106,6 +107,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
++cpi->num_workers;
winterface->init(worker);
+ worker->thread_name = "vpx enc worker";
if (i < num_workers - 1) {
thread_data->cpi = cpi;
@@ -204,8 +206,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
create_enc_workers(cpi, num_workers);
for (i = 0; i < num_workers; i++) {
- EncWorkerData *thread_data;
- thread_data = &cpi->tile_thr_data[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
@@ -456,7 +457,7 @@ static int first_pass_worker_hook(void *arg1, void *arg2) {
this_tile, &best_ref_mv, mb_row);
}
}
- return 0;
+ return 1;
}
void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
@@ -543,7 +544,7 @@ static int temporal_filter_worker_hook(void *arg1, void *arg2) {
mb_col_start, mb_col_end);
}
}
- return 0;
+ return 1;
}
void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
@@ -616,7 +617,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
}
}
- return 0;
+ return 1;
}
void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
index 4c192da515..359cdd1290 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
@@ -11,13 +11,14 @@
#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_
#define VPX_VP9_ENCODER_VP9_ETHREAD_H_
+#include "vpx_util/vpx_pthread.h"
+
#ifdef __cplusplus
extern "C" {
#endif
#define MAX_NUM_TILE_COLS (1 << 6)
#define MAX_NUM_TILE_ROWS 4
-#define MAX_NUM_THREADS 80
struct VP9_COMP;
struct ThreadData;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
index 4664e8c5e2..7b0d89acd2 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -156,32 +156,15 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
}
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
- EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ EXT_RATECTRL *ext_ratectrl, int gop_index,
vpx_rc_encodeframe_decision_t *encode_frame_decision) {
- if (ext_ratectrl == NULL) {
- return VPX_CODEC_INVALID_PARAM;
- }
- if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
- vpx_rc_status_t rc_status;
- vpx_rc_encodeframe_info_t encode_frame_info;
- encode_frame_info.show_index = show_index;
- encode_frame_info.coding_index = coding_index;
- encode_frame_info.gop_index = gop_index;
- encode_frame_info.frame_type = extrc_get_frame_type(update_type);
- encode_frame_info.gop_size = gop_size;
- encode_frame_info.use_alt_ref = use_alt_ref;
-
- vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
- encode_frame_info.ref_frame_coding_indexes,
- encode_frame_info.ref_frame_valid_list);
+ assert(ext_ratectrl != NULL);
+ assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0);
- rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
- ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
- if (rc_status == VPX_RC_ERROR) {
- return VPX_CODEC_ERROR;
- }
+ vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+ ext_ratectrl->model, gop_index, encode_frame_decision);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
}
return VPX_CODEC_OK;
}
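
With the validation and gop_info plumbing gone, a call site reduces to handing the model a GOP index and reading back a q_index. A hedged sketch of the post-patch usage, mirroring encode_with_recode_loop() above (the helper name is hypothetical):

#include "vp9/encoder/vp9_ext_ratectrl.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_ext_ratectrl.h"

/* Ask the external model for a q_index; fall back to libvpx's own choice
   on VPX_DEFAULT_Q or on failure. */
static int sketch_pick_q(EXT_RATECTRL *extrc, int gop_index, int libvpx_q) {
  vpx_rc_encodeframe_decision_t decision;
  if (vp9_extrc_get_encodeframe_decision(extrc, gop_index, &decision) !=
      VPX_CODEC_OK) {
    return libvpx_q; /* the caller would raise vpx_internal_error() instead */
  }
  return decision.q_index == VPX_DEFAULT_Q ? libvpx_q : decision.q_index;
}
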
@@ -222,29 +205,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
}
vpx_codec_err_t vp9_extrc_get_gop_decision(
- EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
+ EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) {
vpx_rc_status_t rc_status;
if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
(ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
return VPX_CODEC_INVALID_PARAM;
}
- rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
- gop_info, gop_decision);
- if (gop_decision->use_alt_ref) {
- const int arf_constraint =
- gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
- gop_decision->gop_coding_frames < gop_info->lag_in_frames;
- if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
- }
- // TODO(chengchen): Take min and max gf interval from the model
- // and overwrite libvpx's decision so that we can get rid
- // of one of the checks here.
- if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
- gop_info->max_gf_interval) {
- return VPX_CODEC_ERROR;
- }
+ rc_status =
+ ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision);
if (rc_status == VPX_RC_ERROR) {
return VPX_CODEC_ERROR;
}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
index b04580c1d4..d1be5f2aef 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -39,9 +39,7 @@ vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
const VpxTplGopStats *tpl_gop_stats);
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
- EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ EXT_RATECTRL *ext_ratectrl, int gop_index,
vpx_rc_encodeframe_decision_t *encode_frame_decision);
vpx_codec_err_t vp9_extrc_update_encodeframe_result(
@@ -50,9 +48,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
uint32_t input_bit_depth, const int actual_encoding_qindex);
-vpx_codec_err_t vp9_extrc_get_gop_decision(
- EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
- vpx_rc_gop_decision_t *gop_decision);
+vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl,
+ vpx_rc_gop_decision_t *gop_decision);
vpx_codec_err_t vp9_extrc_get_frame_rdmult(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
index dcb62e8768..69261ac65f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
@@ -162,42 +162,3 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
et_uv, el_uv, eb_uv, er_uv, chroma_step);
}
-
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int srcy,
- int srcx, int srch, int srcw) {
- // If the side is not touching the boundary then don't extend.
- const int et_y = srcy ? 0 : dst->border;
- const int el_y = srcx ? 0 : dst->border;
- const int eb_y = srcy + srch != src->y_height
- ? 0
- : dst->border + dst->y_height - src->y_height;
- const int er_y = srcx + srcw != src->y_width
- ? 0
- : dst->border + dst->y_width - src->y_width;
- const int src_y_offset = srcy * src->y_stride + srcx;
- const int dst_y_offset = srcy * dst->y_stride + srcx;
-
- const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
- const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
- const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
- const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
- const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
- const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
- const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
- const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
- // detect nv12 colorspace
- const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
-
- copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
- dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
- et_y, el_y, eb_y, er_y, 1);
-
- copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
- dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
- srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-
- copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
- dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
- srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
index 4ba7fc95e3..21d7e68b9f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
@@ -21,9 +21,6 @@ extern "C" {
void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int srcy,
- int srcx, int srch, int srcw);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
index a9cdf5353f..58b9b7ba61 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -37,6 +37,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
+#include "vpx/vpx_ext_ratectrl.h"
#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
@@ -1164,7 +1165,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
v_fn_ptr.vf = get_block_variance_fn(bsize);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8);
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
this_motion_error =
@@ -2769,38 +2770,6 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
}
}
#endif
- // If the external rate control model for GOP is used, the gop decisions
- // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
- // will be overwritten.
- if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
- cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) {
- vpx_codec_err_t codec_status;
- vpx_rc_gop_decision_t gop_decision;
- vpx_rc_gop_info_t gop_info;
- gop_info.min_gf_interval = rc->min_gf_interval;
- gop_info.max_gf_interval = rc->max_gf_interval;
- gop_info.active_min_gf_interval = active_gf_interval.min;
- gop_info.active_max_gf_interval = active_gf_interval.max;
- gop_info.allow_alt_ref = allow_alt_ref;
- gop_info.is_key_frame = is_key_frame;
- gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
- gop_info.frames_since_key = rc->frames_since_key;
- gop_info.frames_to_key = rc->frames_to_key;
- gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
- gop_info.show_index = cm->current_video_frame;
- gop_info.coding_index = cm->current_frame_coding_index;
- gop_info.gop_global_index = rc->gop_global_index;
-
- codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
- &gop_decision);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_get_gop_decision() failed");
- }
- gop_coding_frames = gop_decision.gop_coding_frames;
- use_alt_ref = gop_decision.use_alt_ref;
- }
// Was the group length constrained by the requirement for a new KF?
rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -3600,32 +3569,71 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
else
twopass->fr_content_type = FC_NORMAL;
- // Keyframe and section processing.
- if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
- // Define next KF group and assign bits to it.
- find_next_key_frame(cpi, show_idx);
+ // If the external rate control model for GOP is used, the GOP decisions
+ // are overwritten, including whether to use a key frame in this GF group,
+ // the GF group length, and whether to use an ARF.
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL &&
+ rc->frames_till_gf_update_due == 0) {
+ vpx_codec_err_t codec_status;
+ vpx_rc_gop_decision_t gop_decision;
+ codec_status =
+ vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_gop_decision() failed");
+ }
+ if (gop_decision.use_key_frame) {
+ cpi->common.frame_type = KEY_FRAME;
+ rc->frames_since_key = 0;
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+ }
+
+ // A new GF group
+ if (rc->frames_till_gf_update_due == 0) {
+ vp9_zero(twopass->gf_group);
+ ++rc->gop_global_index;
+ if (gop_decision.use_alt_ref) {
+ rc->source_alt_ref_pending = 1;
+ }
+ rc->baseline_gf_interval =
+ gop_decision.gop_coding_frames - rc->source_alt_ref_pending;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ define_gf_group_structure(cpi);
+ }
} else {
- cm->frame_type = INTER_FRAME;
- }
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, show_idx);
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
- // Define a new GF/ARF group. (Should always enter here for key frames).
- if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, show_idx);
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, show_idx);
- rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
#if ARF_STATS_OUTPUT
- {
- FILE *fpfile;
- fpfile = fopen("arf.stt", "a");
- ++arf_count;
- fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
- cm->current_video_frame, rc->frames_till_gf_update_due,
- rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
-
- fclose(fpfile);
- }
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
+ cm->current_video_frame, rc->frames_till_gf_update_due,
+ rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
+
+ fclose(fpfile);
+ }
#endif
+ }
}
vp9_configure_buffer_updates(cpi, gf_group->index);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
index 97838c38e6..b6be4f88ac 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
@@ -9,6 +9,7 @@
*/
#include <assert.h>
#include <stdlib.h>
+#include <string.h>
#include "./vpx_config.h"
@@ -81,7 +82,6 @@ bail:
return NULL;
}
-#define USE_PARTIAL_COPY 0
int vp9_lookahead_full(const struct lookahead_ctx *ctx) {
return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz;
}
@@ -94,11 +94,6 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
int64_t ts_start, int64_t ts_end, int use_highbitdepth,
vpx_enc_frame_flags_t flags) {
struct lookahead_entry *buf;
-#if USE_PARTIAL_COPY
- int row, col, active_end;
- int mb_rows = (src->y_height + 15) >> 4;
- int mb_cols = (src->y_width + 15) >> 4;
-#endif
int width = src->y_crop_width;
int height = src->y_crop_height;
int uv_width = src->uv_crop_width;
@@ -119,76 +114,36 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
height != buf->img.y_crop_height ||
uv_width != buf->img.uv_crop_width ||
uv_height != buf->img.uv_crop_height;
- larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
- uv_width > buf->img.uv_width ||
- uv_height > buf->img.uv_height;
+ larger_dimensions =
+ width > buf->img.y_crop_width || height > buf->img.y_crop_height ||
+ uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height;
assert(!larger_dimensions || new_dimensions);
-#if USE_PARTIAL_COPY
- // TODO(jkoleszar): This is disabled for now, as
- // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
-
- // Only do this partial copy if the following conditions are all met:
- // 1. Lookahead queue has a size of 1.
- // 2. Active map is provided.
- // 3. This is not a key frame, golden nor altref frame.
- if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
- for (row = 0; row < mb_rows; ++row) {
- col = 0;
-
- while (1) {
- // Find the first active macroblock in this row.
- for (; col < mb_cols; ++col) {
- if (active_map[col]) break;
- }
-
- // No more active macroblock in this row.
- if (col == mb_cols) break;
-
- // Find the end of active region in this row.
- active_end = col;
-
- for (; active_end < mb_cols; ++active_end) {
- if (!active_map[active_end]) break;
- }
-
- // Only copy this active region.
- vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
- 16, (active_end - col) << 4);
-
- // Start again from the end of this active region.
- col = active_end;
- }
-
- active_map += mb_cols;
- }
- } else {
-#endif
- if (larger_dimensions) {
- YV12_BUFFER_CONFIG new_img;
- memset(&new_img, 0, sizeof(new_img));
- if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
- subsampling_y,
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
- use_highbitdepth,
+ use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, 0))
- return 1;
- vpx_free_frame_buffer(&buf->img);
- buf->img = new_img;
- } else if (new_dimensions) {
- buf->img.y_crop_width = src->y_crop_width;
- buf->img.y_crop_height = src->y_crop_height;
- buf->img.uv_crop_width = src->uv_crop_width;
- buf->img.uv_crop_height = src->uv_crop_height;
- buf->img.subsampling_x = src->subsampling_x;
- buf->img.subsampling_y = src->subsampling_y;
- }
- // Partial copy not implemented yet
- vp9_copy_and_extend_frame(src, &buf->img);
-#if USE_PARTIAL_COPY
+ VP9_ENC_BORDER_IN_PIXELS, 0))
+ return 1;
+ vpx_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_width = src->y_width;
+ buf->img.y_height = src->y_height;
+ buf->img.uv_width = src->uv_width;
+ buf->img.uv_height = src->uv_height;
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
}
-#endif
+ vp9_copy_and_extend_frame(src, &buf->img);
buf->ts_start = ts_start;
buf->ts_end = ts_end;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
index 0843cd97e4..6e124f9944 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -10,6 +10,7 @@
#include <assert.h>
+#include "vpx_util/vpx_pthread.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_multi_thread.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
index 3f4fe6957b..d37e020b0a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
@@ -12,6 +12,7 @@
#include <math.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_quant_common.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
index 62d6b93028..76d5435e60 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -35,6 +35,7 @@
#include "vp9/encoder/vp9_ext_ratectrl.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_ext_ratectrl.h"
@@ -1433,8 +1434,8 @@ static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
return q;
}
-static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
- int *top_index, int gf_group_index) {
+int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index, int gf_group_index) {
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1581,7 +1582,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
q = active_worst_quality;
}
}
- clamp(q, active_best_quality, active_worst_quality);
*top_index = active_worst_quality;
*bottom_index = active_best_quality;
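
The deleted clamp() call was dead code: clamp() in vpx_dsp/vpx_dsp_common.h returns the clamped value rather than modifying its argument, so the statement discarded its result. A short sketch of the distinction:

/* Local model of clamp(); the real one lives in vpx_dsp/vpx_dsp_common.h. */
static int clamp_sketch(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

static int pick_q_sketch(int q, int active_best, int active_worst) {
  clamp_sketch(q, active_best, active_worst);     /* result ignored: no-op */
  q = clamp_sketch(q, active_best, active_worst); /* the effective spelling */
  return q;
}
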
@@ -1603,8 +1603,8 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
else
q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
} else {
- q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
- gf_group_index);
+ q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+ gf_group_index);
}
if (cpi->sf.use_nonrd_pick_mode) {
if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
@@ -1675,63 +1675,6 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
}
}
-void vp9_estimate_qp_gop(VP9_COMP *cpi) {
- int gop_length = cpi->twopass.gf_group.gf_group_size;
- int bottom_index, top_index;
- int idx;
- const int gf_index = cpi->twopass.gf_group.index;
- const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
- const int refresh_frame_context = cpi->common.refresh_frame_context;
-
- for (idx = 1; idx <= gop_length; ++idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
- int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
- cpi->twopass.gf_group.index = idx;
- vp9_rc_set_frame_target(cpi, target_rate);
- vp9_configure_buffer_updates(cpi, idx);
- if (cpi->tpl_with_external_rc) {
- if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
- cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
- VP9_COMMON *cm = &cpi->common;
- vpx_codec_err_t codec_status;
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- vpx_rc_encodeframe_decision_t encode_frame_decision;
- FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
- const RefCntBuffer *curr_frame_buf =
- get_ref_cnt_buffer(cm, cm->new_fb_idx);
- // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
- // index 1 refers to the first encoding frame in a gf group.
- // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
- // See function define_gf_group_structure().
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
- const int frame_coding_index = cm->current_frame_coding_index + idx - 1;
- get_ref_frame_bufs(cpi, ref_frame_bufs);
- codec_status = vp9_extrc_get_encodeframe_decision(
- &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index,
- gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref,
- ref_frame_bufs, 0 /*ref_frame_flags is not used*/,
- &encode_frame_decision);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_get_encodeframe_decision() failed");
- }
- tpl_frame->base_qindex = encode_frame_decision.q_index;
- }
- } else {
- tpl_frame->base_qindex =
- rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx);
- tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
- }
- }
- // Reset the actual index and frame update
- cpi->twopass.gf_group.index = gf_index;
- cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
- cpi->common.refresh_frame_context = refresh_frame_context;
- vp9_configure_buffer_updates(cpi, gf_index);
-}
-
void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
int *frame_under_shoot_limit,
int *frame_over_shoot_limit) {
@@ -3361,14 +3304,20 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
}
// For temporal layers, reset the rate control parameters across all
- // temporal layers. If the first_spatial_layer_to_encode > 0, then this
- // superframe has skipped lower base layers. So in this case we should also
- // reset and force max-q for spatial layers < first_spatial_layer_to_encode.
+ // temporal layers.
+ // If the first_spatial_layer_to_encode > 0, then this superframe has
+ // skipped lower base layers. So in this case we should also reset and
+ // force max-q for spatial layers < first_spatial_layer_to_encode.
+ // For the case of no inter-layer prediction on delta frames: reset and
+ // force max-q for all spatial layers, to avoid excessive frame drops.
if (cpi->use_svc) {
int tl = 0;
int sl = 0;
SVC *svc = &cpi->svc;
- for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) {
+ int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode);
+ if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON)
+ num_spatial_layers = svc->number_spatial_layers;
+ for (sl = 0; sl < num_spatial_layers; ++sl) {
for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
const int layer =
LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
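The loop above flattens each (spatial, temporal) layer pair into a single index. A minimal sketch of that layout, assuming LAYER_IDS_TO_IDX keeps the usual row-major definition from vp9_svc_layercontext.h (an assumption; the macro itself is not part of this patch):

/* Assumed definition: temporal layers laid out contiguously per spatial
 * layer. */
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
/* With 3 spatial and 2 temporal layers the reset loop touches:
 *   (0,0)->0 (0,1)->1 (1,0)->2 (1,1)->3 (2,0)->4 (2,1)->5
 * i.e. every layer context exactly once. */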
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
index 48c49e937e..0c61ad3461 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -346,12 +346,14 @@ int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index);
-void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
-
void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
+int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi,
+ int *bottom_index, int *top_index,
+ int gf_group_index);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
index 974e43c90f..447136ed84 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -1834,7 +1834,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
return 1;
}
-static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) {
if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
int_mv cur_fullpel_mv, prev_fullpel_mv;
cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
index b8910370e0..048ab8732d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
@@ -18,9 +18,12 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_tpl_model.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
const GF_GROUP *gf_group, int *tpl_group_frames) {
@@ -407,8 +410,12 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
tpl_block_stats_ptr->col = mi_col * 8;
tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
- tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
- tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+  // inter/intra_cost here are calculated with SATD, which should be close
+  // enough to serve as inter/intra_pred_err
+ tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost;
+ tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost;
+ tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
@@ -721,7 +728,9 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
tpl_stats->intra_cost = VPXMAX(
1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
- tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ if (best_rf_idx >= 0) {
+ tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ }
tpl_stats->mv.as_int = best_mv.as_int;
*ref_frame_idx = best_rf_idx;
}
@@ -1489,6 +1498,53 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
}
#endif // CONFIG_RATE_CTRL
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) {
+ int gop_length = cpi->twopass.gf_group.gf_group_size;
+ int bottom_index, top_index;
+ int idx;
+ const int gf_index = cpi->twopass.gf_group.index;
+ const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+ const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+ for (idx = 1; idx <= gop_length; ++idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+ int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+ cpi->twopass.gf_group.index = idx;
+ vp9_rc_set_frame_target(cpi, target_rate);
+ vp9_configure_buffer_updates(cpi, idx);
+ if (cpi->tpl_with_external_rc) {
+ VP9_COMMON *cm = &cpi->common;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ vpx_rc_encodeframe_decision_t encode_frame_decision;
+ codec_status = vp9_extrc_get_encodeframe_decision(
+ &cpi->ext_ratectrl, gf_group->index - 1, &encode_frame_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_encodeframe_decision() failed");
+ }
+ tpl_frame->base_qindex = encode_frame_decision.q_index;
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+ "The external rate control library is not set "
+ "properly for TPL pass.");
+ }
+ } else {
+ tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass(
+ cpi, &bottom_index, &top_index, idx);
+ tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+ }
+ }
+ // Reset the actual index and frame update
+ cpi->twopass.gf_group.index = gf_index;
+ cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+ cpi->common.refresh_frame_context = refresh_frame_context;
+ vp9_configure_buffer_updates(cpi, gf_index);
+}
+
void vp9_setup_tpl_stats(VP9_COMP *cpi) {
GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -1512,12 +1568,16 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) {
mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
}
- // TPL stats has extra frames from next GOP. Trim those extra frames for
- // Qmode.
- trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count);
-
if (cpi->ext_ratectrl.ready &&
cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+ // Intra search on key frame
+ if (gf_picture[0].update_type == KF_UPDATE) {
+ mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize);
+ }
+ // TPL stats has extra frames from next GOP. Trim those extra frames for
+ // Qmode.
+ trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats,
+ extended_frame_count);
const vpx_codec_err_t codec_status =
vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
if (codec_status != VPX_CODEC_OK) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
index 04beb22610..de0ac39a1f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
@@ -31,6 +31,7 @@ typedef struct GF_PICTURE {
void vp9_init_tpl_buffer(VP9_COMP *cpi);
void vp9_setup_tpl_stats(VP9_COMP *cpi);
void vp9_free_tpl_buffer(VP9_COMP *cpi);
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi);
void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 94506aad0f..628dc4fead 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -886,14 +886,14 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
scale_plane_1_to_2_phase_0(
src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
- scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
- dst->uv_stride, src_w / 2, src_h / 2,
- vp9_filter_kernels[filter_type][8],
- temp_buffer);
- scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
- dst->uv_stride, src_w / 2, src_h / 2,
- vp9_filter_kernels[filter_type][8],
- temp_buffer);
+ const int src_uv_w = src->uv_crop_width;
+ const int src_uv_h = src->uv_crop_height;
+ scale_plane_1_to_2_phase_0(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+ scale_plane_1_to_2_phase_0(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
free(temp_buffer);
}
}
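The switch to uv_crop_width/uv_crop_height matters for odd luma dimensions: src_w / 2 truncates while the crop values round up, so the old code could come up one chroma column and row short. A worked example, assuming the usual 4:2:0 derivation of chroma size as (luma + 1) / 2:

/* Luma source of 359x201 in 4:2:0:
 *   uv_crop_width  = (359 + 1) / 2 = 180, but src_w / 2 = 179
 *   uv_crop_height = (201 + 1) / 2 = 101, but src_h / 2 = 100
 * so halving the luma size would skip the last chroma column and row. */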
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
index fd81bce7b5..942c15ce49 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
@@ -12,10 +12,12 @@
#include <new>
#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
namespace libvpx {
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
index 85005c5474..4c39255886 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
@@ -12,43 +12,34 @@
#define VPX_VP9_RATECTRL_RTC_H_
#include <cstdint>
+#include <cstring>
+#include <limits>
#include <memory>
-#include "vp9/common/vp9_enums.h"
-#include "vp9/vp9_iface_common.h"
-#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
-#include "vp9/vp9_cx_iface.h"
+#include "vpx/vpx_encoder.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
-#include "vpx_mem/vpx_mem.h"
struct VP9_COMP;
namespace libvpx {
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
VP9RateControlRtcConfig() {
- ss_number_layers = 1;
- vp9_zero(max_quantizers);
- vp9_zero(min_quantizers);
- vp9_zero(scaling_factor_den);
- vp9_zero(scaling_factor_num);
- vp9_zero(layer_target_bitrate);
- vp9_zero(ts_rate_decimator);
+ memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+ memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator));
scaling_factor_num[0] = 1;
scaling_factor_den[0] = 1;
max_quantizers[0] = max_quantizer;
min_quantizers[0] = min_quantizer;
- max_consec_drop = INT_MAX;
}
// Number of spatial layers
- int ss_number_layers;
- int max_quantizers[VPX_MAX_LAYERS];
- int min_quantizers[VPX_MAX_LAYERS];
- int scaling_factor_num[VPX_SS_MAX_LAYERS];
- int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ int ss_number_layers = 1;
+ int max_quantizers[VPX_MAX_LAYERS] = {};
+ int min_quantizers[VPX_MAX_LAYERS] = {};
+ int scaling_factor_num[VPX_SS_MAX_LAYERS] = {};
+ int scaling_factor_den[VPX_SS_MAX_LAYERS] = {};
// This is only for SVC for now.
- int max_consec_drop;
+ int max_consec_drop = std::numeric_limits<int>::max();
};
struct VP9FrameParamsQpRTC {
@@ -105,9 +96,9 @@ class VP9RateControlRTC {
const VP9FrameParamsQpRTC &frame_params);
private:
- VP9RateControlRTC() {}
+ VP9RateControlRTC() = default;
bool InitRateControl(const VP9RateControlRtcConfig &cfg);
- struct VP9_COMP *cpi_;
+ struct VP9_COMP *cpi_ = nullptr;
};
} // namespace libvpx
diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc
index 2e6f9a4513..5e565d1b1a 100644
--- a/media/libvpx/libvpx/vp9/simple_encode.cc
+++ b/media/libvpx/libvpx/vp9/simple_encode.cc
@@ -8,8 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdio.h>
+#include <stdlib.h>
+
#include <memory>
#include <vector>
+
#include "./ivfenc.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_enums.h"
@@ -888,6 +892,10 @@ void SimpleEncode::ComputeFirstPassStats() {
use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth;
#endif
vpx_image_t img;
+ if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+ fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+ abort();
+ }
vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1);
rewind(in_file_);
impl_ptr_->first_pass_stats.clear();
@@ -1053,6 +1061,10 @@ void SimpleEncode::StartEncode() {
vp9_set_first_pass_stats(&oxcf, &stats);
assert(impl_ptr_->cpi == nullptr);
impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
+ if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+ fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+ abort();
+ }
vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_,
frame_height_, 1);
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
index 8df04f29f0..fe62bac5f2 100644
--- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -17,6 +19,7 @@
#include "vpx_dsp/psnr.h"
#include "vpx_ports/static_assert.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vpx_version.h"
@@ -110,7 +113,6 @@ struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_enc_cfg_t cfg;
struct vp9_extracfg extra_cfg;
- vpx_rational64_t timestamp_ratio;
vpx_codec_pts_t pts_offset;
unsigned char pts_offset_initialized;
VP9EncoderConfig oxcf;
@@ -190,7 +192,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
- RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
@@ -1140,10 +1142,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
if (res == VPX_CODEC_OK) {
priv->pts_offset_initialized = 0;
- // TODO(angiebird): Replace priv->timestamp_ratio by
- // oxcf->g_timebase_in_ts
- priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase);
-
set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
#if CONFIG_VP9_HIGHBITDEPTH
priv->oxcf.use_highbitdepth =
@@ -1166,9 +1164,9 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
return VPX_CODEC_OK;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ vpx_enc_deadline_t deadline) {
MODE new_mode = BEST;
#if CONFIG_REALTIME_ONLY
@@ -1179,13 +1177,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
case VPX_RC_ONE_PASS:
if (deadline > 0) {
// Convert duration parameter from stream timebase to microseconds.
- uint64_t duration_us;
-
VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
(TICKS_PER_SEC % 1000000) == 0);
- duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
- (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+ if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) {
+ ERROR("duration is too big");
+ }
+ uint64_t duration_us = duration *
+ (uint64_t)ctx->oxcf.g_timebase_in_ts.num /
+ ((uint64_t)ctx->oxcf.g_timebase_in_ts.den *
+ (TICKS_PER_SEC / 1000000));
// If the deadline is more than the duration this frame is to be shown,
// use good quality mode. Otherwise use realtime mode.
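The arithmetic above converts a duration in stream-timebase units into microseconds via g_timebase_in_ts. A worked example, assuming TICKS_PER_SEC is 10,000,000 as defined in vpx_util/vpx_timestamp.h:

/* For a 1/30 s timebase, g_timebase_in_ts is TICKS_PER_SEC * 1/30, i.e.
 * num = 10000000 and den = 30. One timebase unit of duration then yields
 *   duration_us = 1 * 10000000 / (30 * (10000000 / 1000000))
 *               = 10000000 / 300 = 33333 microseconds,
 * one frame at 30 fps. The new guard rejects any duration whose
 * multiplication by num would wrap around UINT64_MAX before the divide. */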
@@ -1208,6 +1209,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
ctx->oxcf.mode = new_mode;
vp9_change_config(ctx->cpi, &ctx->oxcf);
}
+ return VPX_CODEC_OK;
}
// Turn on to test if supplemental superframe data breaks decoding
@@ -1281,6 +1283,10 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
.is_key_frame))
flags |= VPX_FRAME_IS_KEY;
+ if (!cpi->common.show_frame) {
+ flags |= VPX_FRAME_IS_INVISIBLE;
+ }
+
if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE;
return flags;
@@ -1318,7 +1324,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
volatile vpx_enc_frame_flags_t flags = enc_flags;
volatile vpx_codec_pts_t pts = pts_val;
VP9_COMP *const cpi = ctx->cpi;
- const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+ const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts;
size_t data_sz;
vpx_codec_cx_pkt_t pkt;
memset(&pkt, 0, sizeof(pkt));
@@ -1347,13 +1353,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
}
}
- if (!ctx->pts_offset_initialized) {
- ctx->pts_offset = pts;
- ctx->pts_offset_initialized = 1;
+ res = pick_quickcompress_mode(ctx, duration, deadline);
+ if (res != VPX_CODEC_OK) {
+ return res;
}
- pts -= ctx->pts_offset;
-
- pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
// Handle Flags
@@ -1384,20 +1387,53 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (res == VPX_CODEC_OK) {
unsigned int lib_flags = 0;
- YV12_BUFFER_CONFIG sd;
- int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
size_t size, cx_data_sz;
unsigned char *cx_data;
- cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1);
- cpi->svc.time_stamp_superframe = dst_time_stamp;
-
// Set up internal flags
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = pts;
+ ctx->pts_offset_initialized = 1;
+ }
+ if (pts < ctx->pts_offset) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "pts is smaller than initial pts");
+ }
+ pts -= ctx->pts_offset;
+ if (pts > INT64_MAX / timebase_in_ts->num) {
+ vpx_internal_error(
+ &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts to ticks would overflow");
+ }
+ const int64_t dst_time_stamp =
+ timebase_units_to_ticks(timebase_in_ts, pts);
+
+ cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1);
+ cpi->svc.time_stamp_superframe = dst_time_stamp;
+
+#if ULONG_MAX > INT64_MAX
+ if (duration > INT64_MAX) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "duration is too big");
+ }
+#endif
+ if (pts > INT64_MAX - (int64_t)duration) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "relative pts + duration is too big");
+ }
+ vpx_codec_pts_t pts_end = pts + (int64_t)duration;
+ if (pts_end > INT64_MAX / timebase_in_ts->num) {
+ vpx_internal_error(
+ &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
const int64_t dst_end_time_stamp =
- timebase_units_to_ticks(timestamp_ratio, pts + duration);
+ timebase_units_to_ticks(timebase_in_ts, pts_end);
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -1434,7 +1470,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (cx_data_sz < ctx->cx_data_sz / 2) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR,
"Compressed data buffer too small");
- return VPX_CODEC_ERROR;
}
}
@@ -1443,6 +1478,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// compute first pass stats
if (img) {
int ret;
+ int64_t dst_time_stamp;
int64_t dst_end_time_stamp;
vpx_codec_cx_pkt_t fps_pkt;
ENCODE_FRAME_RESULT encode_frame_result;
@@ -1469,6 +1505,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
#endif // !CONFIG_REALTIME_ONLY
} else {
ENCODE_FRAME_RESULT encode_frame_result;
+ int64_t dst_time_stamp;
int64_t dst_end_time_stamp;
vp9_init_encode_frame_result(&encode_frame_result);
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
@@ -1507,10 +1544,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (ctx->output_cx_pkt_cb.output_cx_pkt) {
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
- ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+ ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
ctx->pts_offset;
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+ timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.buf = ctx->pending_cx_data;
pkt.data.frame.sz = size;
@@ -1527,10 +1564,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// Add the frame packet to the list of returned packets.
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
- ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+ ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
ctx->pts_offset;
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+ timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
@@ -1979,6 +2016,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+ ratectrl_config.base_qp = oxcf->cq_level;
if (oxcf->rc_mode == VPX_VBR) {
ratectrl_config.rc_mode = VPX_RC_VBR;
@@ -2223,7 +2261,7 @@ static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height,
return enc_cfg;
}
-static vp9_extracfg get_extra_cfg() {
+static vp9_extracfg get_extra_cfg(void) {
vp9_extracfg extra_cfg = default_extra_cfg;
return extra_cfg;
}
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
index 860f721dc5..7567910b9b 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
@@ -19,7 +19,6 @@
#include "vpx/vpx_decoder.h"
#include "vpx_dsp/bitreader_buffer.h"
#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_frame_buffers.h"
diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk
index 44790ef6a4..7a0e2d8d1f 100644
--- a/media/libvpx/libvpx/vp9/vp9cx.mk
+++ b/media/libvpx/libvpx/vp9/vp9cx.mk
@@ -140,6 +140,7 @@ endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
+VP9_CX_SRCS-$(HAVE_SVE) += encoder/arm/neon/vp9_error_sve.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
index 01d64b14b7..2643b5578a 100644
--- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
@@ -22,8 +22,14 @@ enum class FrameDropDecision {
kDrop, // Frame is dropped.
};
+struct UVDeltaQP {
+  // For the UV channel, the dc/ac QP is GetQP() + uvdc_delta_q /
+  // uvac_delta_q respectively; both deltas are negative.
+ int uvdc_delta_q;
+ int uvac_delta_q;
+};
+
struct VpxRateControlRtcConfig {
- public:
VpxRateControlRtcConfig() {
width = 1280;
height = 720;
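A minimal sketch of how a caller would apply the new UVDeltaQP fields; frame_qp stands in for the rate controller's GetQP() result, and the delta values are made up for illustration:

/* Hypothetical usage: derive chroma QPs from the frame (luma) QP. */
struct UVDeltaQP uv = { -9, -9 };          /* example negative deltas */
int frame_qp = 40;                         /* stand-in for GetQP() */
int uv_dc_qp = frame_qp + uv.uvdc_delta_q; /* 31 */
int uv_ac_qp = frame_qp + uv.uvac_delta_q; /* 31 */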
diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
index 017525aeee..001d854abe 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
@@ -14,6 +14,7 @@
*/
#include <assert.h>
#include <limits.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vp8/common/blockd.h"
@@ -184,8 +185,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
while (0)
#else
-static void FLOATING_POINT_INIT() {}
-static void FLOATING_POINT_RESTORE() {}
+static void FLOATING_POINT_INIT(void) {}
+static void FLOATING_POINT_RESTORE(void) {}
#endif
vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
@@ -200,6 +201,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
res = VPX_CODEC_ERROR;
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
res = VPX_CODEC_INCAPABLE;
+#if ULONG_MAX > UINT32_MAX
+ else if (duration > UINT32_MAX || deadline > UINT32_MAX)
+ res = VPX_CODEC_INVALID_PARAM;
+#endif
else {
unsigned int num_enc = ctx->priv->enc.total_encoders;
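The new range check only compiles where unsigned long is wider than 32 bits (e.g. LP64); there it rejects oversized values up front so behavior matches 32-bit builds. A sketch of the caller-visible effect, with ctx and img as stand-ins for an initialized encoder context and input frame:

unsigned long too_long = (unsigned long)UINT32_MAX + 1;
vpx_codec_err_t res =
    vpx_codec_encode(&ctx, &img, /*pts=*/0, too_long,
                     /*flags=*/0, VPX_DL_GOOD_QUALITY);
/* On LP64 targets res is now VPX_CODEC_INVALID_PARAM; previously the
 * value could silently truncate inside the codec. */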
diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c
index f9f0dd6025..3f7ff74244 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_image.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_image.c
@@ -27,6 +27,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
+ if (fmt == VPX_IMG_FMT_NONE) goto fail;
+
/* Treat align==0 like align==1 */
if (!buf_align) buf_align = 1;
@@ -56,7 +58,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
/* Get chroma shift values for this format */
// For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at
- // one time.
+ // once.
switch (fmt) {
case VPX_IMG_FMT_I420:
case VPX_IMG_FMT_YV12:
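With the new guard, the sentinel format fails cleanly at the top of the helper. A minimal sketch:

vpx_image_t img;
/* VPX_IMG_FMT_NONE is now rejected up front: vpx_img_alloc() returns NULL
 * rather than trying to size planes for a meaningless format. */
if (!vpx_img_alloc(&img, VPX_IMG_FMT_NONE, 640, 480, 16)) {
  /* handle the failure */
}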
diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
index 62c2a9c857..b0687a8135 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_tpl.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
@@ -47,8 +47,8 @@ vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
"%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
" %" PRId64 " %d\n",
block_stats.inter_cost, block_stats.intra_cost,
- block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
- block_stats.recrf_rate, block_stats.ref_frame_index));
+ block_stats.mv_c, block_stats.mv_r, block_stats.srcrf_dist,
+ block_stats.srcrf_rate, block_stats.ref_frame_index));
}
}
@@ -88,7 +88,7 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
" %" SCNd64 " %d\n",
&block_stats->inter_cost, &block_stats->intra_cost,
&block_stats->mv_c, &block_stats->mv_r,
- &block_stats->recrf_dist, &block_stats->recrf_rate,
+ &block_stats->srcrf_dist, &block_stats->srcrf_rate,
&block_stats->ref_frame_index),
7);
}
diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h
index b12938d3d8..dfdbb3c770 100644
--- a/media/libvpx/libvpx/vpx/vp8cx.h
+++ b/media/libvpx/libvpx/vpx/vp8cx.h
@@ -772,6 +772,8 @@ enum vp8e_enc_control_id {
/*!\brief Codec control to use external RC to control TPL.
*
* This will use external RC to control the QP and GOP structure for TPL.
+ * (rc_type & VPX_RC_QP) in vpx_rc_funcs_t must be non-zero.
+ * get_encodeframe_decision callback in vpx_rc_funcs_t also needs to be set.
*
* Supported in codecs: VP9
*/
diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h
index 18e3862bd7..809a097d94 100644
--- a/media/libvpx/libvpx/vpx/vpx_encoder.h
+++ b/media/libvpx/libvpx/vpx/vpx_encoder.h
@@ -31,7 +31,6 @@ extern "C" {
#include "./vpx_codec.h" // IWYU pragma: export
#include "./vpx_ext_ratectrl.h"
-#include "./vpx_tpl.h"
/*! Temporal Scalability: Maximum length of the sequence defining frame
* layer membership
@@ -57,10 +56,15 @@ extern "C" {
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
+ *
+ * \note
+ * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component
+ * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses
+ * vpx_rc_funcs_t.
*/
-#define VPX_ENCODER_ABI_VERSION \
- (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
- VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION \
+ (18 + VPX_CODEC_ABI_VERSION + \
+ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
@@ -1074,6 +1078,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
* The buffer was set successfully.
* \retval #VPX_CODEC_INVALID_PARAM
* A parameter was NULL, the image format is unsupported, etc.
+ *
+ * \note
+ * `duration` and `deadline` are of the unsigned long type, which can be 32
+ * or 64 bits. `duration` and `deadline` must be less than or equal to
+ * UINT32_MAX so that their ranges are independent of the size of unsigned
+ * long.
*/
vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
const vpx_fixed_buf_t *buf,
diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
index 46d290dff4..ba12e4f83b 100644
--- a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
@@ -26,7 +26,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures.
*/
-#define VPX_EXT_RATECTRL_ABI_VERSION (7)
+#define VPX_EXT_RATECTRL_ABI_VERSION (5 + VPX_TPL_ABI_VERSION)
/*!\brief The control type of the inference API.
* In VPX_RC_QP mode, the external rate control model determines the
@@ -81,17 +81,10 @@ typedef void *vpx_rc_model_t;
*
* The encoder will receive the decision from the external rate control model
* through get_encodeframe_decision() defined in vpx_rc_funcs_t.
- *
- * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
- *
- * If max_frame_size = 0, the encoding ignores max frame size limit.
- * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
- * If the encoded frame size is larger than max_frame_size, the frame is
- * recoded to meet the size limit, following VP9's recoding principles.
*/
typedef struct vpx_rc_encodeframe_decision {
- int q_index; /**< Quantizer step index [0..255]*/
- int max_frame_size; /**< Maximal frame size allowed to encode a frame*/
+ int q_index; /**< Quantizer step index [0..255]*/
+ int rdmult; /**< Frame level Lagrangian multiplier*/
} vpx_rc_encodeframe_decision_t;
/*!\brief Information for the frame to be encoded.
@@ -322,6 +315,7 @@ typedef struct vpx_rc_config {
vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */
int overshoot_percent; /**< for VBR mode only */
int undershoot_percent; /**< for VBR mode only */
+ int base_qp; /**< base QP for leaf frames, 0-255 */
} vpx_rc_config_t;
/*!\brief Information passed to the external rate control model to
@@ -400,6 +394,7 @@ typedef struct vpx_rc_gop_info {
typedef struct vpx_rc_gop_decision {
int gop_coding_frames; /**< The number of frames of this GOP */
int use_alt_ref; /**< Whether to use alt ref for this GOP */
+ int use_key_frame; /**< Whether to set key frame for this GOP */
} vpx_rc_gop_decision_t;
/*!\brief Create an external rate control model callback prototype
@@ -446,12 +441,11 @@ typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)(
* the external rate control model.
*
* \param[in] rate_ctrl_model rate control model
- * \param[in] encode_frame_info information of the coding frame
+ * \param[in]  frame_gop_index  index of the frame in the current GOP
* \param[out] frame_decision encode decision of the coding frame
*/
typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
vpx_rc_encodeframe_decision_t *frame_decision);
/*!\brief Update encode frame result callback prototype
@@ -472,12 +466,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
* the external rate control model.
*
* \param[in] rate_ctrl_model rate control model
- * \param[in] gop_info information collected from the encoder
* \param[out] gop_decision GOP decision from the model
*/
typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision);
+ vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision);
/*!\brief Get the frame rdmult from the external rate control model.
*
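With the slimmed-down prototypes above, the model no longer receives per-frame or per-GOP info structs from the encoder. A minimal sketch of callbacks matching the new signatures; the fixed q_index and GOP shape are placeholder policy, not anything the API prescribes:

/* Hypothetical model callbacks for the new prototypes. */
static vpx_rc_status_t my_get_encodeframe_decision(
    vpx_rc_model_t model, const int frame_gop_index,
    vpx_rc_encodeframe_decision_t *decision) {
  (void)model;
  (void)frame_gop_index;
  decision->q_index = 100; /* placeholder; a real model computes this */
  decision->rdmult = 0;    /* placeholder */
  return VPX_RC_OK;
}

static vpx_rc_status_t my_get_gop_decision(vpx_rc_model_t model,
                                           vpx_rc_gop_decision_t *gop) {
  (void)model;
  gop->gop_coding_frames = 16; /* placeholder GOP length */
  gop->use_alt_ref = 1;
  gop->use_key_frame = 0;
  return VPX_RC_OK;
}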
diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h
index a250aada60..7e4c9ab7e1 100644
--- a/media/libvpx/libvpx/vpx/vpx_tpl.h
+++ b/media/libvpx/libvpx/vpx/vpx_tpl.h
@@ -32,19 +32,21 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_TPL_ABI_VERSION (3) /**<\hideinitializer*/
/*!\brief Temporal dependency model stats for each block before propagation */
typedef struct VpxTplBlockStats {
- int16_t row; /**< Pixel row of the top left corner */
- int16_t col; /**< Pixel col of the top left corner */
- int64_t intra_cost; /**< Intra cost */
- int64_t inter_cost; /**< Inter cost */
- int16_t mv_r; /**< Motion vector row */
- int16_t mv_c; /**< Motion vector col */
- int64_t recrf_rate; /**< Rate from reconstructed ref frame */
- int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
- int ref_frame_index; /**< Ref frame index in the ref frame buffer */
+ int16_t row; /**< Pixel row of the top left corner */
+ int16_t col; /**< Pixel col of the top left corner */
+ int64_t intra_cost; /**< Intra cost */
+ int64_t inter_cost; /**< Inter cost */
+ int16_t mv_r; /**< Motion vector row */
+ int16_t mv_c; /**< Motion vector col */
+ int64_t srcrf_rate; /**< Rate from source ref frame */
+ int64_t srcrf_dist; /**< Distortion from source ref frame */
+ int64_t inter_pred_err; /**< Inter prediction error */
+ int64_t intra_pred_err; /**< Intra prediction error */
+ int ref_frame_index; /**< Ref frame index in the ref frame buffer */
} VpxTplBlockStats;
/*!\brief Temporal dependency model stats for each frame before propagation */
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
index 683df5797a..f8b94620d4 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -168,40 +168,40 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
\
if (xoffset == 0) { \
if (yoffset == 0) { \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
src_stride, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + 1)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + 1)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + 1)); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + 1)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + 1)); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
@@ -209,21 +209,21 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
if (yoffset == 0) { \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
xoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + 1), xoffset); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + 1), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
@@ -430,22 +430,22 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
} while (--i != 0);
}
-#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
- uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t tmp0[w * (h + 1)]; \
- uint16_t tmp1[w * h]; \
- uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
- \
- highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
- xoffset); \
- highbd_avg_pred_var_filter_block2d_bil_w##w( \
- tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
- CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
}
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
@@ -460,19 +460,19 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
if (yoffset == 0) { \
highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp, source_stride, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp, source_stride, source_stride, h, yoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
@@ -481,7 +481,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp0, source_stride, 1, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + 1)]; \
@@ -489,7 +489,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1)); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + 1)]; \
@@ -497,7 +497,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1)); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
@@ -506,7 +506,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp0, source_stride, 1, h, xoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
@@ -514,7 +514,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1), xoffset); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
@@ -522,7 +522,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
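Dropping the _neon suffix on these tail calls routes them through the RTCD dispatch name, so the best specialization available at runtime is used rather than pinning to NEON. For instance, with bitdepth 10 and an 8x8 block:

/* vpx_highbd_##bitdepth##_variance##w##x##h expands to
 * vpx_highbd_10_variance8x8, the rtcd entry point, which may resolve to
 * the C, NEON, or (with this patch) SVE implementation. */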
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..cebe06b099
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ uint64x2_t sse = vdupq_n_u64(0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse = vpx_dotq_u16(sse, diff, diff);
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ return (uint32_t)horizontal_add_uint64x2(sse);
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h) \
+ uint32_t vpx_highbd_10_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint32_t sse_tmp = \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \
+ sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4); \
+ *sse = sse_tmp; \
+ return sse_tmp; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint32_t sse_tmp = \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \
+ sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8); \
+ *sse = sse_tmp; \
+ return sse_tmp; \
+ }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+ sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ h -= 2;
+ } while (h != 0);
+
+ *sum = horizontal_add_int16x8(sum_s16);
+ *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = vld1q_u16(src_ptr);
+ const uint16x8_t r = vld1q_u16(ref_ptr);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+ sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr);
+ const uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+ const uint16x8_t r0 = vld1q_u16(ref_ptr);
+ const uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+
+ sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+
+ sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+ sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+
+ *sum = horizontal_add_int32x4(sum_s32[0]);
+ *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int i = 0;
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr + i);
+ const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8);
+ const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16);
+ const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24);
+
+ const uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16);
+ const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24);
+
+ const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+ const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+ const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3));
+
+ sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+ sum_s32[2] = vpadalq_s16(sum_s32[2], diff2);
+ sum_s32[3] = vpadalq_s16(sum_s32[3], diff3);
+
+ sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+ sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+ sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2);
+ sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3);
+
+ i += 32;
+ } while (i < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]);
+
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+ sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]);
+
+ *sum = horizontal_add_int32x4(sum_s32[0]);
+ *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_SVE(w, h) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HBD_VARIANCE_WXH_SVE(4, 4)
+HBD_VARIANCE_WXH_SVE(4, 8)
+
+HBD_VARIANCE_WXH_SVE(8, 4)
+HBD_VARIANCE_WXH_SVE(8, 8)
+HBD_VARIANCE_WXH_SVE(8, 16)
+
+HBD_VARIANCE_WXH_SVE(16, 8)
+HBD_VARIANCE_WXH_SVE(16, 16)
+HBD_VARIANCE_WXH_SVE(16, 32)
+
+HBD_VARIANCE_WXH_SVE(32, 16)
+HBD_VARIANCE_WXH_SVE(32, 32)
+HBD_VARIANCE_WXH_SVE(32, 64)
+
+HBD_VARIANCE_WXH_SVE(64, 32)
+HBD_VARIANCE_WXH_SVE(64, 64)
+
+#define HIGHBD_GET_VAR_SVE(s) \
+ void vpx_highbd_8_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ *sum = (int)sum_long; \
+ } \
+ \
+ void vpx_highbd_10_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ } \
+ \
+ void vpx_highbd_12_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ }
+
+HIGHBD_GET_VAR_SVE(8)
+HIGHBD_GET_VAR_SVE(16)
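All three bit depths share the same variance identity; only the normalization shifts differ. A worked example of what the macros above compute, with illustrative numbers:

/* variance = sse - sum^2 / (w * h), after normalizing to 8-bit range:
 *   10-bit: sse >> 4, sum >> 2 (rounded); 12-bit: sse >> 8, sum >> 4.
 * For a 16x16 10-bit block with sse_long = 40960 and sum_long = 1024:
 *   sse = ROUND_POWER_OF_TWO(40960, 4) = 2560
 *   sum = ROUND_POWER_OF_TWO(1024, 2)  = 256
 *   variance = 2560 - (256 * 256) / (16 * 16) = 2560 - 256 = 2304 */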
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 47684473ca..b5a944d299 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -14,86 +14,51 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
- int16x4_t *const s0, int16x4_t *const s1,
- int16x4_t *const s2, int16x4_t *const s3) {
- *s0 = vld1_s16(s);
- s += p;
- *s1 = vld1_s16(s);
- s += p;
- *s2 = vld1_s16(s);
- s += p;
- *s3 = vld1_s16(s);
-}
-
-static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
- uint16x8_t *const s0, uint16x8_t *const s1,
- uint16x8_t *const s2, uint16x8_t *const s3) {
- *s0 = vld1q_u16(s);
- s += p;
- *s1 = vld1q_u16(s);
- s += p;
- *s2 = vld1q_u16(s);
- s += p;
- *s3 = vld1q_u16(s);
-}
-
-static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
- int16x8_t *const s0, int16x8_t *const s1,
- int16x8_t *const s2, int16x8_t *const s3,
- int16x8_t *const s4, int16x8_t *const s5,
- int16x8_t *const s6, int16x8_t *const s7) {
- *s0 = vld1q_s16(s);
- s += p;
- *s1 = vld1q_s16(s);
- s += p;
- *s2 = vld1q_s16(s);
- s += p;
- *s3 = vld1q_s16(s);
- s += p;
- *s4 = vld1q_s16(s);
- s += p;
- *s5 = vld1q_s16(s);
- s += p;
- *s6 = vld1q_s16(s);
- s += p;
- *s7 = vld1q_s16(s);
+static INLINE uint16x4_t highbd_convolve4_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
+ int32x4_t sum = vmull_lane_s16(s0, filters, 0);
+ sum = vmlal_lane_s16(sum, s1, filters, 1);
+ sum = vmlal_lane_s16(sum, s2, filters, 2);
+ sum = vmlal_lane_s16(sum, s3, filters, 3);
+
+ uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+ return vmin_u16(res, max);
}
-static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
- const uint16x8_t s0, const uint16x8_t s1,
- const uint16x8_t s2, const uint16x8_t s3,
- const uint16x8_t s4, const uint16x8_t s5,
- const uint16x8_t s6, const uint16x8_t s7) {
- vst1q_u16(s, s0);
- s += p;
- vst1q_u16(s, s1);
- s += p;
- vst1q_u16(s, s2);
- s += p;
- vst1q_u16(s, s3);
- s += p;
- vst1q_u16(s, s4);
- s += p;
- vst1q_u16(s, s5);
- s += p;
- vst1q_u16(s, s6);
- s += p;
- vst1q_u16(s, s7);
+static INLINE uint16x8_t highbd_convolve4_8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+ return vminq_u16(res, max);
}
-static INLINE int32x4_t highbd_convolve8_4(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filters, const uint16x4_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum;
- sum = vmull_lane_s16(s0, filters_lo, 0);
+ int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
@@ -101,7 +66,9 @@ static INLINE int32x4_t highbd_convolve8_4(
sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
- return sum;
+
+ uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+ return vmin_u16(res, max);
}
static INLINE uint16x8_t
@@ -111,10 +78,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t filters, const uint16x8_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum0, sum1;
- uint16x8_t d;
- sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
@@ -122,7 +87,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
- sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
@@ -130,9 +96,152 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
- d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
- d = vminq_u16(d, max);
- return d;
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+ return vminq_u16(res, max);
+}
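The convolve helpers above all finish with the same epilogue: vqrshrun_n_s32 rounds to nearest, shifts right by FILTER_BITS (7, from vpx_dsp/vpx_filter.h), and saturates negatives to zero, then vmin_u16/vminq_u16 clamps to the bit-depth ceiling. A scalar model of that step, assuming the filter taps sum to 1 << FILTER_BITS:

    #include <stdint.h>

    /* One output sample from a 32-bit convolution accumulator. */
    static uint16_t convolve_round_clamp(int32_t sum, int bd) {
      int32_t v = (sum + (1 << 6)) >> 7; /* round to nearest, FILTER_BITS == 7 */
      if (v < 0) v = 0;                  /* vqrshrun saturates negatives to 0 */
      const int32_t pixel_max = (1 << bd) - 1;
      return (uint16_t)(v < pixel_max ? v : pixel_max); /* vmin_u16 clamp */
    }
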
+
+static INLINE void highbd_convolve_4tap_horiz_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 =
+ highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max);
+ uint16x4_t d1 =
+ highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max);
+ uint16x4_t d2 =
+ highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max);
+ uint16x4_t d3 =
+ highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x8_t d0 =
+ highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max);
+ uint16x8_t d1 =
+ highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max);
+ uint16x8_t d2 =
+ highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max);
+ uint16x8_t d3 =
+ highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
}
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
@@ -143,202 +252,25 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
if (x_step_q4 != 16) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
- } else {
- const int16x8_t filters = vld1q_s16(filter[x0_q4]);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- uint16x8_t t0, t1, t2, t3;
-
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3;
-
- if (h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
- s0 = vreinterpret_s16_u16(vget_low_u16(t0));
- s1 = vreinterpret_s16_u16(vget_low_u16(t1));
- s2 = vreinterpret_s16_u16(vget_low_u16(t2));
- s3 = vreinterpret_s16_u16(vget_low_u16(t3));
- s4 = vreinterpret_s16_u16(vget_high_u16(t0));
- s5 = vreinterpret_s16_u16(vget_high_u16(t1));
- s6 = vreinterpret_s16_u16(vget_high_u16(t2));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
-
- do {
- load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
- transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ return;
+ }
- d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
- transpose_u16_4x4q(&d01, &d23);
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
- vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
- vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
- vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
- int16x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
-
- if (w == 4) {
- do {
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
- &t4, &t5, &t6, &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- transpose_u16_8x4(&d0, &d1, &d2, &d3);
- vst1_u16(dst, vget_low_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d3));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d3));
- dst += dst_stride;
- h -= 8;
- } while (h > 0);
- } else {
- int width;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s11, s12, s13, s14;
- uint16x8_t d4, d5, d6, d7;
-
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
- &s12, &s13, &s14);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
- max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
- max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
- max);
- d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
- max);
- d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
- max);
- d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
- max);
- d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
- max);
- d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
- filters, max);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
- }
- }
+ if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+ const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+ highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap, bd);
+ } else {
+ const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+ highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap, bd);
}
}
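The dispatch above exploits the fact that libvpx stores every interpolation kernel as eight taps: when vpx_get_filter_taps reports four or fewer, the non-zero coefficients occupy positions 2..5, so the 4-tap path loads filter[x0_q4] + 2 and rewinds src by 1 sample instead of 3. A scalar sketch showing why the two anchorings agree (illustrative, not libvpx API):

    #include <stdint.h>

    static int32_t apply_taps(const uint16_t *s, const int16_t *k, int taps) {
      int32_t sum = 0;
      for (int i = 0; i < taps; ++i) sum += (int32_t)k[i] * s[i];
      return sum;
    }

    /* For a kernel8 with k[0] == k[1] == k[6] == k[7] == 0, both branches
     * below produce identical sums for the sample at src. */
    static int32_t convolve_sample(const uint16_t *src, const int16_t *kernel8,
                                   int effective_taps) {
      return effective_taps <= 4 ? apply_taps(src - 1, kernel8 + 2, 4)
                                 : apply_taps(src - 3, kernel8, 8);
    }
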
@@ -352,66 +284,233 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
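The vrhadd_u16 / vrhaddq_u16 calls above implement the averaging half of the predictor: each convolved sample is combined with the value already in dst using a rounding average. The scalar equivalent per lane:

    #include <stdint.h>

    /* vrhadd_u16(a, b): rounding halving add. */
    static uint16_t avg_round(uint16_t convolved, uint16_t dst_val) {
      return (uint16_t)(((uint32_t)convolved + dst_val + 1) >> 1);
    }
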
+
+static INLINE void highbd_convolve_4tap_vert_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2;
+ load_s16_4x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
+
+ do {
+ int16x4_t s3, s4, s5, s6;
+ load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
- const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3;
-
- if (h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t t0, t1, t2, t3;
- uint16x8_t d01, d23, t01, t23;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
- s0 = vreinterpret_s16_u16(vget_low_u16(t0));
- s1 = vreinterpret_s16_u16(vget_low_u16(t1));
- s2 = vreinterpret_s16_u16(vget_low_u16(t2));
- s3 = vreinterpret_s16_u16(vget_low_u16(t3));
- s4 = vreinterpret_s16_u16(vget_high_u16(t0));
- s5 = vreinterpret_s16_u16(vget_high_u16(t1));
- s6 = vreinterpret_s16_u16(vget_high_u16(t2));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2;
+ load_s16_8x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
do {
- load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
- transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- t01 = vminq_u16(t01, max);
- t23 = vminq_u16(t23, max);
- transpose_u16_4x4q(&t01, &t23);
-
- d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 2 * dst_stride));
- d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
- vld1_u16(dst + 3 * dst_stride));
- d01 = vrhaddq_u16(d01, t01);
- d23 = vrhaddq_u16(d23, t23);
-
- vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
- vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
- vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+ int16x8_t s3, s4, s5, s6;
+ load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max);
+ uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max);
+ uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max);
+ uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_vert_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+ uint16x4_t d1 =
+ highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+ uint16x4_t d2 =
+ highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+ uint16x4_t d3 =
+ highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -420,164 +519,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
s4 = s8;
s5 = s9;
s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
- int16x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
-
- if (w == 4) {
- do {
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
- &t4, &t5, &t6, &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
-
- d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 4 * dst_stride));
- d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
- vld1_u16(dst + 5 * dst_stride));
- d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
- vld1_u16(dst + 6 * dst_stride));
- d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
- vld1_u16(dst + 7 * dst_stride));
- d0 = vrhaddq_u16(d0, t0);
- d1 = vrhaddq_u16(d1, t1);
- d2 = vrhaddq_u16(d2, t2);
- d3 = vrhaddq_u16(d3, t3);
-
- vst1_u16(dst, vget_low_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d3));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d3));
- dst += dst_stride;
- h -= 8;
- } while (h > 0);
- } else {
- int width;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s11, s12, s13, s14;
- uint16x8_t d4, d5, d6, d7;
-
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
- &s12, &s13, &s14);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
- max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
- max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
- max);
- d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
- max);
- d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
- max);
- d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
- max);
- d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
- max);
- d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
- filters, max);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
- d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
- d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
- d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
- d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
- d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
- d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
- d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
- d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
-
- store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
- }
- }
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
}
@@ -589,160 +538,25 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
if (y_step_q4 != 16) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h, bd);
- } else {
- const int16x8_t filters = vld1q_s16(filter[y0_q4]);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3 * src_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- s0 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ return;
+ }
- do {
- s7 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
- vst1_u16(dst, vget_low_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d23));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d23));
- dst += dst_stride;
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(y_step_q4 == 16);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- h -= 4;
- } while (h > 0);
- } else {
- int height;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- vst1q_u16(d, d0);
- d += dst_stride;
- vst1q_u16(d, d1);
- d += dst_stride;
- vst1q_u16(d, d2);
- d += dst_stride;
- vst1q_u16(d, d3);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
- }
+ if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+ const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
+ highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_4tap, bd);
+ } else {
+ const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+ highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_8tap, bd);
}
}
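The vertical kernels above keep seven input rows live across iterations and load only four new rows per pass, so four output rows cost four row loads plus the multiply-accumulate chains. The same sliding-window shape for a single column, in scalar form (bit-depth clamp omitted, h assumed a multiple of 4 as in the NEON code; purely illustrative):

    #include <stddef.h>
    #include <stdint.h>

    static void vert_8tap_column(const int16_t *s, ptrdiff_t stride,
                                 int16_t *d, ptrdiff_t d_stride, int h,
                                 const int16_t *k /* 8 taps */) {
      int16_t r[11];
      for (int i = 0; i < 7; ++i) r[i] = s[i * stride];
      s += 7 * stride;
      do {
        for (int i = 0; i < 4; ++i) r[7 + i] = s[i * stride]; /* 4 new rows */
        for (int row = 0; row < 4; ++row) {
          int32_t sum = 0;
          for (int t = 0; t < 8; ++t) sum += (int32_t)k[t] * r[row + t];
          d[row * d_stride] = (int16_t)((sum + 64) >> 7); /* FILTER_BITS == 7 */
        }
        for (int i = 0; i < 7; ++i) r[i] = r[i + 4]; /* slide the window down */
        s += 4 * stride;
        d += 4 * d_stride;
        h -= 4;
      } while (h != 0);
    }
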
@@ -756,78 +570,89 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ uint16x4_t d1 =
+ highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ uint16x4_t d2 =
+ highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ uint16x4_t d3 =
+ highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
- const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3 * src_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23, t01, t23;
-
- s0 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
do {
- s7 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- t01 = vminq_u16(t01, max);
- t23 = vminq_u16(t23, max);
-
- d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 1 * dst_stride));
- d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
- vld1_u16(dst + 3 * dst_stride));
- d01 = vrhaddq_u16(d01, t01);
- d23 = vrhaddq_u16(d23, t23);
-
- vst1_u16(dst, vget_low_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d23));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d23));
- dst += dst_stride;
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -836,96 +661,592 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
s4 = s8;
s5 = s9;
s6 = s10;
- h -= 4;
- } while (h > 0);
- } else {
- int height;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- d0 = vld1q_u16(d + 0 * dst_stride);
- d1 = vld1q_u16(d + 1 * dst_stride);
- d2 = vld1q_u16(d + 2 * dst_stride);
- d3 = vld1q_u16(d + 3 * dst_stride);
- d0 = vrhaddq_u16(d0, t0);
- d1 = vrhaddq_u16(d1, t1);
- d2 = vrhaddq_u16(d2, t2);
- d3 = vrhaddq_u16(d3, t3);
-
- vst1q_u16(d, d0);
- d += dst_stride;
- vst1q_u16(d, d1);
- d += dst_stride;
- vst1q_u16(d, d2);
- d += dst_stride;
- vst1q_u16(d, d3);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
- }
+static INLINE void highbd_convolve_2d_4tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int16x4_t y_filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[4], h_s1[4], h_s2[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+ s += 3 * src_stride;
+
+ do {
+ int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+ &h_s3[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+ &h_s4[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+ &h_s5[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+ &h_s6[3]);
+
+ int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[4], h_s1[4], h_s2[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+ s += 3 * src_stride;
+
+ do {
+ int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+ &h_s3[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+ &h_s4[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+ &h_s5[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+ &h_s6[3]);
+
+ int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+ uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+ uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+ uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static INLINE void highbd_convolve_2d_8tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
+ const int16x8_t y_filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x4_t v_s3 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x4_t v_s7 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x4_t v_s8 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x4_t v_s9 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x4_t v_s10 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x8_t v_s3 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x8_t v_s7 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x8_t v_s8 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x8_t v_s9 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x8_t v_s10 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
+
+ highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter, y_filter, bd);
+ return;
+ }
+
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter, y_filter, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
}
+
+ // Averaging convolution always uses an 8-tap filter.
+ const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
+ const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
+ // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
+ // lines post both horizontally and vertically.
+ src = src - horiz_offset - vert_offset;
+
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x4_t v_s3 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x4_t v_s7 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x4_t v_s8 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x4_t v_s9 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x4_t v_s10 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x8_t v_s3 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x8_t v_s7 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x8_t v_s8 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x8_t v_s9 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x8_t v_s10 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
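
The kernel above fuses both passes of the separable filter: seven rows are
filtered horizontally up front, then each iteration filters four more rows,
convolves the column window vertically, and rolls v_s0..v_s6 forward. A
scalar sketch of the arithmetic it implements, assuming FILTER_BITS == 7 and
clamping to [0, (1 << bd) - 1]; the helper names are illustrative, not
library API:

#include <stdint.h>

static uint16_t clamp_px(int v, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

/* src points at the first output pixel; the 8 taps reach 3 samples back and
   4 forward in each direction, matching the - horiz_offset - vert_offset
   adjustment applied by the wrapper. */
static void highbd_convolve8_2d_ref(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride,
                                    const int16_t *x_filter, /* 8 taps */
                                    const int16_t *y_filter, /* 8 taps */
                                    int w, int h, int bd) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int tmp[8]; /* one horizontally filtered sample per source row */
      for (int k = 0; k < 8; ++k) {
        const uint16_t *row = src + (y + k - 3) * src_stride;
        int sum = 0;
        for (int t = 0; t < 8; ++t) sum += x_filter[t] * row[x + t - 3];
        tmp[k] = clamp_px((sum + 64) >> 7, bd); /* round, shift, clamp */
      }
      int sum = 0;
      for (int t = 0; t < 8; ++t) sum += y_filter[t] * tmp[t];
      dst[y * dst_stride + x] = clamp_px((sum + 64) >> 7, bd);
    }
  }
}
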
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
new file mode 100644
index 0000000000..7fc0a57c90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
+ 1, 3, 5, 7 };
+
+static INLINE uint16x4_t highbd_convolve4_4(const int16x4_t s[4],
+ const int16x8_t filter,
+ const uint16x4_t max) {
+ int16x8_t s01 = vcombine_s16(s[0], s[1]);
+ int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0);
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0);
+
+ int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+ uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+ return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve4_8(const int16x8_t s[4],
+ const int16x8_t filter,
+ const uint16x8_t max,
+ uint16x8_t idx) {
+ int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
+ int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
+ int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
+ int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
+
+ int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
+ int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+ vqrshrun_n_s32(res1, FILTER_BITS));
+
+ res = vpx_tbl_u16(res, idx);
+
+ return vminq_u16(res, max);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4],
+ const int16x8_t filter,
+ const uint16x4_t max) {
+ int64x2_t sum[4];
+
+ sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+ sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+ sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+ sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+
+ sum[0] = vpaddq_s64(sum[0], sum[1]);
+ sum[2] = vpaddq_s64(sum[2], sum[3]);
+
+ int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
+
+ uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+ return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8],
+ const int16x8_t filter,
+ const uint16x8_t max) {
+ int64x2_t sum[8];
+
+ sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+ sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+ sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+ sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+ sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter);
+ sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter);
+ sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter);
+ sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter);
+
+ int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
+ int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
+ int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
+ int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);
+
+ int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+ int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+ vqrshrun_n_s32(res1, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve_4tap_horiz_sve(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) {
+ const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4(s0, filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(s1, filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(s2, filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(s3, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const uint16x8_t idx = vld1q_u16(kTblConv4_8);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x8_t d0 = highbd_convolve4_8(s0, filter, max, idx);
+ uint16x8_t d1 = highbd_convolve4_8(s1, filter, max, idx);
+ uint16x8_t d2 = highbd_convolve4_8(s2, filter, max, idx);
+ uint16x8_t d3 = highbd_convolve4_8(s3, filter, max, idx);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_sve(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+ const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+ highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap, bd);
+ } else {
+ const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+ highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap, bd);
+ }
+}
+
+void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ return;
+ }
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
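
Two details in the SVE horizontal kernels above are easy to miss. First,
vpx_dotq_s16 / vpx_dotq_lane_s16 (from the Neon/SVE bridge header) are
assumed to behave like SVE's SDOT: each 64-bit lane accumulates a dot product
of four adjacent int16 pairs, so two lanes cover an 8-tap filter and a
pairwise add finishes the sum. Second, in highbd_convolve4_8 the per-lane
results come out in the order 0,4,1,5,2,6,3,7, and kTblConv4_8 =
{ 0, 2, 4, 6, 1, 3, 5, 7 } permutes them back into pixel order. A scalar
model of the dot-product step, illustrative only:

#include <stdint.h>

/* One 64-bit lane of an SDOT-style int16 dot product. */
static int64_t dot4_s16(const int16_t *s, const int16_t *f) {
  return (int64_t)s[0] * f[0] + (int64_t)s[1] * f[1] +
         (int64_t)s[2] * f[2] + (int64_t)s[3] * f[3];
}

/* An 8-tap sum is lane 0 plus lane 1, i.e. the vpaddq_s64 above. */
static int64_t dot8_s16(const int16_t *s, const int16_t *f) {
  return dot4_s16(s, f) + dot4_s16(s + 4, f + 4);
}
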
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
new file mode 100644
index 0000000000..4ed7718f7d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
+
+// clang-format off
+DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
+ // Shift left and insert new last column in transposed 4x4 block.
+ 1, 2, 3, 0, 5, 6, 7, 4,
+ // Shift left and insert two new columns in transposed 4x4 block.
+ 2, 3, 0, 1, 6, 7, 4, 5,
+ // Shift left and insert three new columns in transposed 4x4 block.
+ 3, 0, 1, 2, 7, 4, 5, 6,
+};
+// clang-format on
+
+static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ int16x8_t res[2]) {
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03
+ // s1: 10, 11, 12, 13
+ // s2: 20, 21, 22, 23
+ // s3: 30, 31, 32, 33
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+
+ int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
+ int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
+ int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
+ int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
+
+ int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
+ int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
+
+ int32x4x2_t t0123 = vzipq_s32(s01, s23);
+
+ res[0] = vreinterpretq_s16_s32(t0123.val[0]);
+ res[1] = vreinterpretq_s16_s32(t0123.val[1]);
+}
+
+static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ int16x8_t res[4]) {
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03, 04, 05, 06, 07
+ // s1: 10, 11, 12, 13, 14, 15, 16, 17
+ // s2: 20, 21, 22, 23, 24, 25, 26, 27
+ // s3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+ // res[2]: 04 14 24 34 05 15 25 35
+ // res[3]: 06 16 26 36 07 17 27 37
+
+ int16x8x2_t s01 = vzipq_s16(s0, s1);
+ int16x8x2_t s23 = vzipq_s16(s2, s3);
+
+ int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]),
+ vreinterpretq_s32_s16(s23.val[0]));
+ int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]),
+ vreinterpretq_s32_s16(s23.val[1]));
+
+ res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]);
+ res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]);
+ res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]);
+ res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]);
+}
+
+static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4],
+ int16x8_t res[4], uint16x8_t idx) {
+ res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+ res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+ res[2] = vpx_tbl2_s16(s0[2], s1[2], idx);
+ res[3] = vpx_tbl2_s16(s0[3], s1[3], idx);
+}
+
+static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2],
+ int16x8_t res[2], uint16x8_t idx) {
+ res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+ res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2],
+ int16x8_t s_hi[2],
+ int16x8_t filter,
+ uint16x4_t max) {
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+ sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+ sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+ int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+ uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
+ return vmin_u16(res, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4],
+ const int16x8_t s_hi[4],
+ const int16x8_t filter,
+ const uint16x8_t max) {
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+ sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+ sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+ int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0);
+ sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1);
+
+ int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0);
+ sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1);
+
+ int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+ int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
+ vqrshrun_n_s32(sum4567, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve8_8tap_vert_sve2(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ assert(w >= 4 && h >= 4);
+ uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+ // Adjust the indices to account for the SVE vector length.
+ merge_tbl_idx.val[0] = vaddq_u16(
+ merge_tbl_idx.val[0],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+ merge_tbl_idx.val[1] = vaddq_u16(
+ merge_tbl_idx.val[1],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+ merge_tbl_idx.val[2] = vaddq_u16(
+ merge_tbl_idx.val[2],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+ transpose_concat_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x4_t s7, s8, s9, sA;
+
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+ transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max);
+ uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max);
+ uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max);
+ uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+ transpose_concat_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x8_t s7, s8, s9, sA;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
+ transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max);
+ uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filter, max);
+ uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max);
+ uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s0123[2] = s4567[2];
+ s0123[3] = s4567[3];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s1234[2] = s5678[2];
+ s1234[3] = s5678[3];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s2345[2] = s6789[2];
+ s2345[3] = s6789[3];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+ s3456[2] = s789A[2];
+ s3456[3] = s789A[3];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+ vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+ highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_8tap, bd);
+ }
+}
+
+void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ src -= 3 * src_stride;
+
+ uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+ // Adjust the indices to account for the SVE vector length.
+ merge_tbl_idx.val[0] = vaddq_u16(
+ merge_tbl_idx.val[0],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+ merge_tbl_idx.val[1] = vaddq_u16(
+ merge_tbl_idx.val[1],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+ merge_tbl_idx.val[2] = vaddq_u16(
+ merge_tbl_idx.val[2],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+ transpose_concat_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x4_t s7, s8, s9, sA;
+
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+ transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+ transpose_concat_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x8_t s7, s8, s9, sA;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
+ transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s0123[2] = s4567[2];
+ s0123[3] = s4567[3];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s1234[2] = s5678[2];
+ s1234[3] = s5678[3];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s2345[2] = s6789[2];
+ s2345[3] = s6789[3];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+ s3456[2] = s789A[2];
+ s3456[3] = s789A[3];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
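
The vertical kernels above keep the sample window transposed
(transpose_concat_4x4 / _8x4) so the 8-tap vertical filter becomes dot
products along vector lanes, and then avoid re-transposing every iteration:
only the four newest rows (s789A) are transposed fresh, while TBL2 lookups
over the pair { s3456, s789A } with kDotProdMergeBlockTbl synthesize the
shifted windows s4567, s5678 and s6789. SVE table lookups index across both
source vectors, so indices meant to select from the second vector must be
rebased by the runtime vector length; that is what the svcnth() additions
do. A scalar model of the two-vector lookup, illustrative only:

#include <stdint.h>

/* With n 16-bit lanes per vector, index i < n selects from the first table
   and i >= n from the second; n is svcnth() on real hardware. */
static void tbl2_u16_model(const uint16_t *t0, const uint16_t *t1, int n,
                           const uint16_t *idx, uint16_t *out) {
  for (int i = 0; i < n; ++i) {
    out[i] = idx[i] < (uint16_t)n ? t0[idx[i]] : t1[idx[i] - n];
  }
}
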
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
deleted file mode 100644
index 414ade3530..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h, int bd) {
- // + 1 to make it divisible by 4
- uint16_t temp[64 * 136];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- /* Filter starting 3 lines back. The neon implementation will ignore the given
- * height and filter a multiple of 4 lines. Since this goes in to the temp
- * buffer which has lots of extra room and is subsequently discarded this is
- * safe if somewhat less than ideal. */
- vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height, bd);
-
- /* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h, int bd) {
- // + 1 to make it divisible by 4
- uint16_t temp[64 * 136];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height, bd);
- vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
- bd);
-}
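
The temp buffer sizing in the wrapper deleted above checks out as follows,
assuming the usual libvpx limits (w, h <= 64 and y_step_q4 <= 32, i.e. at
most a 2:1 vertical scale step): the vertical pass consumes at most roughly
h * y_step_q4 / 16 + (SUBPEL_TAPS - 1) = 64 * 2 + 7 = 135 intermediate rows,
and the "+ 1" noted in the comment rounds that to 136 so the row count stays
divisible by 4, giving the 64 * 136 bound.
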
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
index c54e588239..579096d78a 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_) // flip_sign_16
#define FUN_FLIP_SIGN_BACK(w, r) \
static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
- const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \
+ const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80); \
return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \
}
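
The cast here is a warning fix rather than a behaviour change: 0x80 is the
int value 128, which is out of range for int8_t, so the implicit narrowing
in vdup_n_s8(0x80) draws implicit-conversion diagnostics; the explicit cast
documents that the 0x80 bit pattern (-128 on the two's-complement targets
libvpx supports) is intended. A minimal sketch:

#include <arm_neon.h>

/* (int8_t)0x80 is -128, i.e. only the sign bit set, in every lane. XORing
   with it moves pixel values between signed and unsigned ranges. */
static inline int8x8_t sign_bit_s8(void) {
  return vdup_n_s8((int8_t)0x80);
}
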
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
index 38b0b6c1a9..268c4bd962 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -154,11 +154,10 @@ static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x2_t a_u32;
- if (stride == 4) return vld1_u8(buf);
+ uint32x2_t a_u32 = vdup_n_u32(0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vdup_n_u32(a);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
@@ -177,11 +176,10 @@ static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
ptrdiff_t stride) {
uint64_t a;
- uint64x2_t a_u64;
- if (stride == 4) return vld1q_u16(buf);
+ uint64x2_t a_u64 = vdupq_n_u64(0);
memcpy(&a, buf, 8);
buf += stride;
- a_u64 = vdupq_n_u64(a);
+ a_u64 = vsetq_lane_u64(a, a_u64, 0);
memcpy(&a, buf, 8);
a_u64 = vsetq_lane_u64(a, a_u64, 1);
return vreinterpretq_u16_u64(a_u64);
@@ -191,10 +189,6 @@ static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
const uint8x8_t a) {
const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
- if (stride == 4) {
- vst1_u8(buf, a);
- return;
- }
uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
@@ -204,11 +198,10 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x4_t a_u32;
- if (stride == 4) return vld1q_u8(buf);
+ uint32x4_t a_u32 = vdupq_n_u32(0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vdupq_n_u32(a);
+ a_u32 = vsetq_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -225,10 +218,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
const uint8x16_t a) {
const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
- if (stride == 4) {
- vst1q_u8(buf, a);
- return;
- }
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
@@ -449,6 +438,142 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
vst1q_u8(s, s7);
}
+static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2) {
+ vst1_u16(s, s0);
+ s += p;
+ vst1_u16(s, s1);
+ s += p;
+ vst1_u16(s, s2);
+}
+
+static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2, const uint16x4_t s3) {
+ vst1_u16(s, s0);
+ s += p;
+ vst1_u16(s, s1);
+ s += p;
+ vst1_u16(s, s2);
+ s += p;
+ vst1_u16(s, s3);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+ int16x4_t *s6) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+ int16x8_t *s6) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+}
+
static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
@@ -470,4 +595,46 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
*s7 = vld1q_u16(s);
}
+static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+ int16x4_t *s6, int16x4_t *s7) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+ int16x8_t *s6, int16x8_t *s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
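
The unaligned-load helpers earlier in this header gather rows narrower than
a vector with memcpy, which is safe for any alignment, and now build the
result with vset_lane into a zeroed register instead of a vdup of the first
row; the stride == 4 fast paths that issued a single full-width load or
store are gone. A minimal model of the pattern, mirroring the lane layout
the kernels above rely on:

#include <arm_neon.h>
#include <stddef.h>
#include <string.h>

/* Two 4-byte rows packed into one 8-byte vector: row 0 in lanes 0-3 and
   row 1 in lanes 4-7, as in load_unaligned_u8. */
static uint8x8_t load_two_rows_u8(const uint8_t *buf, ptrdiff_t stride) {
  uint32_t r0, r1;
  memcpy(&r0, buf, 4);
  memcpy(&r1, buf + stride, 4);
  uint32x2_t v = vdup_n_u32(0);
  v = vset_lane_u32(r0, v, 0);
  v = vset_lane_u32(r1, v, 1);
  return vreinterpret_u8_u32(v);
}
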
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..a18cbbd736
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) {
+ if (size == 4) {
+ int16x4_t s[4];
+ int64x2_t sum = vdupq_n_s64(0);
+
+ s[0] = vld1_s16(src + 0 * stride);
+ s[1] = vld1_s16(src + 1 * stride);
+ s[2] = vld1_s16(src + 2 * stride);
+ s[3] = vld1_s16(src + 3 * stride);
+
+ int16x8_t s01 = vcombine_s16(s[0], s[1]);
+ int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+ sum = vpx_dotq_s16(sum, s01, s01);
+ sum = vpx_dotq_s16(sum, s23, s23);
+
+ return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum));
+ } else {
+ int rows = size;
+ int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ const int16_t *src_ptr = src;
+ int cols = size;
+
+ do {
+ int16x8_t s[8];
+ load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+
+ sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]);
+ sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]);
+ sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]);
+ sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]);
+ sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]);
+ sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]);
+ sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]);
+ sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]);
+
+ src_ptr += 8;
+ cols -= 8;
+ } while (cols);
+
+ src += 8 * stride;
+ rows -= 8;
+ } while (rows);
+
+ sum[0] = vaddq_s64(sum[0], sum[1]);
+ sum[2] = vaddq_s64(sum[2], sum[3]);
+ sum[0] = vaddq_s64(sum[0], sum[2]);
+
+ return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0]));
+ }
+}
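
A scalar reference for the kernel above; each squared int16 is at most 2^30,
so the running total needs 64 bits for any non-trivial block, which is why
the SVE code accumulates directly into 64-bit dot-product lanes:

#include <stdint.h>

static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int size) {
  uint64_t sum = 0;
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int64_t v = src[r * stride + c];
      sum += (uint64_t)(v * v);
    }
  }
  return sum;
}
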
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
index 74f85a6bb6..c989a6721b 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -524,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
*a7 = vreinterpretq_s32_s64(c3.val[1]);
}
-// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
-// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
uint8x8_t *a6, uint8x8_t *a7) {
- // Swap 8 bit elements. Goes from:
+ // Widen to 128-bit registers (usually a no-op once inlined).
+ const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
+ const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
+ const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
+ const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
+ const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
+ const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
+ const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
+ const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));
+
+ // Zip 8 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
@@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
- // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
- // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
- // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
- // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
-
- const uint8x16x2_t b0 =
- vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
- const uint8x16x2_t b1 =
- vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
-
- // Swap 16 bit elements resulting in:
- // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
- // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
- // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
- // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
-
- const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
- vreinterpretq_u16_u8(b1.val[0]));
- const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
- vreinterpretq_u16_u8(b1.val[1]));
-
- // Unzip 32 bit elements resulting in:
+ // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
+ const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
+ const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
+ const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];
+
+ // Zip 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // c1.val[1]: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const uint16x8x2_t c0 =
+ vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
+ const uint16x8x2_t c1 =
+ vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));
+
+ // Zip 32 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
// d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c1.val[0]));
- const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c1.val[1]));
*a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
*a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
- *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
- *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
- *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
- *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
*a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
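
The rewrite above trades the vtrn/vuzp sequence for three rounds of zips, so
the transposed rows fall out of d0/d1 in order and the final extraction is a
straight run of vget_low / vget_high pairs. A scalar reference that either
version can be checked against:

#include <stdint.h>

/* In-place 8x8 byte transpose: element (r, c) moves to (c, r). */
static void transpose_8x8_ref(uint8_t m[8][8]) {
  for (int r = 0; r < 8; ++r) {
    for (int c = r + 1; c < 8; ++c) {
      const uint8_t t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}
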
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 65fb67c984..037ea1142d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -20,44 +20,36 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-// Note:
-// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
-// 2. After refactoring the shared code in kernel loops with inline functions,
-// the decoder speed dropped a lot when using gcc compiler. Therefore there is
-// no refactoring for those parts by now.
-// 3. For horizontal convolve, there is an alternative optimization that
-// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
-// samples in each are read from memory: src, (src+1), (src+2), (src+3),
-// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
-// instructions. This optimization is much faster in speed unit test, but slowed
-// down the whole decoder by 5%.
-
-static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x4_t filter) {
+static INLINE void convolve_4tap_horiz_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const uint8x8_t x_filter =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2),
+ vdup_lane_u8(x_filter, 3),
+ vdup_lane_u8(x_filter, 4),
+ vdup_lane_u8(x_filter, 5) };
+
if (w == 4) {
do {
- int16x4_t s0[4], s1[4];
-
- int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src)));
- s0[0] = vget_low_s16(vextq_s16(t0, t0, 0));
- s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
- s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
- s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
+ uint8x8_t s01[4];
- int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride)));
- s1[0] = vget_low_s16(vextq_s16(t1, t1, 0));
- s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
- s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
- s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
+ s01[0] = load_unaligned_u8(src + 0, src_stride);
+ s01[1] = load_unaligned_u8(src + 1, src_stride);
+ s01[2] = load_unaligned_u8(src + 2, src_stride);
+ s01[3] = load_unaligned_u8(src + 3, src_stride);
- int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
- int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps);
- store_u8(dst, dst_stride, d01);
+ store_unaligned_u8(dst, dst_stride, d01);
src += 2 * src_stride;
dst += 2 * dst_stride;
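The halving above is exact because every 4-tap and bilinear kernel value is even, so the final rounding shift drops from FILTER_BITS (7 in vpx_filter.h) to FILTER_BITS - 1 without changing the result. A scalar sketch of that equivalence, using a hypothetical even 4-tap kernel that sums to 1 << FILTER_BITS:

```c
#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

// Round-to-nearest right shift, as used by the convolution kernels.
static int round_shift(int v, int bits) {
  return (v + (1 << (bits - 1))) >> bits;
}

int main(void) {
  // Hypothetical even 4-tap kernel summing to 1 << FILTER_BITS.
  const int taps[4] = { -4, 36, 100, -4 };
  const int half[4] = { -2, 18, 50, -2 };  // taps halved
  const uint8_t s[4] = { 23, 180, 91, 7 }; // arbitrary samples

  int full = 0, halved = 0;
  for (int k = 0; k < 4; ++k) {
    full += s[k] * taps[k];
    halved += s[k] * half[k];
  }
  // Halving every (even) tap halves the sum exactly, so shifting by one bit
  // less reproduces the same rounded result.
  assert(round_shift(full, FILTER_BITS) ==
         round_shift(halved, FILTER_BITS - 1));
  return 0;
}
```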
@@ -70,25 +62,20 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
int width = w;
do {
- int16x8_t t0[2], t1[2];
- int16x8_t s0[4], s1[4];
-
- t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8)));
- s0[0] = vextq_s16(t0[0], t0[1], 0);
- s0[1] = vextq_s16(t0[0], t0[1], 1);
- s0[2] = vextq_s16(t0[0], t0[1], 2);
- s0[3] = vextq_s16(t0[0], t0[1], 3);
-
- t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride)));
- t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8)));
- s1[0] = vextq_s16(t1[0], t1[1], 0);
- s1[1] = vextq_s16(t1[0], t1[1], 1);
- s1[2] = vextq_s16(t1[0], t1[1], 2);
- s1[3] = vextq_s16(t1[0], t1[1], 3);
-
- uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
- uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
+ uint8x8_t s0[4], s1[4];
+
+ s0[0] = vld1_u8(s + 0);
+ s0[1] = vld1_u8(s + 1);
+ s0[2] = vld1_u8(s + 2);
+ s0[3] = vld1_u8(s + 3);
+
+ s1[0] = vld1_u8(s + src_stride + 0);
+ s1[1] = vld1_u8(s + src_stride + 1);
+ s1[2] = vld1_u8(s + src_stride + 2);
+ s1[3] = vld1_u8(s + src_stride + 3);
+
+ uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps);
+ uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps);
vst1_u8(d, d0);
vst1_u8(d + dst_stride, d1);
@@ -103,47 +90,41 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x8_t filter) {
- uint8x8_t t0, t1, t2, t3;
-
+static INLINE void convolve_8tap_horiz_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
if (h == 4) {
- uint8x8_t d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
src += 7;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ transpose_u8_8x4(&t7, &t8, &t9, &t10);
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
transpose_u8_4x4(&d01, &d23);
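convolve8_4 and convolve8_8, combined with the vqrshrun_n_s16(..., FILTER_BITS) narrowing above, implement the standard 8-tap sub-pel FIR; the transposes simply let a horizontal filter run down NEON register lanes. For reference, a scalar sketch of the per-pixel operation, assuming the usual libvpx rounding and clamping (the function name is illustrative):

```c
#include <stdint.h>

#define FILTER_BITS 7

// One output pixel of an 8-tap sub-pel filter: a weighted sum of 8 input
// samples, rounded at FILTER_BITS and saturated to [0, 255] -- the scalar
// equivalent of convolve8_4()/convolve8_8() plus the vqrshrun narrowing.
static uint8_t convolve8_pixel(const uint8_t *s, const int16_t *taps) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += s[k] * taps[k];
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
```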
@@ -162,52 +143,33 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
w -= 4;
} while (w != 0);
} else {
- int width;
- const uint8_t *s;
- uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
if (w == 4) {
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+
transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
transpose_u8_8x4(&d04, &d15, &d26, &d37);
@@ -216,57 +178,53 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+ src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
} else {
- uint8_t *d;
- uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t s11, s12, s13, s14;
-
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+ int width = w;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+
+ transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+ uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+ uint8x8_t d6 =
+ convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+ uint8x8_t d7 =
+ convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -304,17 +262,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
(void)y0_q4;
(void)y_step_q4;
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements.
- */
- const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1);
- vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
- x_filter_4tap);
+ convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter);
} else {
- const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
- vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
- x_filter_8tap);
+ convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter);
}
}
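The dispatch above loads the full 8-entry kernel once and takes the narrower path when only the middle taps are in use; rebasing src by -1 (4-tap) or -3 (8-tap) keeps the active taps centred on the output pixel. A hypothetical model of the tap-count probe (the real vpx_get_filter_taps may be implemented differently):

```c
#include <stdint.h>

// Hypothetical sketch of the tap-count probe: an 8-entry kernel with zeros
// in its outer positions behaves as a centred 4-tap (or bilinear) filter,
// which is why the 4-tap path above duplicates lanes 2..5 only.
static int filter_taps_model(const int16_t k[8]) {
  if (k[0] | k[1] | k[6] | k[7]) return 8;  // outer taps in use
  if (k[2] | k[5]) return 4;                // inner 4 taps in use
  return 2;                                 // bilinear: only k[3], k[4]
}
```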
@@ -324,7 +279,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
- uint8x8_t t0, t1, t2, t3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -337,48 +291,41 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
if (h == 4) {
- uint8x8_t d01, d23, dd01, dd23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
src += 7;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ transpose_u8_8x4(&t7, &t8, &t9, &t10);
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
transpose_u8_4x4(&d01, &d23);
- dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
- dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
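The vrhadd_u8 calls above merge the freshly filtered pixels with the existing destination using a rounding halving add, which is what distinguishes the _avg_ kernels from the plain ones. Scalar equivalent for a single pixel (a sketch, not patch code):

```c
#include <stdint.h>

// Rounding average of the filtered result and the current destination
// pixel: the scalar meaning of vrhadd_u8() in the _avg_ kernels.
static uint8_t avg_round(uint8_t filtered, uint8_t dst) {
  return (uint8_t)((filtered + dst + 1) >> 1);
}
```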
@@ -398,61 +345,40 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- int width;
- const uint8_t *s;
- uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
if (w == 4) {
- uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
-
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+
transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&d04, &d15, &d26, &d37);
- dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
- dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
- dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
- dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
d04 = vrhadd_u8(d04, dd04);
d15 = vrhadd_u8(d15, dd15);
@@ -464,65 +390,54 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+ src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
} while (h != 0);
} else {
- uint8_t *d;
- uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t s11, s12, s13, s14;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+ int width = w;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+
+ transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ uint8x8_t d5 =
+ convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ uint8x8_t d6 =
+ convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ uint8x8_t d7 =
+ convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -556,152 +471,37 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x4_t filter) {
- if (w == 4) {
- uint8x8_t t0, t1, t2, t3, d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3;
-
- load_u8_8x3(src, src_stride, &t0, &t1, &t2);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-
- src += 3 * src_stride;
-
- do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve4_4(s0, s1, s2, s3, filter);
- d1 = convolve4_4(s1, s2, s3, s4, filter);
- d2 = convolve4_4(s2, s3, s4, s5, filter);
- d3 = convolve4_4(s3, s4, s5, s6, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6;
-
- do {
- load_u8_8x3(src, src_stride, &t0, &t1, &t2);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-
- s = src + 3 * src_stride;
- d = dst;
- height = h;
-
- do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve4_8(s0, s1, s2, s3, filter);
- d1 = convolve4_8(s1, s2, s3, s4, filter);
- d2 = convolve4_8(s2, s3, s4, s5, filter);
- d3 = convolve4_8(s3, s4, s5, s6, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x8_t filter) {
+static INLINE void convolve_8tap_vert_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
if (w == 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
src += 7 * src_stride;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -718,54 +518,33 @@ static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src + 7 * src_stride;
- d = dst;
- height = h;
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7 * src_stride;
+ uint8_t *d = dst;
+ int height = h;
do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
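The 4-tap vertical path deleted here is not gone: it reappears later in this patch in vpx_convolve8_neon.h as convolve_4tap_vert_neon, reworked to filter uint8 samples directly. For reference, a scalar model of one output column of a 4-tap vertical filter, assuming the usual libvpx rounding and clamping (the function name is illustrative):

```c
#include <stddef.h>
#include <stdint.h>

#define FILTER_BITS 7

// Scalar model of one column of the 4-tap vertical filter: src points at
// the first row of the 4-row window for output row 0. The NEON version
// keeps a rolling window of rows instead ("s0 = s4; s1 = s5; s2 = s6").
static void vert4_column_model(const uint8_t *src, ptrdiff_t stride,
                               uint8_t *dst, ptrdiff_t dst_stride, int h,
                               const int16_t taps[4]) {
  for (int y = 0; y < h; ++y) {
    int sum = 0;
    for (int k = 0; k < 4; ++k) sum += src[(y + k) * stride] * taps[k];
    sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
    dst[y * dst_stride] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}
```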
@@ -800,17 +579,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
(void)x_step_q4;
(void)y_step_q4;
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements.
- */
- const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1);
- vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride,
- w, h, y_filter_4tap);
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
- vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap);
+ convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride,
+ w, h, y_filter);
}
}
@@ -832,45 +608,35 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
src += 7 * src_stride;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -890,54 +656,33 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src + 7 * src_stride;
- d = dst;
- height = h;
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7 * src_stride;
+ uint8_t *d = dst;
+ int height = h;
do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
index 4ecaee0f99..10cc761ccd 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -17,360 +17,6 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4,
- int y_step_q4, int w, int h);
-
-static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Accumulate dot product into 'correction' to account for range clamp. */
- int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16_t permute_tbl) {
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- int8x16_t clamped_samples =
- vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
- sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
- sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
- sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h);
-
-static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples,
- const int8x8_t filters) {
- int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16_t permute_tbl) {
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
-
- int32x4_t sum =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- /* First 4 output values. */
- int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- /* First 4 output values. */
- int32x4_t sum0 =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum;
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
- sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
- const uint8x16_t samples0_hi,
- const uint8x16_t samples1_lo,
- const uint8x16_t samples1_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
- sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
- sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x3_t permute_tbl) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
- sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t filters) {
- int16x4_t sum = vmul_lane_s16(s0, filters, 0);
- sum = vmla_lane_s16(sum, s1, filters, 1);
- sum = vmla_lane_s16(sum, s2, filters, 2);
- sum = vmla_lane_s16(sum, s3, filters, 3);
- return sum;
-}
-
-static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x4_t filters) {
- int16x8_t sum = vmulq_lane_s16(s0, filters, 0);
- sum = vmlaq_lane_s16(sum, s1, filters, 1);
- sum = vmlaq_lane_s16(sum, s2, filters, 2);
- sum = vmlaq_lane_s16(sum, s3, filters, 3);
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
@@ -428,4 +74,99 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
filters);
}
+// 2-tap (bilinear) filter values are always positive, but 4-tap filter values
+// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much
+// greater positive values to compensate. To use instructions that operate on
+// 8-bit types we also need the types to be unsigned. Subtracting the products
+// of taps 0 and 3 from the products of taps 1 and 2 always works given that
+// 2-tap filters are 0-padded.
+static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t filter_taps[4]) {
+ uint16x8_t sum = vmull_u8(s1, filter_taps[1]);
+ sum = vmlal_u8(sum, s2, filter_taps[2]);
+ sum = vmlsl_u8(sum, s0, filter_taps[0]);
+ sum = vmlsl_u8(sum, s3, filter_taps[3]);
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
+}
+
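A minimal scalar sketch of why the unsigned scheme in convolve4_8 is safe, assuming a representative halved 4-tap kernel (-2, +34, +34, -2) and arbitrary sample values: the uint16 accumulator may wrap, but reinterpreting the bits as int16 recovers the signed convolution, which is what the vreinterpretq_s16_u16 above relies on.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Hypothetical halved 4-tap kernel: -2, +34, +34, -2 (magnitudes below). */
  const uint8_t neg0 = 2, pos1 = 34, pos2 = 34, neg3 = 2;
  const uint8_t s0 = 255, s1 = 0, s2 = 3, s3 = 255;

  /* Reference signed convolution. */
  int32_t ref = -2 * s0 + 34 * s1 + 34 * s2 - 2 * s3;

  /* vmull_u8 / vmlal_u8 / vmlsl_u8 model: uint16 maths, wrapping mod 2^16. */
  uint16_t acc = (uint16_t)(s1 * pos1);
  acc = (uint16_t)(acc + s2 * pos2);
  acc = (uint16_t)(acc - s0 * neg0);
  acc = (uint16_t)(acc - s3 * neg3);

  /* vreinterpretq_s16_u16 model: same bits, two's-complement signed view. */
  assert((int16_t)acc == ref);
  return 0;
}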
+static INLINE void convolve_4tap_vert_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const uint8x8_t y_filter =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+ uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+
+ src += 2 * src_stride;
+
+ do {
+ uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+ uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+ uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
+ uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
+
+ uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps);
+ uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ s01 = s45;
+ s12 = s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x8_t s0, s1, s2;
+ load_u8_8x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x8_t s3, s4, s5, s6;
+ load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps);
+ uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps);
+ uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps);
+ uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
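A scalar sketch of the |filter| >> 1 narrowing above, assuming a representative zero-padded 4-tap kernel: 4-tap and bilinear taps are even and no larger than 128 in magnitude, so the halved absolute values fit uint8_t, and the mlal/mlsl calls in convolve4_8 re-apply the signs.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  /* Hypothetical 4-tap kernel in 8-element layout (taps live in 2..5). */
  const int16_t filter[8] = { 0, 0, -4, 68, 68, -4, 0, 0 };

  uint8_t halved[8];
  for (int i = 0; i < 8; i++) {
    uint16_t a = (uint16_t)abs(filter[i]);
    halved[i] = (uint8_t)(a >> 1);  /* vshrn_n_u16(vabsq_s16(filter), 1) */
  }

  /* Lanes 2..5 become the per-tap vectors duplicated by vdup_lane_u8. */
  assert(halved[2] == 2 && halved[3] == 34 && halved[4] == 34 &&
         halved[5] == 2);
  return 0;
}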
#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
index 00bac3b9cf..b05a49d3fe 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -20,270 +20,139 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
+// Filter values always sum to 128.
+#define FILTER_SUM 128
+
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
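A scalar sketch of what the first row of dot_prod_permute_tbl buys, assuming a hypothetical halved 4-tap kernel: one table lookup arranges four overlapping 4-sample windows back to back, so a single dot-product instruction produces four neighbouring output pixels.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const uint8_t src[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
                            8, 9, 10, 11, 12, 13, 14, 15 };
  const uint8_t tbl[16] = { 0, 1, 2, 3, 1, 2, 3, 4,
                            2, 3, 4, 5, 3, 4, 5, 6 };
  const int8_t taps[4] = { -2, 34, 34, -2 };  /* hypothetical halved kernel */

  uint8_t perm[16];
  for (int i = 0; i < 16; i++) perm[i] = src[tbl[i]];  /* vqtbl1q_u8 model */

  /* Each group of four permuted bytes dots with the four taps: one output
   * pixel per group, i.e. a single vdotq_lane_s32 yields outputs 0..3. */
  for (int out = 0; out < 4; out++) {
    int32_t sum = 0;
    for (int k = 0; k < 4; k++) sum += taps[k] * perm[4 * out + k];
    printf("out[%d] = %d\n", out, sum);
  }
  return 0;
}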
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
+ // Shift left and insert new last column in transposed 4x4 block.
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
+ // Shift left and insert two new columns in transposed 4x4 block.
2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
+ // Shift left and insert three new columns in transposed 4x4 block.
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
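A scalar model of how the first row of dot_prod_merge_block_tbl is used, with stand-in data: the vertical loops keep the previous transposed block (s3456), and one vqtbl2q_s8 over { s3456, s78910 } rebuilds s4567 by shifting each transposed 4x4 block left one column and pulling the new column from the fresh data.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* Stand-in transposed blocks: four 4-row columns each. */
  int8_t s3456[16], s78910[16];
  for (int i = 0; i < 16; i++) {
    s3456[i] = (int8_t)(30 + i);
    s78910[i] = (int8_t)(70 + i);
  }

  /* First row of dot_prod_merge_block_tbl. */
  const uint8_t tbl[16] = { 1, 2,  3,  16, 5,  6,  7,  20,
                            9, 10, 11, 24, 13, 14, 15, 28 };

  /* vqtbl2q_s8 model: indices 0-15 read the first table register,
   * indices 16-31 read the second. */
  int8_t s4567[16];
  for (int i = 0; i < 16; i++)
    s4567[i] = (tbl[i] < 16) ? s3456[tbl[i]] : s78910[tbl[i] - 16];

  for (int i = 0; i < 16; i++) printf("%d ", s4567[i]);
  printf("\n");
  return 0;
}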
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
- store_u8_8x3(d, dst_stride, d0, d1, d2);
+ // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+ // by 2 since we halved the filter values.)
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+ int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- }
+ // Further narrowing and packing is performed by the caller.
+ return vmovn_s32(sum);
}
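A quick scalar check of the range-transform correction used by convolve4_4_h, assuming a hypothetical halved kernel and arbitrary samples: shifting every sample down by 128 perturbs the dot product by exactly 128 * sum(filter), so seeding the accumulator with that constant (here 128 * FILTER_SUM / 2, since the taps were halved) restores the true result.

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int8_t f[4] = { -2, 34, 34, -2 };  /* halved taps, sum = 64 */
  const uint8_t s[4] = { 7, 250, 128, 0 };

  int32_t filter_sum = 0, plain = 0, shifted = 0;
  for (int i = 0; i < 4; i++) {
    filter_sum += f[i];
    plain += f[i] * s[i];
    shifted += f[i] * ((int32_t)s[i] - 128);  /* samples now in [-128, 127] */
  }

  /* The kernel seeds acc with 128 * FILTER_SUM / 2 = 128 * 64 for the same
   * reason: the correction exactly cancels the -128 sample offset. */
  assert(shifted + 128 * filter_sum == plain);
  return 0;
}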
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+ // by 2 since we halved the filter values.)
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4,
- int y_step_q4, int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
- assert(x_step_q4 == 16);
-
- (void)x_step_q4;
- (void)y0_q4;
- (void)y_step_q4;
-
- if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst,
- dst_stride, w, h, x_filter_4tap,
- correction_4tap, range_limit);
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);
+
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
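A back-of-envelope check of why convolve8_4_h narrows with vshrn_n_s32(sum, 1) and lets the caller finish with a FILTER_BITS - 1 rounding shift, assuming a representative sharp 8-tap kernel (values assumed; they sum to 128): the raw dot product can exceed int16_t range, and the early 1-bit shift restores headroom while the two shifts still total FILTER_BITS.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Representative sharp 8-tap kernel (values assumed; sum is 128). */
  const int16_t f[8] = { -1, 3, -7, 127, 8, -3, 1, 0 };

  int32_t max_dot = 0;
  for (int i = 0; i < 8; i++)
    if (f[i] > 0) max_dot += 255 * f[i];  /* worst-case positive sum */

  assert(max_dot > INT16_MAX);          /* 35445: too big for int16_t      */
  assert((max_dot >> 1) <= INT16_MAX);  /* fits after the early 1-bit shift */
  return 0;
}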
- } else {
- vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst,
- dst_stride, w, h, x_filter_8tap,
- correction_8tap, range_limit);
- }
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+ sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
+static INLINE void convolve_4tap_horiz_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+ int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -293,23 +162,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -324,26 +191,22 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
+static INLINE void convolve_8tap_horiz_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -353,23 +216,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -389,11 +250,6 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
@@ -403,21 +259,21 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
+    // Load the 4-tap filter into the first 4 elements of the vector.
+ // All 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride,
- w, h, x_filter_4tap, correction_4tap,
- range_limit);
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+ convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap);
} else {
- vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride,
- w, h, x_filter_8tap, correction_8tap,
- range_limit);
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap);
}
}
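A small sketch of the kernel layout this dispatch assumes, with hypothetical tap values: sub-8-tap kernels are stored zero-padded in an 8-element InterpKernel with the live taps in elements 2..5, which is why the 4-tap path loads from filter[x0_q4] + 2 and anchors at src - 1 rather than src - 3.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* Hypothetical 4-tap kernel stored in 8-tap InterpKernel layout. */
  const int16_t kernel[8] = { 0, 0, -4, 68, 68, -4, 0, 0 };

  /* What vld1_s16(filter[x0_q4] + 2) reads in the 4-tap path. */
  const int16_t *taps4 = kernel + 2;
  for (int i = 0; i < 4; i++) printf("%d ", taps4[i]);
  printf("\n"); /* -4 68 68 -4 */
  return 0;
}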
@@ -428,10 +284,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x16_t range_limit = vdupq_n_u8(128);
- uint8x16_t s0, s1, s2, s3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -444,22 +296,21 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
src -= 3;
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23, dd01, dd23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -472,24 +323,23 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -511,260 +361,142 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
}
static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
+ int8x8_t a3, int8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ //
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+ int16x8_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
+
+ *b = vreinterpretq_s8_s16(a0123);
}
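A scalar model of the zip-based transpose_concat_4x4 above, using stand-in row values: interleaving rows 0/1 and 2/3 byte-wise, then interleaving those results as 16-bit pairs, lays the 4x4 block out column-major, ready for one 4-tap dot product per column.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  const int8_t row[4][4] = {
    { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }, { 13, 14, 15, 16 }
  };

  /* Stage 1: byte-wise zip of rows 0/1 and 2/3 (vzipq_s8, low half). */
  int8_t a01[8], a23[8];
  for (int i = 0; i < 4; i++) {
    a01[2 * i] = row[0][i];
    a01[2 * i + 1] = row[1][i];
    a23[2 * i] = row[2][i];
    a23[2 * i + 1] = row[3][i];
  }

  /* Stage 2: 16-bit-pair zip of the two results (vzipq_s16, low half). */
  int8_t b[16];
  for (int i = 0; i < 4; i++) {
    memcpy(&b[4 * i], &a01[2 * i], 2);
    memcpy(&b[4 * i + 2], &a23[2 * i], 2);
  }

  /* b holds the block transposed: input column j occupies b[4j..4j+3]. */
  for (int j = 0; j < 4; j++)
    for (int i = 0; i < 4; i++) assert(b[4 * j + i] == row[i][j]);
  return 0;
}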
static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+ int8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+ int16x8x2_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
+
+ *b0 = vreinterpretq_s8_s16(a0123.val[0]);
+ *b1 = vreinterpretq_s8_s16(a0123.val[1]);
}
-static INLINE void vpx_convolve_4tap_vert_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x8_t range_limit) {
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- src += 7 * src_stride;
+static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int8x8_t filters) {
+ // The sample range transform and permutation are performed by the caller.
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
- do {
- uint8x8_t t7, t8, t9, t10;
- load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- d0 = convolve4_4_sdot_partial(s0123, correction, filter);
- d1 = convolve4_4_sdot_partial(s1234, correction, filter);
- d2 = convolve4_4_sdot_partial(s2345, correction, filter);
- d3 = convolve4_4_sdot_partial(s3456, correction, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
- load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter);
- d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter);
- d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter);
- d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456_lo = s78910_lo;
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456_hi = s78910_hi;
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
+static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int8x8_t filters) {
+ // The sample range transform and permutation are performed by the caller.
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
+static INLINE void convolve_8tap_vert_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x8_t range_limit) {
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
src += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ int8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456, s78910 } };
+ int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -781,83 +513,70 @@ static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ int8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filter);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filter);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filter);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filter);
+ int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -883,11 +602,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS)));
- const uint8x8_t range_limit = vdup_n_u8(128);
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
@@ -897,20 +611,15 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t y_filter_4tap =
- vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_4tap,
- correction_4tap, range_limit);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap,
- correction_8tap, range_limit);
+ const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter);
}
}
@@ -921,13 +630,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x8_t range_limit = vdup_n_u8(128);
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -940,59 +643,54 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
src -= 3 * src_stride;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23, dd01, dd23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
src += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ int8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456, s78910 } };
+ int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -1000,8 +698,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -1012,79 +710,67 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ int8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
-
+ int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -1094,8 +780,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -1115,3 +801,275 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
} while (w != 0);
}
}
+
+static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t x_filter,
+ const uint8x8_t y_filter) {
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+ int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+ src += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+ int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+ int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+ int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+ uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+ uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+ uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+ uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+ uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ v_s01 = v_s45;
+ v_s12 = v_s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+ uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+ uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+ uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+ uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+ uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+ uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+ uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+ uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+ uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+ uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
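
The convolve4_8 helper called above is defined in a shared Neon header that this patch does not modify. As a rough mental model only, here is a minimal sketch of the duplicated-tap approach the comment describes, assuming non-negative taps (which holds for the bilinear kernels this <= 4-tap path serves, and is enforced by the vabsq_s16 in the caller); the name convolve4_8_sketch is illustrative, not libvpx API:

static INLINE uint8x8_t convolve4_8_sketch(uint8x8_t s0, uint8x8_t s1,
                                           uint8x8_t s2, uint8x8_t s3,
                                           const uint8x8_t taps[4]) {
  // Widening multiply-accumulate, each tap pre-duplicated across a vector.
  uint16x8_t sum = vmull_u8(s0, taps[0]);
  sum = vmlal_u8(sum, s1, taps[1]);
  sum = vmlal_u8(sum, s2, taps[2]);
  sum = vmlal_u8(sum, s3, taps[3]);
  // Filter taps were halved, hence FILTER_BITS - 1 rather than FILTER_BITS.
  return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
}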
+
+static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
+    // below for further details on possible values of block height.
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 =
+ vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
+    // below for further details on possible values of block height.
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
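
Why the tail in this helper is always exactly three rows: the vertical pass that follows consumes im_height = h + SUBPEL_TAPS - 1 = h + 7 intermediate rows, and every block height h passed to these functions is a multiple of 4, so im_height = 4n + 7 = 4(n + 1) + 3, giving im_height % 4 == 3. The main loop therefore runs while h > 3 and the remaining three rows are handled separately.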
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int8x8_t x_filter_4tap =
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+ const uint8x8_t y_filter_4tap =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+ convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter_4tap,
+ y_filter_4tap);
+ return;
+ }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8, we can reduce the
+ // maximum buffer size to 64 * (64 + 7).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS - 1;
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_8tap);
+}
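
The tap-halving trick on the 4-tap path is exact, not an approximation. With FILTER_BITS == 7 the full-precision rounding is (sum + 64) >> 7; because all 4-tap and bilinear tap values are even, sum is even, and (sum + 64) >> 7 == (sum / 2 + 32) >> 6, which is why the narrowing shifts above use FILTER_BITS - 1. A scalar sanity check of the identity (illustrative only, assuming arithmetic right shift):

#include <assert.h>
#include <stdint.h>

static void check_halved_rounding(int32_t sum) {
  assert((sum & 1) == 0);  // even taps imply an even sum
  assert(((sum + 64) >> 7) == ((sum / 2 + 32) >> 6));
}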
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+
+ // Averaging convolution always uses an 8-tap filter.
+ // Account for the vertical phase needing 3 lines prior and 4 lines post.
+ const int im_height = h + SUBPEL_TAPS - 1;
+ const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
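
In both _avg paths the blend with the destination is vrhadd_u8, Neon's rounding halving add. Per lane it computes d = (p + q + 1) >> 1, i.e. the average rounded up, matching the scalar ROUND_POWER_OF_TWO(p + q, 1) convention used by the C convolve_avg code. A scalar model (illustrative):

static INLINE uint8_t rounded_avg_u8(uint8_t p, uint8_t q) {
  return (uint8_t)(((uint16_t)p + q + 1) >> 1);
}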
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
index bcad1dd121..e582004133 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -26,255 +26,112 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
+ // Shift left and insert new last column in transposed 4x4 block.
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
+ // Shift left and insert two new columns in transposed 4x4 block.
2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
+ // Shift left and insert three new columns in transposed 4x4 block.
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
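
These rows are vqtbl2q index vectors: indices 0-15 select bytes from the first vector of a pair and 16-31 from the second. An illustrative trace of the first row acting on the transposed blocks used below:

// samples_LUT.val[0] = s3456  = { s3[0],s4[0],s5[0],s6[0],  s3[1],s4[1],... }
// samples_LUT.val[1] = s78910 = { s7[0],s8[0],s9[0],s10[0], s7[1],s8[1],... }
// Row { 1,2,3,16, 5,6,7,20, 9,10,11,24, 13,14,15,28 } therefore yields
//   s4567 = { s4[0],s5[0],s6[0],s7[0], s4[1],s5[1],s6[1],s7[1], ... }
// Each transposed 4-sample column shifts up one row and gains the newly
// loaded sample, so previously transposed data never has to be reloaded.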
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
- if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
+ int32x4_t sum =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_4_usdot(s0, filter, perm_tbl);
- d1 = convolve4_4_usdot(s1, filter, perm_tbl);
- d2 = convolve4_4_usdot(s2, filter, perm_tbl);
- d3 = convolve4_4_usdot(s3, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_4_usdot(s0, filter, perm_tbl);
- d1 = convolve4_4_usdot(s1, filter, perm_tbl);
- d2 = convolve4_4_usdot(s2, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
- d3 = convolve4_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- }
+ // Further narrowing and packing is performed by the caller.
+ return vmovn_s32(sum);
}
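
vusdotq_lane_s32 maps to the Armv8.6-A USDOT instruction: each 32-bit lane accumulates a four-way dot product of unsigned 8-bit samples against signed 8-bit filter taps, with the lane index selecting which group of four taps. A scalar model of the lane-0 form used above (names illustrative, not libvpx API):

static void usdot_lane0_model(int32_t acc[4], const uint8_t x[16],
                              const int8_t f[8]) {
  // Per 32-bit lane i: accumulate bytes 4i..4i+3 of x against taps f[0..3].
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      acc[i] += (int32_t)x[4 * i + j] * (int32_t)f[j];
    }
  }
}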
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_usdot(s0, filter, perm_tbl);
- d1 = convolve8_4_usdot(s1, filter, perm_tbl);
- d2 = convolve8_4_usdot(s2, filter, perm_tbl);
- d3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_4_usdot(s0, filter, perm_tbl);
- d1 = convolve8_4_usdot(s1, filter, perm_tbl);
- d2 = convolve8_4_usdot(s2, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+ // First 4 output values.
+ int32x4_t sum0 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ // Second 4 output values.
+ int32x4_t sum1 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
- assert(x_step_q4 == 16);
-
- (void)x_step_q4;
- (void)y0_q4;
- (void)y_step_q4;
-
- if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride,
- w, h, x_filter_4tap);
-
- } else {
- vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride,
- w, h, x_filter_8tap);
- }
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+ int32x4_t sum =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
}
-static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]),
+ vqtbl1q_u8(samples, permute_tbl.val[2]) };
+
+ // First 4 output values.
+ int32x4_t sum0 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
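
The permute rows turn a single 16-byte load into the overlapping four-sample windows each dot-product lane needs. An illustrative demo (not part of the patch) applying the first row to a byte ramp:

static INLINE uint8x16_t permute_demo(void) {
  const uint8_t ramp[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
                             8, 9, 10, 11, 12, 13, 14, 15 };
  uint8x16_t samples = vld1q_u8(ramp);
  uint8x16_t windows = vqtbl1q_u8(samples, vld1q_u8(dot_prod_permute_tbl));
  // windows == { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }: the inputs for four
  // adjacent output pixels, computed from one load.
  return windows;
}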
+static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve4_4_usdot(s0, filter, perm_tbl);
- t1 = convolve4_4_usdot(s1, filter, perm_tbl);
- t2 = convolve4_4_usdot(s2, filter, perm_tbl);
- t3 = convolve4_4_usdot(s3, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+ int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -284,23 +141,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
- d3 = convolve4_8_usdot(s3, filter, perm_tbl);
+ uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -315,25 +170,24 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
-
+static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filter, perm_tbl);
- t1 = convolve8_4_usdot(s1, filter, perm_tbl);
- t2 = convolve8_4_usdot(s2, filter, perm_tbl);
- t3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -343,23 +197,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -379,8 +231,6 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
@@ -390,18 +240,21 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
+ // Load 4-tap filter into first 4 elements of the vector.
+ // All 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w,
- h, x_filter_4tap);
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+ convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap);
} else {
- vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w,
- h, x_filter_8tap);
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap);
}
}
@@ -411,7 +264,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
- uint8x16_t s0, s1, s2, s3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -424,22 +276,21 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23, dd01, dd23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filters, perm_tbl);
- t1 = convolve8_4_usdot(s1, filters, perm_tbl);
- t2 = convolve8_4_usdot(s2, filters, perm_tbl);
- t3 = convolve8_4_usdot(s3, filters, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -452,24 +303,23 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filters, perm_tbl);
- d1 = convolve8_8_usdot(s1, filters, perm_tbl);
- d2 = convolve8_8_usdot(s2, filters, perm_tbl);
- d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -492,216 +342,130 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b = vqtbl2q_u8(samples, permute_tbl);
+ uint8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ //
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+ uint16x8_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
+
+ *b = vreinterpretq_u8_u16(a0123);
}
static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+ uint8x16_t *b0, uint8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+ uint16x8x2_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
+
+ *b0 = vreinterpretq_u8_u16(a0123.val[0]);
+ *b1 = vreinterpretq_u8_u16(a0123.val[1]);
}
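
The zip-based rewrite removes the old dot_prod_tran_concat_tbl lookup entirely: two byte-level zips and one 16-bit zip perform the same transpose in registers. A step-by-step trace (illustrative):

// a01 = { 00,10, 01,11, 02,12, 03,13, 04,14, 05,15, 06,16, 07,17 }
// a23 = { 20,30, 21,31, 22,32, 23,33, 24,34, 25,35, 26,36, 27,37 }
// Zipping these as 16-bit lanes interleaves whole column pairs:
// a0123.val[0] = { 00,10,20,30, 01,11,21,31, 02,12,22,32, 03,13,23,33 }
// a0123.val[1] = { 04,14,24,34, 05,15,25,35, 06,16,26,36, 07,17,27,37 }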
-static INLINE void vpx_convolve_4tap_vert_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src += 7 * src_stride;
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-
- do {
- load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- d0 = convolve4_4_usdot_partial(s0123, filter);
- d1 = convolve4_4_usdot_partial(s1234, filter);
- d2 = convolve4_4_usdot_partial(s2345, filter);
- d3 = convolve4_4_usdot_partial(s3456, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
+static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ // Sample permutation is performed by the caller.
+ int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
- load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
-
- do {
- load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter);
- d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter);
- d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter);
- d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456_lo = s78910_lo;
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456_hi = s78910_hi;
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
+static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ // Sample permutation is performed by the caller.
+
+ // First 4 output values.
+ int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ uint8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -712,67 +476,56 @@ static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ uint8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filter);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filter);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filter);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filter);
+ uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -798,8 +551,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
@@ -809,17 +560,15 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t y_filter_4tap =
- vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_4tap);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap);
+ const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter);
}
}
@@ -830,8 +579,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -844,43 +591,40 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23, dd01, dd23;
-
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ uint8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -888,8 +632,8 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -900,63 +644,53 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ uint8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filters);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filters);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filters);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filters);
-
+ uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -987,3 +721,275 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
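
The samples_LUT / merge_block_tbl pattern above implements a rolling four-row window: instead of re-transposing seven rows on every iteration, the overlapping vectors s4567, s5678 and s6789 are rebuilt by table lookup from the previous block (s3456) and the freshly transposed rows (s78910). A scalar model of the idea, assuming each four-byte group of the transposed vectors holds four consecutive rows of one output column (our sketch, not library code):

static void merge_rows_model(const uint8_t s3456[16], const uint8_t s78910[16],
                             uint8_t s4567[16]) {
  for (int c = 0; c < 4; ++c) {        /* four output columns */
    for (int i = 0; i < 3; ++i) {      /* keep rows 4..6 from the old block */
      s4567[4 * c + i] = s3456[4 * c + i + 1];
    }
    s4567[4 * c + 3] = s78910[4 * c];  /* pull row 7 from the new block */
  }
}

The other two lookups (s5678, s6789) follow the same scheme with the split point moved one and two rows further along.
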
+
+static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t x_filter,
+ const uint8x8_t y_filter) {
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+ int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+ // We halved the filter values, so subtract 1 from the right shift.
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+ src += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+ int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+ int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+ int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+ // We halved the filter values, so subtract 1 from the right shift.
+ uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+ uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+ uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+ uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+ uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+ uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ v_s01 = v_s45;
+ v_s12 = v_s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+ uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+ uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+ uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+ uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+ uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+ uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+ uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+ uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+ uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+ uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
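
convolve4_8 is defined earlier in this file and not shown in this hunk; as a rough illustration of the broadcast-tap approach the comment above describes, a widening multiply-accumulate sketch could look like the following. This is our sketch under two assumptions: the taps were pre-halved, as in the caller above, and are non-negative, matching the vabsq_s16 applied before halving; FILTER_BITS is 7 in vpx_dsp/vpx_filter.h.

static INLINE uint8x8_t convolve4_8_sketch(uint8x8_t s0, uint8x8_t s1,
                                           uint8x8_t s2, uint8x8_t s3,
                                           const uint8x8_t taps[4]) {
  /* Widening multiply-accumulate: 8-bit samples * 8-bit taps -> 16-bit sums. */
  uint16x8_t sum = vmull_u8(s0, taps[0]);
  sum = vmlal_u8(sum, s1, taps[1]);
  sum = vmlal_u8(sum, s2, taps[2]);
  sum = vmlal_u8(sum, s3, taps[3]);
  /* Narrow with rounding; the taps were halved, hence the reduced shift. */
  return vqrshrn_n_u16(sum, FILTER_BITS - 1);
}
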
+
+static INLINE void convolve_8tap_2d_horiz_neon_i8mm(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process the final three rows (h % 4 == 3). See vpx_convolve8_neon_i8mm()
+ // below for further details on possible values of block height.
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 =
+ vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process the final three rows (h % 4 == 3). See vpx_convolve8_neon_i8mm()
+ // below for further details on possible values of block height.
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
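
The convolve8_4_h / convolve8_8_h helpers referenced here live earlier in the file. As an informal sketch of the i8mm dot-product idea for the four-pixel case, assuming dot_prod_permute_tbl arranges the 16 input bytes into overlapping four-sample groups (our illustration; the library helper may distribute the shifts differently):

static INLINE int16x4_t convolve8_4_h_sketch(uint8x16_t samples,
                                             int8x8_t filter,
                                             uint8x16x2_t permute_tbl) {
  /* Gather samples 0..3 and 4..7 of each output pixel's 8-sample window. */
  uint8x16_t perm0 = vqtbl1q_u8(samples, permute_tbl.val[0]);
  uint8x16_t perm1 = vqtbl1q_u8(samples, permute_tbl.val[1]);
  /* USDOT: unsigned samples x signed taps, accumulated per 4-byte group. */
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), perm0, filter, 0);
  sum = vusdotq_lane_s32(sum, perm1, filter, 1);
  /* Narrow by one bit so the callers' FILTER_BITS - 1 rounding shift
   * completes the FILTER_BITS total. */
  return vshrn_n_s32(sum, 1);
}
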
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post, both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int8x8_t x_filter_4tap =
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+ const uint8x8_t y_filter_4tap =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+ convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter_4tap,
+ y_filter_4tap);
+ return;
+ }
+
+ // Given our constraints (w <= 64, h <= 64, taps <= 8), we can reduce the
+ // maximum buffer size to 64 * (64 + 7).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS - 1;
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+ im_block, im_stride, w, im_height,
+ x_filter_8tap);
+
+ convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_8tap);
+}
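
A quick sanity check on the sizing above (our arithmetic): w and h are multiples of 4 with w, h <= 64, so im_height = h + SUBPEL_TAPS - 1 is at most 64 + 7 = 71 rows, matching the 64 * 71 intermediate buffer, and im_height % 4 == 3, which is why convolve_8tap_2d_horiz_neon_i8mm() above ends with a dedicated three-row tail.
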
+
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+
+ // Averaging convolution always uses an 8-tap filter.
+ // Account for the vertical phase needing 3 lines prior and 4 lines post.
+ const int im_height = h + SUBPEL_TAPS - 1;
+ const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 57772ea668..de5fa29471 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -19,31 +19,32 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
- */
- uint8_t temp[64 * 72];
+ // Given our constraints (w <= 64, h <= 64, taps <= 8), we can reduce the
+ // maximum buffer size to 64 * (64 + 7) (+1 row so the height is divisible by 4).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+ const int im_stride = 64;
const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */
- const int intermediate_height = h + vert_filter_taps;
+ // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+ // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.)
+ const int im_height = h + vert_filter_taps;
const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* Filter starting border_offset lines back. The Neon implementation will
- * ignore the given height and filter a multiple of 4 lines. Since this goes
- * in to the temp buffer which has lots of extra room and is subsequently
- * discarded this is safe if somewhat less than ideal. */
- vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp,
- w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height);
+ // Filter starting border_offset rows back. The Neon implementation will
+ // ignore the given height and filter a multiple of 4 lines. Since this goes
+ // into the temporary buffer, which has lots of extra room and is subsequently
+ // discarded, this is safe if somewhat less than ideal.
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+ im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, im_height);
- /* Step into the temp buffer border_offset lines to get actual frame data. */
- vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ // Step border_offset rows into the temporary buffer to reach the actual frame data.
+ vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst,
+ dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
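
Putting numbers to the buffer comments above (our illustration): in the worst case h = 64 with an 8-tap vertical filter, im_height = 64 + 8 = 72 rows, one more than the 71 strictly needed, so the passes always handle whole groups of four rows; at im_stride = 64 that is exactly the 64 * 72 bytes declared.
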
void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
@@ -51,18 +52,21 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- uint8_t temp[64 * 72];
- const int intermediate_height = h + 8;
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS;
+ const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height);
- vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
- x_step_q4, y0_q4, y_step_q4, w, h);
+ // This implementation has the same issues as above. In addition, we only want
+ // to average the values after both passes.
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+ im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, im_height);
+
+ vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
deleted file mode 100644
index 9d754fde17..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * (64 + 7). */
- uint8_t temp[64 * 71];
-
- const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. */
- const int intermediate_height = h + vert_filter_taps - 1;
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_dotprod(
- src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4,
- x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
-
- vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- h);
-}
-
-void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- uint8_t temp[64 * 71];
-
- /* Averaging convolution always uses an 8-tap filter. */
- /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
- const int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4,
- y_step_q4, w, intermediate_height);
-
- vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
deleted file mode 100644
index d7cbb09ea6..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * (64 + 7). */
- uint8_t temp[64 * 71];
-
- const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. */
- const int intermediate_height = h + vert_filter_taps - 1;
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride,
- temp, w, filter, x0_q4, x_step_q4, y0_q4,
- y_step_q4, w, intermediate_height);
-
- vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- h);
-}
-
-void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- uint8_t temp[64 * 71];
-
- /* Averaging convolution always uses an 8-tap filter. */
- /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
- const int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
- w, intermediate_height);
-
- vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
new file mode 100644
index 0000000000..bf9f18c7e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Some useful instructions are exclusive to the SVE2 instruction set.
+// However, we can access them from a predominantly Neon context by using the
+// Neon-SVE bridge intrinsics to reinterpret Neon vectors as SVE vectors, with
+// the high part of the SVE vector (if it's longer than 128 bits) being
+// "don't care".
+
+static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1,
+ uint16x8_t tbl) {
+ svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+ svset_neonq_s16(svundef_s16(), s1));
+ return svget_neonq_s16(
+ svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
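
A hypothetical use of vpx_tbl2_s16 (our sketch, not code from this patch): interleave the low four 16-bit lanes of two Neon vectors. Note that svtbl2 indexes a table formed from two full SVE vectors, so the second vector starts at element svcnth(), which equals 8 only when the vector length is 128 bits; the sketch corrects the indices accordingly.

static INLINE int16x8_t interleave4_lo(int16x8_t s0, int16x8_t s1) {
  static const uint16_t kIdx[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
  uint16x8_t idx = vld1q_u16(kIdx);
  /* Rebase indices >= 8 onto the true start of the second vector. */
  uint16x8_t fixup = vdupq_n_u16((uint16_t)(svcnth() - 8));
  idx = vaddq_u16(idx, vandq_u16(fixup, vcgeq_u16(idx, vdupq_n_u16(8))));
  return vpx_tbl2_s16(s0, s1, idx);  /* {a0,b0,a1,b1,a2,b2,a3,b3} */
}
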
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
new file mode 100644
index 0000000000..48534fb70e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Dot product instructions operating on 16-bit input elements are exclusive
+// to the SVE instruction set. However, we can access them from a predominantly
+// Neon context by using the Neon-SVE bridge intrinsics to reinterpret Neon
+// vectors as SVE vectors, with the high part of the SVE vector (if it's longer
+// than 128 bits) being "don't care".
+
+// While sub-optimal on machines whose SVE vector length exceeds 128 bits (the
+// remainder of the vector is unused), this approach is still beneficial
+// compared to a Neon-only solution.
+
+static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x,
+ uint16x8_t y) {
+ return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+ svset_neonq_u16(svundef_u16(), x),
+ svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+ svset_neonq_s16(svundef_s16(), x),
+ svset_neonq_s16(svundef_s16(), y)));
+}
+
+#define vpx_dotq_lane_s16(acc, x, y, lane) \
+ svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \
+ svset_neonq_s16(svundef_s16(), x), \
+ svset_neonq_s16(svundef_s16(), y), lane))
+
+static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) {
+ return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data),
+ svset_neonq_u16(svundef_u16(), indices)));
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
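
For illustration, the 16-bit dot product maps naturally onto sum-of-squares style reductions such as the vpx_sum_squares_2d_i16 SVE specialization added elsewhere in this patch; a minimal hypothetical use (our sketch):

static INLINE int64_t sum_squares_8(const int16_t *src) {
  int16x8_t s = vld1q_s16(src);
  /* Each 64-bit lane accumulates the dot product of four 16-bit pairs. */
  int64x2_t acc = vpx_dotq_s16(vdupq_n_s64(0), s, s);
  return vaddvq_s64(acc);  /* horizontal add of the two lanes */
}
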
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index b8e3c5e540..9bd5ec285c 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -20,263 +20,271 @@
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
-static INLINE void scaledconvolve_horiz_w4(
+static INLINE void scaledconvolve_horiz_neon(
const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
- int x, y, z;
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filter,
+ const int x0_q4, const int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
src -= SUBPEL_TAPS / 2 - 1;
- y = h;
- do {
- int x_q4 = x0_q4;
- x = 0;
+ if (w == 4) {
do {
- // process 4 src_x steps
- for (z = 0; z < 4; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ int x_q4 = x0_q4;
+
+ // Process a 4x4 tile.
+ for (int r = 0; r < 4; ++r) {
+ const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
+
if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x8_t ss[4];
- int16x4_t t[8], tt;
-
- load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
- transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
-
- ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
- ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
- ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
- ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
- t[0] = vget_low_s16(ss[0]);
- t[1] = vget_low_s16(ss[1]);
- t[2] = vget_low_s16(ss[2]);
- t[3] = vget_low_s16(ss[3]);
- t[4] = vget_high_s16(ss[0]);
- t[5] = vget_high_s16(ss[1]);
- t[6] = vget_high_s16(ss[2]);
- t[7] = vget_high_s16(ss[3]);
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+ const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d0 =
+ vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8_4x1(&temp[4 * r], d0);
} else {
- int i;
- for (i = 0; i < 4; ++i) {
- temp[z * 4 + i] = src_x[i * src_stride + 3];
+ // Copy directly for non-subpel locations.
+ s += SUBPEL_TAPS / 2 - 1;
+
+ for (int c = 0; c < 4; ++c) {
+ temp[r * 4 + c] = s[c * src_stride];
}
}
x_q4 += x_step_q4;
}
- // transpose the 4x4 filters values back to dst
- {
- const uint8x8x4_t d4 = vld4_u8(temp);
- vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
- vreinterpret_u32_u8(d4.val[0]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
- vreinterpret_u32_u8(d4.val[1]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
- vreinterpret_u32_u8(d4.val[2]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
- vreinterpret_u32_u8(d4.val[3]), 0);
- }
- x += 4;
- } while (x < w);
+ // Transpose the 4x4 result tile and store.
+ uint8x8_t d01 = vld1_u8(temp + 0);
+ uint8x8_t d23 = vld1_u8(temp + 8);
- src += src_stride * 4;
- dst += dst_stride * 4;
- y -= 4;
- } while (y > 0);
-}
+ transpose_u8_4x4(&d01, &d23);
-static INLINE void scaledconvolve_horiz_w8(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
+ store_u8_4x1(dst + 0 * dst_stride, d01);
+ store_u8_4x1(dst + 1 * dst_stride, d23);
+ store_u8_4x1_high(dst + 2 * dst_stride, d01);
+ store_u8_4x1_high(dst + 3 * dst_stride, d23);
- // This function processes 8x8 areas. The intermediate height is not always
- // a multiple of 8, so force it to be a multiple of 8 here.
- y = (h + 7) & ~7;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ return;
+ }
do {
int x_q4 = x0_q4;
- x = 0;
+ uint8_t *d = dst;
+ int width = w;
+
do {
- uint8x8_t d[8];
- // process 8 src_x steps
- for (z = 0; z < 8; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ // Process an 8x8 tile.
+ for (int r = 0; r < 8; ++r) {
+ const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8];
- load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
- &s[5], &s[6], &s[7]);
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
- &s[7]);
- d[0] = scale_filter_8(s, filters);
- vst1_u8(&temp[8 * z], d[0]);
+ const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+ vst1_u8(&temp[r * 8], d0);
} else {
- int i;
- for (i = 0; i < 8; ++i) {
- temp[z * 8 + i] = src_x[i * src_stride + 3];
+ // Copy directly for non-subpel locations.
+ s += SUBPEL_TAPS / 2 - 1;
+
+ for (int c = 0; c < 8; ++c) {
+ temp[r * 8 + c] = s[c * src_stride];
}
}
x_q4 += x_step_q4;
}
- // transpose the 8x8 filters values back to dst
- load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
- &d[7]);
- transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
- vst1_u8(&dst[x + 0 * dst_stride], d[0]);
- vst1_u8(&dst[x + 1 * dst_stride], d[1]);
- vst1_u8(&dst[x + 2 * dst_stride], d[2]);
- vst1_u8(&dst[x + 3 * dst_stride], d[3]);
- vst1_u8(&dst[x + 4 * dst_stride], d[4]);
- vst1_u8(&dst[x + 5 * dst_stride], d[5]);
- vst1_u8(&dst[x + 6 * dst_stride], d[6]);
- vst1_u8(&dst[x + 7 * dst_stride], d[7]);
- x += 8;
- } while (x < w);
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- } while (y -= 8);
-}
+ // Transpose the 8x8 result tile and store.
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+ load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-static INLINE void scaledconvolve_vert_w4(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
- int y_q4 = y0_q4;
+ transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x4_t t[8], tt;
-
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
- t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
- t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
- t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
- t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
- t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
- t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
- t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
+ d += 8;
+ width -= 8;
+ } while (width != 0);
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
}
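
The x_q4 counter above is a Q4 fixed-point source position: x_q4 >> SUBPEL_BITS gives the integer column and x_q4 & SUBPEL_MASK the subpel phase (SUBPEL_BITS is 4 and SUBPEL_MASK is 15 in libvpx). A scalar model of the walk, with apply_8tap standing in as a hypothetical helper (our sketch):

int x_q4 = x0_q4;
for (int x = 0; x < w; ++x) {
  const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];  /* integer source column */
  const int phase = x_q4 & SUBPEL_MASK;          /* one of 16 subpel filters */
  dst[x] = (phase != 0) ? apply_8tap(s, x_filter[phase]) /* hypothetical */
                        : s[SUBPEL_TAPS / 2 - 1];        /* center sample */
  x_q4 += x_step_q4;  /* 16 per output pixel at unity scale, 32 at x1/2 */
}
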
-static INLINE void scaledconvolve_vert_w8(
+static INLINE void scaledconvolve_vert_neon(
const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filter,
+ const int y0_q4, const int y_step_q4, int w, int h) {
int y_q4 = y0_q4;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- d = scale_filter_8(s, filters);
- vst1_u8(dst, d);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
-}
+ if (w == 4) {
+ do {
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-static INLINE void scaledconvolve_vert_w16(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int x, y;
- int y_q4 = y0_q4;
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+
+ int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d0 =
+ vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8_4x1(dst, d0);
+ } else {
+ // Memcpy for non-subpel locations.
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4);
+ }
+
+ y_q4 += y_step_q4;
+ dst += dst_stride;
+ } while (--h != 0);
+ return;
+ }
+
+ if (w == 8) {
+ do {
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+ vst1_u8(dst, d0);
+ } else {
+ // Memcpy for non-subpel locations.
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8);
+ }
+
+ y_q4 += y_step_q4;
+ dst += dst_stride;
+ } while (--h != 0);
+ return;
+ }
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ uint8_t *d = dst;
+ int width = w;
+
if (y_q4 & SUBPEL_MASK) {
- x = 0;
do {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x16_t ss[8];
- uint8x8_t s[8], d[2];
- load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
- &ss[5], &ss[6], &ss[7]);
- s[0] = vget_low_u8(ss[0]);
- s[1] = vget_low_u8(ss[1]);
- s[2] = vget_low_u8(ss[2]);
- s[3] = vget_low_u8(ss[3]);
- s[4] = vget_low_u8(ss[4]);
- s[5] = vget_low_u8(ss[5]);
- s[6] = vget_low_u8(ss[6]);
- s[7] = vget_low_u8(ss[7]);
- d[0] = scale_filter_8(s, filters);
-
- s[0] = vget_high_u8(ss[0]);
- s[1] = vget_high_u8(ss[1]);
- s[2] = vget_high_u8(ss[2]);
- s[3] = vget_high_u8(ss[3]);
- s[4] = vget_high_u8(ss[4]);
- s[5] = vget_high_u8(ss[5]);
- s[6] = vget_high_u8(ss[6]);
- s[7] = vget_high_u8(ss[7]);
- d[1] = scale_filter_8(s, filters);
- vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
- src_y += 16;
- x += 16;
- } while (x < w);
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+ s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+ s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+ s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4)));
+ s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5)));
+ s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6)));
+ s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7)));
+
+ s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+ s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+ s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+ s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+ s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4)));
+ s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5)));
+ s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6)));
+ s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7)));
+
+ uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
+ s6[0], s7[0], filter);
+ uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
+ s6[1], s7[1], filter);
+
+ vst1q_u8(d, vcombine_u8(d0, d1));
+
+ s += 16;
+ d += 16;
+ width -= 16;
+ } while (width != 0);
} else {
- memcpy(dst, &src_y[3 * src_stride], w);
+ // Copy directly for non-subpel locations.
+ s += (SUBPEL_TAPS / 2 - 1) * src_stride;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ vst1q_u8(d, s0);
+ s += 16;
+ d += 16;
+ width -= 16;
+ } while (width != 0);
}
- dst += dst_stride;
+
y_q4 += y_step_q4;
- } while (--y);
+ dst += dst_stride;
+ } while (--h != 0);
}
void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // The fixed-size intermediate buffer, im_block, places limits on parameters.
// 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // (1) Interpolate horizontally into the intermediate buffer, im_block.
+ // (2) Interpolate im_block vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
+ // Deriving the maximum number of rows in the im_block buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
@@ -288,33 +296,20 @@ void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- // When calling in frame scaling function, the smallest scaling factor is x1/4
- // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
- // big enough.
+ // When called from the frame scaling function, the smallest scaling factor
+ // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the im_block
+ // buffer is still big enough.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
- const int intermediate_height =
+ DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]);
+ const int im_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ const ptrdiff_t im_stride = 64;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
assert(x_step_q4 <= 64);
- if (w >= 8) {
- scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- } else {
- scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- }
+ scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, im_block, im_stride, filter, x0_q4,
+ x_step_q4, w, im_height);
- if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- }
+ scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h);
}
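
Expanding the 135-row bound from the comment above (our arithmetic): the last of 64 output rows starts at source row ((64 - 1) * 32) >> SUBPEL_BITS = 126, and its filter support spans SUBPEL_TAPS = 8 further rows, so 126 + 8 + 1 = 135 rows suffice; the declared (135 + 8) * 64 im_block simply adds eight rows of headroom.
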
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
index 2bee91f449..916dc62cef 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -112,7 +112,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_SVE) += arm/highbd_vpx_convolve8_sve.c
+DSP_SRCS-$(HAVE_SVE2) += arm/highbd_vpx_convolve8_sve2.c
endif
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
@@ -139,9 +140,7 @@ DSP_SRCS-yes += arm/vpx_convolve8_neon.c
DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
DSP_SRCS-yes += arm/vpx_convolve_neon.c
DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c
-DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c
DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c
-DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
@@ -374,6 +373,7 @@ DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c
+DSP_SRCS-$(HAVE_SVE) += arm/sum_squares_sve.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c
@@ -454,6 +454,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c
+DSP_SRCS-$(HAVE_SVE) += arm/highbd_variance_sve.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
index 030c456d39..2b8c656afb 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -12,4 +12,4 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
+void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 18087e25d9..f40f85c036 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -427,19 +427,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
if (vpx_config("CONFIG_VP9") eq "yes") {
@@ -1009,7 +1009,7 @@ add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride,
specialize qw/vpx_sad_skip_4x4x4d neon/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
+specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/;
#
# Structured Similarity (SSIM)
@@ -1411,163 +1411,163 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x4 neon/;
+ specialize qw/vpx_highbd_12_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance4x8 neon/;
+ specialize qw/vpx_highbd_12_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance4x4 neon/;
+ specialize qw/vpx_highbd_12_variance4x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x4 neon/;
+ specialize qw/vpx_highbd_10_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance4x8 neon/;
+ specialize qw/vpx_highbd_10_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance4x4 neon/;
+ specialize qw/vpx_highbd_10_variance4x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x4 neon/;
+ specialize qw/vpx_highbd_8_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance4x8 neon/;
+ specialize qw/vpx_highbd_8_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance4x4 neon/;
+ specialize qw/vpx_highbd_8_variance4x4 neon sve/;
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/;
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/;
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x8 neon/;
+ specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x16 neon/;
+ specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x8 neon/;
+ specialize qw/vpx_highbd_10_mse16x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x16 neon/;
+ specialize qw/vpx_highbd_10_mse8x16 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x8 neon/;
+ specialize qw/vpx_highbd_12_mse16x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x16 neon/;
+ specialize qw/vpx_highbd_12_mse8x16 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
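A note on the specialize lines above: rtcd.pl expands each add_proto/specialize pair into per-ISA declarations plus a dispatch pointer that runtime CPU detection retargets, so adding "sve" (or "neon_dotprod") is all that is needed to route callers to the new kernels. A rough C sketch of the generated shape for one prototype (illustrative only; the real header is emitted at build time, and arm_cpu_caps()/HAS_SVE come from vpx_ports/arm.h):

    #include <stdint.h>

    /* Per-ISA implementations named from the prototype string. */
    unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride,
                                               unsigned int *sse);
    unsigned int vpx_highbd_10_variance16x16_sve(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride,
                                                 unsigned int *sse);

    /* Dispatch pointer that callers actually use. */
    unsigned int (*vpx_highbd_10_variance16x16)(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride,
                                                unsigned int *sse);

    static void setup_rtcd_internal(void) {
      const int flags = arm_cpu_caps();
      vpx_highbd_10_variance16x16 = vpx_highbd_10_variance16x16_c;
      if (flags & HAS_SVE) {
        vpx_highbd_10_variance16x16 = vpx_highbd_10_variance16x16_sve;
      }
    }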
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
index 0cddcb6991..eb8ff06cd7 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
@@ -28,7 +28,6 @@ extern "C" {
typedef int16_t InterpKernel[SUBPEL_TAPS];
static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
- assert(filter[3] != 128);
if (filter[0] | filter[7]) {
return 8;
}
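With the assert gone, vpx_get_filter_taps() no longer rejects kernels whose center tap is 128; it simply classifies a symmetric 8-entry kernel by its outermost nonzero coefficient pair. A self-contained sketch of that classification (the exact tiers in vpx_filter.h may differ):

    #include <stdint.h>

    static int get_filter_taps_sketch(const int16_t *filter) {
      if (filter[0] | filter[7]) return 8;  /* full 8-tap kernel */
      if (filter[1] | filter[6]) return 6;
      if (filter[2] | filter[5]) return 4;
      return 2;                             /* bilinear */
    }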
diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
index 539d09bb39..eba12d312a 100644
--- a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
+++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
@@ -15,7 +15,7 @@
#include <sys/sysctl.h>
#endif
-#if !CONFIG_RUNTIME_CPU_DETECT
+#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
static int arm_get_cpu_caps(void) {
// This function should actually be a no-op. There is no way to adjust any of
@@ -28,7 +28,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
-#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
+#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
// sysctlbyname() parameter documentation for instruction set characteristics:
// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
@@ -99,14 +99,17 @@ static int arm_get_cpu_caps(void) {
// hwcap values are not defined should not prevent features from being enabled.
#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
#define VPX_AARCH64_HWCAP_SVE (1 << 22)
+#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)
#define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
static int arm_get_cpu_caps(void) {
int flags = 0;
+#if HAVE_NEON_DOTPROD || HAVE_SVE
unsigned long hwcap = getauxval(AT_HWCAP);
-#if HAVE_NEON_I8MM
+#endif // HAVE_NEON_DOTPROD || HAVE_SVE
+#if HAVE_NEON_I8MM || HAVE_SVE2
unsigned long hwcap2 = getauxval(AT_HWCAP2);
-#endif // HAVE_NEON_I8MM
+#endif // HAVE_NEON_I8MM || HAVE_SVE2
#if HAVE_NEON
flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
#endif // HAVE_NEON
@@ -125,6 +128,11 @@ static int arm_get_cpu_caps(void) {
flags |= HAS_SVE;
}
#endif // HAVE_SVE
+#if HAVE_SVE2
+ if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) {
+ flags |= HAS_SVE2;
+ }
+#endif // HAVE_SVE2
return flags;
}
@@ -195,5 +203,10 @@ int arm_cpu_caps(void) {
flags &= ~HAS_SVE;
}
+ // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available.
+ if (!(flags & HAS_SVE)) {
+ flags &= ~HAS_SVE2;
+ }
+
return flags;
}
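For reference, the Linux/Android detection above boils down to two getauxval() reads plus the SVE2-implies-SVE clamp. A minimal standalone sketch (aarch64 Linux only; the HWCAP constants mirror the kernel's bits as defined above):

    #include <stdio.h>
    #include <sys/auxv.h>

    #define VPX_AARCH64_HWCAP_SVE (1 << 22)
    #define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)

    int main(void) {
      const unsigned long hwcap = getauxval(AT_HWCAP);
      const unsigned long hwcap2 = getauxval(AT_HWCAP2);
      const int has_sve = (hwcap & VPX_AARCH64_HWCAP_SVE) != 0;
      /* FEAT_SVE2 assumes FEAT_SVE, matching the flag clamp above. */
      const int has_sve2 = has_sve && (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) != 0;
      printf("SVE: %d, SVE2: %d\n", has_sve, has_sve2);
      return 0;
    }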
diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h
index 39365d18ee..814c3cc408 100644
--- a/media/libvpx/libvpx/vpx_ports/arm.h
+++ b/media/libvpx/libvpx/vpx_ports/arm.h
@@ -25,6 +25,8 @@ extern "C" {
#define HAS_NEON_I8MM (1 << 2)
// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
#define HAS_SVE (1 << 3)
+// Armv9.0-A SVE2 instructions.
+#define HAS_SVE2 (1 << 4)
int arm_cpu_caps(void);
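The new HAS_SVE2 bit composes with the existing flags; a hedged sketch of a caller gating kernel selection on it (the kernel names here are hypothetical):

    #include "vpx_ports/arm.h"

    typedef void (*kernel_fn)(void);
    void kernel_neon(void);  /* hypothetical per-ISA kernels */
    void kernel_sve(void);
    void kernel_sve2(void);

    static kernel_fn pick_kernel(void) {
      const int caps = arm_cpu_caps();
      if (caps & HAS_SVE2) return kernel_sve2;  /* HAS_SVE2 implies HAS_SVE */
      if (caps & HAS_SVE) return kernel_sve;
      return kernel_neon;  /* Neon is mandatory in Armv8.0-A */
    }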
diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
index f1036b98ed..79b98a75f1 100644
--- a/media/libvpx/libvpx/vpx_ports/emms_mmx.c
+++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
@@ -12,4 +12,4 @@
#include "vpx_ports/system_state.h"
-void vpx_clear_system_state() { _mm_empty(); }
+void vpx_clear_system_state(void) { _mm_empty(); }
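This () to (void) change (repeated in vpx_scale_rtcd.c below) is not cosmetic: in C before C23, an empty parameter list declares a function with unspecified parameters, while (void) declares one that takes none, letting the compiler diagnose stray arguments. A two-line illustration:

    void takes_nothing(void); /* zero parameters: takes_nothing(1) is an error */
    void unspecified();       /* unspecified parameters: unspecified(1) may
                                 compile without a diagnostic (pre-C23) */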
diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h
index 5eccfe8f50..ee9e095633 100644
--- a/media/libvpx/libvpx/vpx_ports/mem.h
+++ b/media/libvpx/libvpx/vpx_ports/mem.h
@@ -23,7 +23,13 @@
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif
-#if HAVE_NEON && defined(_MSC_VER)
+#if defined(__has_builtin)
+#define VPX_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define VPX_HAS_BUILTIN(x) 0
+#endif
+
+#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__)
#define __builtin_prefetch(x)
#endif
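The reworked guard lets callers issue __builtin_prefetch unconditionally: compilers that report the builtin via __has_builtin (or define __GNUC__) use the real intrinsic, and anything else now gets a no-op macro rather than a build break. A small usage sketch (sum_rows is hypothetical):

    #include <stdint.h>
    #include "vpx_ports/mem.h"

    static int sum_rows(const uint8_t *src, int stride, int rows) {
      int total = 0;
      for (int r = 0; r < rows; ++r) {
        if (r + 1 < rows) __builtin_prefetch(src + (r + 1) * stride);
        for (int c = 0; c < 16; ++c) total += src[r * stride + c];
      }
      return total;
    }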
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h
index d8a8ed89fe..d33eff4397 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_once.h
+++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h
@@ -91,29 +91,6 @@ static void once(void (*func)(void)) {
return;
}
-#elif CONFIG_MULTITHREAD && defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>
-static void once(void (*func)(void)) {
- static volatile int done;
-
- /* If the initialization is complete, return early. */
- if (done) return;
-
- /* Causes all other threads in the process to block themselves
- * and give up their time slice.
- */
- DosEnterCritSec();
-
- if (!done) {
- func();
- done = 1;
- }
-
- /* Restores normal thread dispatching for the current process. */
- DosExitCritSec();
-}
-
#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
#include <pthread.h>
static void once(void (*func)(void)) {
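With the OS/2 branch removed, the remaining multithreaded path funnels through pthreads; its shape is essentially pthread_once (a sketch; the body in vpx_once.h may differ in detail):

    #include <pthread.h>

    static void once(void (*func)(void)) {
      static pthread_once_t lock = PTHREAD_ONCE_INIT;
      pthread_once(&lock, func);
    }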
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
index dc4d9593a8..706b0770c8 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -12,4 +12,4 @@
#include "./vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_scale_rtcd() { once(setup_rtcd_internal); }
+void vpx_scale_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
new file mode 100644
index 0000000000..cdd18d0f30
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
@@ -0,0 +1,157 @@
+// Copyright 2024 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// pthread.h wrapper
+
+#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_
+#define VPX_VPX_UTIL_VPX_PTHREAD_H_
+
+#include "./vpx_config.h"
+
+#if CONFIG_MULTITHREAD
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+// Prevent leaking max/min macros.
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <stddef.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#if defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREADFN unsigned int __stdcall
+#endif
+#define THREAD_EXIT_SUCCESS 0
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ unsigned int(__stdcall *start)(void *),
+ void *arg) {
+ (void)attr;
+#ifdef USE_CREATE_THREAD
+ *thread = CreateThread(NULL, /* lpThreadAttributes */
+ 0, /* dwStackSize */
+ start, arg, 0, /* dwCreationFlags */
+ NULL); /* lpThreadId */
+#else
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start, arg, 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+#endif
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+ WAIT_OBJECT_0 ||
+ CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+ InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+ return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ (void)condition;
+ return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ (void)cond_attr;
+ InitializeConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ WakeConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ WakeAllConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok;
+ ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+ return !ok;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+#define THREADFN void *
+#define THREAD_EXIT_SUCCESS NULL
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+#endif // VPX_VPX_UTIL_VPX_PTHREAD_H_
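The point of the new wrapper is that callers write plain pthread code once and it compiles against either native pthreads or the Win32 emulation above, with THREADFN/THREAD_EXIT_SUCCESS papering over the differing thread-function signatures. A hedged usage sketch (start_two_workers is hypothetical and assumes CONFIG_MULTITHREAD):

    #include "vpx_util/vpx_pthread.h"

    static pthread_mutex_t lock;
    static int counter = 0;

    static THREADFN worker(void *arg) {
      (void)arg;
      pthread_mutex_lock(&lock);
      ++counter;
      pthread_mutex_unlock(&lock);
      return THREAD_EXIT_SUCCESS;
    }

    static int start_two_workers(void) {
      pthread_t t1, t2;
      pthread_mutex_init(&lock, NULL);
      if (pthread_create(&t1, NULL, worker, NULL)) return 1;
      if (pthread_create(&t2, NULL, worker, NULL)) return 1;
      pthread_join(t1, NULL);
      pthread_join(t2, NULL);
      pthread_mutex_destroy(&lock);
      return counter != 2;  /* 0 on success */
    }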
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c
index 04c5fb6f26..0d0e2f5766 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.c
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c
@@ -12,10 +12,18 @@
// Original source:
// https://chromium.googlesource.com/webm/libwebp
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
#include <assert.h>
#include <string.h> // for memset()
+#include "./vpx_config.h"
#include "./vpx_thread.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_MULTITHREAD
@@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker); // Forward declaration.
static THREADFN thread_loop(void *ptr) {
VPxWorker *const worker = (VPxWorker *)ptr;
- int done = 0;
- while (!done) {
- pthread_mutex_lock(&worker->impl_->mutex_);
- while (worker->status_ == OK) { // wait in idling mode
+#ifdef __APPLE__
+ if (worker->thread_name != NULL) {
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. The maximum size of the thread_name buffer was
+ // noted in the Chromium source code and was confirmed by experiments. If
+ // thread_name is too long, pthread_setname_np returns -1 with errno
+ // ENAMETOOLONG (63).
+ char thread_name[64];
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(thread_name);
+ }
+#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
+ if (worker->thread_name != NULL) {
+ // Linux and Android require that names (including the terminating nul)
+ // fit in 16 chars, otherwise pthread_setname_np() returns ERANGE (34).
+ char thread_name[16];
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(pthread_self(), thread_name);
+ }
+#endif
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ for (;;) {
+ while (worker->status_ == VPX_WORKER_STATUS_OK) { // wait in idling mode
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
- if (worker->status_ == WORK) {
+ if (worker->status_ == VPX_WORKER_STATUS_WORKING) {
+ // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread
+ // doesn't change worker->status_ and will wait until the worker changes
+ // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the
+ // worker can safely call execute() without holding worker->impl_->mutex_.
+ // When the worker reacquires worker->impl_->mutex_, worker->status_ must
+ // still be VPX_WORKER_STATUS_WORKING.
+ pthread_mutex_unlock(&worker->impl_->mutex_);
execute(worker);
- worker->status_ = OK;
- } else if (worker->status_ == NOT_OK) { // finish the worker
- done = 1;
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ assert(worker->status_ == VPX_WORKER_STATUS_WORKING);
+ worker->status_ = VPX_WORKER_STATUS_OK;
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ } else {
+ assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); // finish the worker
+ break;
}
- // signal to the main thread that we're done (for sync())
- pthread_cond_signal(&worker->impl_->condition_);
- pthread_mutex_unlock(&worker->impl_->mutex_);
}
- return THREAD_RETURN(NULL); // Thread is finished
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ return THREAD_EXIT_SUCCESS; // Thread is finished
}
// main thread state control
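The naming block above only reads worker->thread_name at thread start, so callers must assign it before the worker thread is created, i.e. before reset(). A hedged sketch through the public worker interface (run_named_worker and the name string are hypothetical):

    #include "vpx_util/vpx_thread.h"

    static int run_named_worker(VPxWorkerHook hook, void *in, void *out) {
      const VPxWorkerInterface *iface = vpx_get_worker_interface();
      VPxWorker worker;
      iface->init(&worker);
      worker.thread_name = "vpx_tile_work";  /* <= 15 chars, outlives thread */
      worker.hook = hook;
      worker.data1 = in;
      worker.data2 = out;
      if (!iface->reset(&worker)) return 0;  /* spawns the named thread */
      iface->launch(&worker);
      const int ok = iface->sync(&worker);   /* blocks until hook finishes */
      iface->end(&worker);
      return ok;
    }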
@@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
if (worker->impl_ == NULL) return;
pthread_mutex_lock(&worker->impl_->mutex_);
- if (worker->status_ >= OK) {
+ if (worker->status_ >= VPX_WORKER_STATUS_OK) {
// wait for the worker to finish
- while (worker->status_ != OK) {
+ while (worker->status_ != VPX_WORKER_STATUS_OK) {
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
// assign new status and release the working thread if needed
- if (new_status != OK) {
+ if (new_status != VPX_WORKER_STATUS_OK) {
worker->status_ = new_status;
pthread_cond_signal(&worker->impl_->condition_);
}
@@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
static void init(VPxWorker *const worker) {
memset(worker, 0, sizeof(*worker));
- worker->status_ = NOT_OK;
+ worker->status_ = VPX_WORKER_STATUS_NOT_OK;
}
static int sync(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
- change_state(worker, OK);
+ change_state(worker, VPX_WORKER_STATUS_OK);
#endif
- assert(worker->status_ <= OK);
+ assert(worker->status_ <= VPX_WORKER_STATUS_OK);
return !worker->had_error;
}
static int reset(VPxWorker *const worker) {
int ok = 1;
worker->had_error = 0;
- if (worker->status_ < OK) {
+ if (worker->status_ < VPX_WORKER_STATUS_OK) {
#if CONFIG_MULTITHREAD
worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
if (worker->impl_ == NULL) {
@@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) {
}
pthread_mutex_lock(&worker->impl_->mutex_);
ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
- if (ok) worker->status_ = OK;
+ if (ok) worker->status_ = VPX_WORKER_STATUS_OK;
pthread_mutex_unlock(&worker->impl_->mutex_);
if (!ok) {
pthread_mutex_destroy(&worker->impl_->mutex_);
@@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) {
return 0;
}
#else
- worker->status_ = OK;
+ worker->status_ = VPX_WORKER_STATUS_OK;
#endif
- } else if (worker->status_ > OK) {
+ } else if (worker->status_ > VPX_WORKER_STATUS_OK) {
ok = sync(worker);
}
- assert(!ok || (worker->status_ == OK));
+ assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK));
return ok;
}
@@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) {
static void launch(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
- change_state(worker, WORK);
+ change_state(worker, VPX_WORKER_STATUS_WORKING);
#else
execute(worker);
#endif
@@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) {
static void end(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
if (worker->impl_ != NULL) {
- change_state(worker, NOT_OK);
+ change_state(worker, VPX_WORKER_STATUS_NOT_OK);
pthread_join(worker->impl_->thread_, NULL);
pthread_mutex_destroy(&worker->impl_->mutex_);
pthread_cond_destroy(&worker->impl_->condition_);
@@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) {
worker->impl_ = NULL;
}
#else
- worker->status_ = NOT_OK;
+ worker->status_ = VPX_WORKER_STATUS_NOT_OK;
assert(worker->impl_ == NULL);
#endif
- assert(worker->status_ == NOT_OK);
+ assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);
}
//------------------------------------------------------------------------------
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
index 6d308e949b..11a1d74387 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.h
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -15,370 +15,22 @@
#ifndef VPX_VPX_UTIL_VPX_THREAD_H_
#define VPX_VPX_UTIL_VPX_THREAD_H_
-#include "./vpx_config.h"
-
#ifdef __cplusplus
extern "C" {
#endif
-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
-
-#if CONFIG_MULTITHREAD
-
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-#include <errno.h> // NOLINT
-#include <process.h> // NOLINT
-#include <windows.h> // NOLINT
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
-typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
- HANDLE waiting_sem_;
- HANDLE received_sem_;
- HANDLE signal_event_;
-} pthread_cond_t;
-#endif // _WIN32_WINNT >= 0x600
-
-#ifndef WINAPI_FAMILY_PARTITION
-#define WINAPI_PARTITION_DESKTOP 1
-#define WINAPI_FAMILY_PARTITION(x) x
-#endif
-
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define USE_CREATE_THREAD
-#endif
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-// _beginthreadex requires __stdcall
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREADFN unsigned int __stdcall
-#endif
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-#if _WIN32_WINNT >= 0x0501 // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
- WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
- unsigned int(__stdcall *start)(void *),
- void *arg) {
- (void)attr;
-#ifdef USE_CREATE_THREAD
- *thread = CreateThread(NULL, /* lpThreadAttributes */
- 0, /* dwStackSize */
- start, arg, 0, /* dwStackSize */
- NULL); /* lpThreadId */
-#else
- *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
- 0, /* unsigned stack_size */
- start, arg, 0, /* unsigned initflag */
- NULL); /* unsigned *thrdaddr */
-#endif
- if (*thread == NULL) return 1;
- SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
- return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
- (void)value_ptr;
- return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
- CloseHandle(thread) == 0);
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
- void *mutexattr) {
- (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
- InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
- InitializeCriticalSection(mutex);
-#endif
- return 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
- return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
- EnterCriticalSection(mutex);
- return 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
- LeaveCriticalSection(mutex);
- return 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
- DeleteCriticalSection(mutex);
- return 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- (void)condition;
-#else
- ok &= (CloseHandle(condition->waiting_sem_) != 0);
- ok &= (CloseHandle(condition->received_sem_) != 0);
- ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
- void *cond_attr) {
- (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- InitializeConditionVariable(condition);
-#else
- condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
- condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
- condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
- if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
- condition->signal_event_ == NULL) {
- pthread_cond_destroy(condition);
- return 1;
- }
-#endif
- return 0;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- WakeAllConditionVariable(condition);
-#else
- while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
- // a thread is waiting in pthread_cond_wait: allow it to be notified
- ok &= SetEvent(condition->signal_event_);
- // wait until the event is consumed so the signaler cannot consume
- // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
- WAIT_OBJECT_0);
- }
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- WakeConditionVariable(condition);
-#else
- if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
- // a thread is waiting in pthread_cond_wait: allow it to be notified
- ok = SetEvent(condition->signal_event_);
- // wait until the event is consumed so the signaler cannot consume
- // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
- WAIT_OBJECT_0);
- }
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
- pthread_mutex_t *const mutex) {
- int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
- // note that there is a consumer available so the signal isn't dropped in
- // pthread_cond_signal
- if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
- // now unlock the mutex so pthread_cond_signal may be issued
- pthread_mutex_unlock(mutex);
- ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
- WAIT_OBJECT_0);
- ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
- pthread_mutex_lock(mutex);
-#endif
- return !ok;
-}
-
-#elif defined(__OS2__)
-#define INCL_DOS
-#include <os2.h> // NOLINT
-
-#include <errno.h> // NOLINT
-#include <stdlib.h> // NOLINT
-#include <sys/builtin.h> // NOLINT
-
-#if defined(__STRICT_ANSI__)
-// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here.
-int _beginthread(void (*)(void *), void *, unsigned, void *);
-#endif
-
-#define pthread_t TID
-#define pthread_mutex_t HMTX
-
-typedef struct {
- HEV event_sem_;
- HEV ack_sem_;
- volatile unsigned wait_count_;
-} pthread_cond_t;
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#define THREADFN void *
-#define THREAD_RETURN(val) (val)
-
-typedef struct {
- void *(*start_)(void *);
- void *arg_;
-} thread_arg;
-
-static void thread_start(void *arg) {
- thread_arg targ = *(thread_arg *)arg;
- free(arg);
-
- targ.start_(targ.arg_);
-}
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
- void *(*start)(void *), void *arg) {
- int tid;
- thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
- if (targ == NULL) return 1;
-
- (void)attr;
-
- targ->start_ = start;
- targ->arg_ = arg;
- tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
- if (tid == -1) {
- free(targ);
- return 1;
- }
-
- *thread = tid;
- return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
- (void)value_ptr;
- return DosWaitThread(&thread, DCWW_WAIT) != 0;
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
- void *mutexattr) {
- (void)mutexattr;
- return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
- return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
- return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
- return DosReleaseMutexSem(*mutex) != 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
- return DosCloseMutexSem(*mutex) != 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
- int ok = 1;
- ok &= DosCloseEventSem(condition->event_sem_) == 0;
- ok &= DosCloseEventSem(condition->ack_sem_) == 0;
- return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
- void *cond_attr) {
- int ok = 1;
- (void)cond_attr;
-
- ok &=
- DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
- ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
- if (!ok) {
- pthread_cond_destroy(condition);
- return 1;
- }
- condition->wait_count_ = 0;
- return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
- int ok = 1;
-
- if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
- ok &= DosPostEventSem(condition->event_sem_) == 0;
- ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
- }
-
- return !ok;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
- int ok = 1;
-
- while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
- ok &= pthread_cond_signal(condition) == 0;
-
- return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
- pthread_mutex_t *const mutex) {
- int ok = 1;
-
- __atomic_increment(&condition->wait_count_);
-
- ok &= pthread_mutex_unlock(mutex) == 0;
-
- ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
-
- __atomic_decrement(&condition->wait_count_);
-
- ok &= DosPostEventSem(condition->ack_sem_) == 0;
-
- pthread_mutex_lock(mutex);
-
- return !ok;
-}
-#else // _WIN32
-#include <pthread.h> // NOLINT
-#define THREADFN void *
-#define THREAD_RETURN(val) val
-#endif
-
-#endif // CONFIG_MULTITHREAD
+#define MAX_NUM_THREADS 64
// State of the worker thread object
typedef enum {
- NOT_OK = 0, // object is unusable
- OK, // ready to work
- WORK // busy finishing the current task
+ VPX_WORKER_STATUS_NOT_OK = 0, // object is unusable
+ VPX_WORKER_STATUS_OK, // ready to work
+ VPX_WORKER_STATUS_WORKING // busy finishing the current task
} VPxWorkerStatus;
// Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
typedef int (*VPxWorkerHook)(void *, void *);
// Platform-dependent implementation details for the worker.
@@ -388,10 +40,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl;
typedef struct {
VPxWorkerImpl *impl_;
VPxWorkerStatus status_;
+ // Thread name for the debugger. If not NULL, must point to a string that
+ // outlives the worker thread. For portability, use a name <= 15 characters
+ // long (not including the terminating NUL character).
+ const char *thread_name;
VPxWorkerHook hook; // hook to call
void *data1; // first argument passed to 'hook'
void *data2; // second argument passed to 'hook'
- int had_error; // return value of the last call to 'hook'
+ int had_error; // true if a call to 'hook' returned false
} VPxWorker;
// The interface for all thread-worker related functions. All these functions
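Per the clarified contract, a hook reports failure by returning false, which sync() then surfaces via had_error. A minimal hypothetical hook honoring that contract:

    /* data1/data2 are opaque; this hypothetical hook sums 16 bytes from
     * data1 into the int pointed to by data2. */
    static int sum_hook(void *data1, void *data2) {
      const unsigned char *buf = (const unsigned char *)data1;
      int *out = (int *)data2;
      if (buf == NULL || out == NULL) return 0;  /* failure -> had_error */
      int total = 0;
      for (int i = 0; i < 16; ++i) total += buf[i];
      *out = total;
      return 1;  /* success */
    }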
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
index 1162714956..948e6d6f89 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_util.mk
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -10,6 +10,7 @@
UTIL_SRCS-yes += vpx_atomics.h
UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_pthread.h
UTIL_SRCS-yes += vpx_thread.c
UTIL_SRCS-yes += vpx_thread.h
UTIL_SRCS-yes += endian_inl.h