238 files changed, 10624 insertions, 6869 deletions
diff --git a/media/ffvpx/README_MOZILLA b/media/ffvpx/README_MOZILLA
index 1c00f2761a..b766be4abd 100644
--- a/media/ffvpx/README_MOZILLA
+++ b/media/ffvpx/README_MOZILLA
@@ -158,6 +158,8 @@ There are going to be a lot of changes in terms of symbols exported. Adjust
 `libavutil/avutil.symbols` and `libavcodec/avcodec.symbols` by removing and
 adding symbols until the build passes.
 
-Finally, apply the patch:
+Finally, apply the patches:
 - no-unicode-stdio.patch to avoid passing the infity symbol in unicode to an
   stdio.h function, that causes bug 1879740 issue on Windows.
+- opusenc-dtx.patch to allow enabling DTX in the opus encoder.
+
diff --git a/media/ffvpx/config_components_audio_only.h b/media/ffvpx/config_components_audio_only.h
index 0e61e23898..4ba265a9f7 100644
--- a/media/ffvpx/config_components_audio_only.h
+++ b/media/ffvpx/config_components_audio_only.h
@@ -787,7 +787,7 @@
 #define CONFIG_LIBMP3LAME_ENCODER 0
 #define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
 #define CONFIG_LIBOPENJPEG_ENCODER 0
-#define CONFIG_LIBOPUS_ENCODER 0
+#define CONFIG_LIBOPUS_ENCODER 1
 #define CONFIG_LIBRAV1E_ENCODER 0
 #define CONFIG_LIBSHINE_ENCODER 0
 #define CONFIG_LIBSPEEX_ENCODER 0
@@ -795,7 +795,7 @@
 #define CONFIG_LIBTHEORA_ENCODER 0
 #define CONFIG_LIBTWOLAME_ENCODER 0
 #define CONFIG_LIBVO_AMRWBENC_ENCODER 0
-#define CONFIG_LIBVORBIS_ENCODER 0
+#define CONFIG_LIBVORBIS_ENCODER 1
 #define CONFIG_LIBVPX_VP8_ENCODER 0
 #define CONFIG_LIBVPX_VP9_ENCODER 0
 #define CONFIG_LIBWEBP_ANIM_ENCODER 0
diff --git a/media/ffvpx/config_components_audio_video.h b/media/ffvpx/config_components_audio_video.h
index c8423a895e..220eb6ca52 100644
--- a/media/ffvpx/config_components_audio_video.h
+++ b/media/ffvpx/config_components_audio_video.h
@@ -810,7 +810,7 @@
 #define CONFIG_LIBMP3LAME_ENCODER 0
 #define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
 #define CONFIG_LIBOPENJPEG_ENCODER 0
-#define CONFIG_LIBOPUS_ENCODER 0
+#define CONFIG_LIBOPUS_ENCODER 1
 #define CONFIG_LIBRAV1E_ENCODER 0
 #define CONFIG_LIBSHINE_ENCODER 0
 #define CONFIG_LIBSPEEX_ENCODER 0
@@ -818,7 +818,7 @@
 #define CONFIG_LIBTHEORA_ENCODER 0
 #define CONFIG_LIBTWOLAME_ENCODER 0
 #define CONFIG_LIBVO_AMRWBENC_ENCODER 0
-#define CONFIG_LIBVORBIS_ENCODER 0
+#define CONFIG_LIBVORBIS_ENCODER 1
 #define CONFIG_LIBVPX_VP8_ENCODER 1
 #define CONFIG_LIBVPX_VP9_ENCODER 1
 #define CONFIG_LIBWEBP_ANIM_ENCODER 0
diff --git a/media/ffvpx/libavcodec/audio_frame_queue.c b/media/ffvpx/libavcodec/audio_frame_queue.c
new file mode 100644
index 0000000000..08b4b368c7
--- /dev/null
+++ b/media/ffvpx/libavcodec/audio_frame_queue.c
@@ -0,0 +1,113 @@
+/*
+ * Audio Frame Queue
+ * Copyright (c) 2012 Justin Ruggles
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "audio_frame_queue.h"
+#include "encode.h"
+#include "libavutil/avassert.h"
+
+av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
+{
+    afq->avctx = avctx;
+    afq->remaining_delay   = avctx->initial_padding;
+    afq->remaining_samples = avctx->initial_padding;
+    afq->frame_count       = 0;
+}
+
+void ff_af_queue_close(AudioFrameQueue *afq)
+{
+    if(afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "%d frames left in the queue on closing\n", afq->frame_count);
+    av_freep(&afq->frames);
+    memset(afq, 0, sizeof(*afq));
+}
+
+int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f)
+{
+    AudioFrame *new = av_fast_realloc(afq->frames, &afq->frame_alloc, sizeof(*afq->frames)*(afq->frame_count+1));
+    if(!new)
+        return AVERROR(ENOMEM);
+    afq->frames = new;
+    new += afq->frame_count;
+
+    /* get frame parameters */
+    new->duration = f->nb_samples;
+    new->duration += afq->remaining_delay;
+    if (f->pts != AV_NOPTS_VALUE) {
+        new->pts = av_rescale_q(f->pts,
+                                      afq->avctx->time_base,
+                                      (AVRational){ 1, afq->avctx->sample_rate });
+        new->pts -= afq->remaining_delay;
+        if(afq->frame_count && new[-1].pts >= new->pts)
+            av_log(afq->avctx, AV_LOG_WARNING, "Queue input is backward in time\n");
+    } else {
+        new->pts = AV_NOPTS_VALUE;
+    }
+    afq->remaining_delay = 0;
+
+    /* add frame sample count */
+    afq->remaining_samples += f->nb_samples;
+
+    afq->frame_count++;
+
+    return 0;
+}
+
+void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
+                        int64_t *duration)
+{
+    int64_t out_pts = AV_NOPTS_VALUE;
+    int removed_samples = 0;
+    int i;
+
+    if (afq->frame_count || afq->frame_alloc) {
+        if (afq->frames->pts != AV_NOPTS_VALUE)
+            out_pts = afq->frames->pts;
+    }
+    if(!afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples);
+    if (pts)
+        *pts = ff_samples_to_time_base(afq->avctx, out_pts);
+
+    for(i=0; nb_samples && i<afq->frame_count; i++){
+        int n= FFMIN(afq->frames[i].duration, nb_samples);
+        afq->frames[i].duration -= n;
+        nb_samples              -= n;
+        removed_samples         += n;
+        if(afq->frames[i].pts != AV_NOPTS_VALUE)
+            afq->frames[i].pts      += n;
+    }
+    afq->remaining_samples -= removed_samples;
+    i -= i && afq->frames[i-1].duration;
+    memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i));
+    afq->frame_count -= i;
+
+    if(nb_samples){
+        av_assert0(!afq->frame_count);
+        av_assert0(afq->remaining_samples == afq->remaining_delay);
+        if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE)
+            afq->frames[0].pts += nb_samples;
+        av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples);
+    }
+    if (duration)
+        *duration = ff_samples_to_time_base(afq->avctx, removed_samples);
+}
diff --git a/media/ffvpx/libavcodec/audio_frame_queue.h b/media/ffvpx/libavcodec/audio_frame_queue.h
new file mode 100644
index 0000000000..d8076eae54
--- /dev/null
+++ b/media/ffvpx/libavcodec/audio_frame_queue.h
@@ -0,0 +1,83 @@
+/*
+ * Audio Frame Queue
+ * Copyright (c) 2012 Justin Ruggles
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AUDIO_FRAME_QUEUE_H
+#define AVCODEC_AUDIO_FRAME_QUEUE_H
+
+#include "avcodec.h"
+
+typedef struct AudioFrame {
+    int64_t pts;
+    int duration;
+} AudioFrame;
+
+typedef struct AudioFrameQueue {
+    AVCodecContext *avctx;
+    int remaining_delay;
+    int remaining_samples;
+    AudioFrame *frames;
+    unsigned frame_count;
+    unsigned frame_alloc;
+} AudioFrameQueue;
+
+/**
+ * Initialize AudioFrameQueue.
+ *
+ * @param avctx context to use for time_base and av_log
+ * @param afq   queue context
+ */
+void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq);
+
+/**
+ * Close AudioFrameQueue.
+ *
+ * Frees memory if needed.
+ *
+ * @param afq queue context
+ */
+void ff_af_queue_close(AudioFrameQueue *afq);
+
+/**
+ * Add a frame to the queue.
+ *
+ * @param afq queue context
+ * @param f   frame to add to the queue
+ */
+int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f);
+
+/**
+ * Remove frame(s) from the queue.
+ *
+ * Retrieves the pts of the next available frame, or a generated pts based on
+ * the last frame duration if there are no frames left in the queue. The number
+ * of requested samples should be the full number of samples represented by the
+ * packet that will be output by the encoder. If fewer samples are available
+ * in the queue, a smaller value will be used for the output duration.
+ *
+ * @param afq           queue context
+ * @param nb_samples    number of samples to remove from the queue
+ * @param[out] pts      output packet pts
+ * @param[out] duration output packet duration
+ */
+void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
+                        int64_t *duration);
+
+#endif /* AVCODEC_AUDIO_FRAME_QUEUE_H */
diff --git a/media/ffvpx/libavcodec/codec_list.c b/media/ffvpx/libavcodec/codec_list.c
index 04259e3cd7..7c6b0ceacd 100644
--- a/media/ffvpx/libavcodec/codec_list.c
+++ b/media/ffvpx/libavcodec/codec_list.c
@@ -20,6 +20,9 @@ static const FFCodec * const codec_list[] = {
 #if CONFIG_LIBVORBIS_DECODER
     &ff_libvorbis_decoder,
 #endif
+#if CONFIG_LIBVORBIS_ENCODER
+    &ff_libvorbis_encoder,
+#endif
 #if CONFIG_PCM_ALAW_DECODER
     &ff_pcm_alaw_decoder,
 #endif
@@ -44,6 +47,9 @@ static const FFCodec * const codec_list[] = {
 #if CONFIG_LIBOPUS_DECODER
     &ff_libopus_decoder,
 #endif
+#if CONFIG_LIBOPUS_ENCODER
+    &ff_libopus_encoder,
+#endif
 #if CONFIG_LIBVPX_VP8_DECODER
     &ff_libvpx_vp8_decoder,
 #endif
diff --git a/media/ffvpx/libavcodec/libopusenc.c b/media/ffvpx/libavcodec/libopusenc.c
new file mode 100644
index 0000000000..68667e3350
--- /dev/null
+++ b/media/ffvpx/libavcodec/libopusenc.c
@@ -0,0 +1,610 @@
+/*
+ * Opus encoder using libopus
+ * Copyright (c) 2012 Nathan Caldwell
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <opus.h>
+#include <opus_multistream.h>
+
+#include "libavutil/channel_layout.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "libopus.h"
+#include "audio_frame_queue.h"
+#include "vorbis_data.h"
+
+typedef struct LibopusEncOpts {
+    int vbr;
+    int application;
+    int packet_loss;
+    int fec;
+    int complexity;
+    float frame_duration;
+    int packet_size;
+    int max_bandwidth;
+    int mapping_family;
+    int dtx;
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    int apply_phase_inv;
+#endif
+} LibopusEncOpts;
+
+typedef struct LibopusEncContext {
+    AVClass *class;
+    OpusMSEncoder *enc;
+    int stream_count;
+    uint8_t *samples;
+    LibopusEncOpts opts;
+    AudioFrameQueue afq;
+    const uint8_t *encoder_channel_map;
+} LibopusEncContext;
+
+static const uint8_t opus_coupled_streams[8] = {
+    0, 1, 1, 2, 2, 2, 2, 3
+};
+
+/* Opus internal to Vorbis channel order mapping written in the header */
+static const uint8_t opus_vorbis_channel_map[8][8] = {
+    { 0 },
+    { 0, 1 },
+    { 0, 2, 1 },
+    { 0, 1, 2, 3 },
+    { 0, 4, 1, 2, 3 },
+    { 0, 4, 1, 2, 3, 5 },
+    { 0, 4, 1, 2, 3, 5, 6 },
+    { 0, 6, 1, 2, 3, 4, 5, 7 },
+};
+
+/* libavcodec to libopus channel order mapping, passed to libopus */
+static const uint8_t libavcodec_libopus_channel_map[8][8] = {
+    { 0 },
+    { 0, 1 },
+    { 0, 1, 2 },
+    { 0, 1, 2, 3 },
+    { 0, 1, 3, 4, 2 },
+    { 0, 1, 4, 5, 2, 3 },
+    { 0, 1, 5, 6, 2, 4, 3 },
+    { 0, 1, 6, 7, 4, 5, 2, 3 },
+};
+
+static void libopus_write_header(AVCodecContext *avctx, int stream_count,
+                                 int coupled_stream_count,
+                                 int mapping_family,
+                                 const uint8_t *channel_mapping)
+{
+    uint8_t *p   = avctx->extradata;
+    int channels = avctx->ch_layout.nb_channels;
+
+    bytestream_put_buffer(&p, "OpusHead", 8);
+    bytestream_put_byte(&p, 1); /* Version */
+    bytestream_put_byte(&p, channels);
+    bytestream_put_le16(&p, avctx->initial_padding * 48000 / avctx->sample_rate); /* Lookahead samples at 48kHz */
+    bytestream_put_le32(&p, avctx->sample_rate); /* Original sample rate */
+    bytestream_put_le16(&p, 0); /* Gain of 0dB is recommended. */
+
+    /* Channel mapping */
+    bytestream_put_byte(&p, mapping_family);
+    if (mapping_family != 0) {
+        bytestream_put_byte(&p, stream_count);
+        bytestream_put_byte(&p, coupled_stream_count);
+        bytestream_put_buffer(&p, channel_mapping, channels);
+    }
+}
+
+static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
+                                     LibopusEncOpts *opts)
+{
+    int ret;
+
+    if (avctx->global_quality) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Quality-based encoding not supported, "
+               "please specify a bitrate and VBR setting.\n");
+        return AVERROR(EINVAL);
+    }
+
+    ret = opus_multistream_encoder_ctl(enc, OPUS_SET_BITRATE(avctx->bit_rate));
+    if (ret != OPUS_OK) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Failed to set bitrate: %s\n", opus_strerror(ret));
+        return ret;
+    }
+
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_COMPLEXITY(opts->complexity));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set complexity: %s\n", opus_strerror(ret));
+
+    ret = opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(!!opts->vbr));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set VBR: %s\n", opus_strerror(ret));
+
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_VBR_CONSTRAINT(opts->vbr == 2));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set constrained VBR: %s\n", opus_strerror(ret));
+
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_PACKET_LOSS_PERC(opts->packet_loss));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set expected packet loss percentage: %s\n",
+               opus_strerror(ret));
+
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_INBAND_FEC(opts->fec));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set inband FEC: %s\n",
+               opus_strerror(ret));
+
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_DTX(opts->dtx));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set DTX: %s\n",
+               opus_strerror(ret));
+
+    if (avctx->cutoff) {
+        ret = opus_multistream_encoder_ctl(enc,
+                                           OPUS_SET_MAX_BANDWIDTH(opts->max_bandwidth));
+        if (ret != OPUS_OK)
+            av_log(avctx, AV_LOG_WARNING,
+                   "Unable to set maximum bandwidth: %s\n", opus_strerror(ret));
+    }
+
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    ret = opus_multistream_encoder_ctl(enc,
+                                       OPUS_SET_PHASE_INVERSION_DISABLED(!opts->apply_phase_inv));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to set phase inversion: %s\n",
+               opus_strerror(ret));
+#endif
+    return OPUS_OK;
+}
+
+static int libopus_check_max_channels(AVCodecContext *avctx,
+                                      int max_channels) {
+    if (avctx->ch_layout.nb_channels > max_channels) {
+        av_log(avctx, AV_LOG_ERROR, "Opus mapping family undefined for %d channels.\n",
+               avctx->ch_layout.nb_channels);
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static int libopus_check_vorbis_layout(AVCodecContext *avctx, int mapping_family) {
+    av_assert2(avctx->ch_layout.nb_channels < FF_ARRAY_ELEMS(ff_vorbis_ch_layouts));
+
+    if (avctx->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) {
+        av_log(avctx, AV_LOG_WARNING,
+               "No channel layout specified. Opus encoder will use Vorbis "
+               "channel layout for %d channels.\n", avctx->ch_layout.nb_channels);
+    } else if (av_channel_layout_compare(&avctx->ch_layout, &ff_vorbis_ch_layouts[avctx->ch_layout.nb_channels - 1])) {
+        char name[32];
+
+        av_channel_layout_describe(&avctx->ch_layout, name, sizeof(name));
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid channel layout %s for specified mapping family %d.\n",
+               name, mapping_family);
+
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+static int libopus_validate_layout_and_get_channel_map(
+        AVCodecContext *avctx,
+        int mapping_family,
+        const uint8_t ** channel_map_result)
+{
+    const uint8_t * channel_map = NULL;
+    int ret;
+
+    switch (mapping_family) {
+    case -1:
+        ret = libopus_check_max_channels(avctx, 8);
+        if (ret == 0) {
+            ret = libopus_check_vorbis_layout(avctx, mapping_family);
+            /* Channels do not need to be reordered. */
+        }
+
+        break;
+    case 0:
+        ret = libopus_check_max_channels(avctx, 2);
+        if (ret == 0) {
+            ret = libopus_check_vorbis_layout(avctx, mapping_family);
+        }
+        break;
+    case 1:
+        /* Opus expects channels to be in Vorbis order. */
+        ret = libopus_check_max_channels(avctx, 8);
+        if (ret == 0) {
+            ret = libopus_check_vorbis_layout(avctx, mapping_family);
+            channel_map = ff_vorbis_channel_layout_offsets[avctx->ch_layout.nb_channels - 1];
+        }
+        break;
+    case 255:
+        ret = libopus_check_max_channels(avctx, 254);
+        break;
+    default:
+        av_log(avctx, AV_LOG_WARNING,
+               "Unknown channel mapping family %d. Output channel layout may be invalid.\n",
+               mapping_family);
+        ret = 0;
+    }
+
+    *channel_map_result = channel_map;
+    return ret;
+}
+
+static av_cold int libopus_encode_init(AVCodecContext *avctx)
+{
+    LibopusEncContext *opus = avctx->priv_data;
+    OpusMSEncoder *enc;
+    uint8_t libopus_channel_mapping[255];
+    int ret = OPUS_OK;
+    int channels = avctx->ch_layout.nb_channels;
+    int av_ret;
+    int coupled_stream_count, header_size, frame_size;
+    int mapping_family;
+
+    frame_size = opus->opts.frame_duration * 48000 / 1000;
+    switch (frame_size) {
+    case 120:
+    case 240:
+        if (opus->opts.application != OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+            av_log(avctx, AV_LOG_WARNING,
+                   "LPC mode cannot be used with a frame duration of less "
+                   "than 10ms. Enabling restricted low-delay mode.\n"
+                   "Use a longer frame duration if this is not what you want.\n");
+        /* Frame sizes less than 10 ms can only use MDCT mode, so switching to
+         * RESTRICTED_LOWDELAY avoids an unnecessary extra 2.5ms lookahead. */
+        opus->opts.application = OPUS_APPLICATION_RESTRICTED_LOWDELAY;
+    case 480:
+    case 960:
+    case 1920:
+    case 2880:
+#ifdef OPUS_FRAMESIZE_120_MS
+    case 3840:
+    case 4800:
+    case 5760:
+#endif
+        opus->opts.packet_size =
+        avctx->frame_size      = frame_size * avctx->sample_rate / 48000;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid frame duration: %g.\n"
+               "Frame duration must be exactly one of: 2.5, 5, 10, 20, 40"
+#ifdef OPUS_FRAMESIZE_120_MS
+               ", 60, 80, 100 or 120.\n",
+#else
+               " or 60.\n",
+#endif
+               opus->opts.frame_duration);
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->compression_level < 0 || avctx->compression_level > 10) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Compression level must be in the range 0 to 10. "
+               "Defaulting to 10.\n");
+        opus->opts.complexity = 10;
+    } else {
+        opus->opts.complexity = avctx->compression_level;
+    }
+
+    if (avctx->cutoff) {
+        switch (avctx->cutoff) {
+        case  4000:
+            opus->opts.max_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
+            break;
+        case  6000:
+            opus->opts.max_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
+            break;
+        case  8000:
+            opus->opts.max_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
+            break;
+        case 12000:
+            opus->opts.max_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND;
+            break;
+        case 20000:
+            opus->opts.max_bandwidth = OPUS_BANDWIDTH_FULLBAND;
+            break;
+        default:
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid frequency cutoff: %d. Using default maximum bandwidth.\n"
+                   "Cutoff frequency must be exactly one of: 4000, 6000, 8000, 12000 or 20000.\n",
+                   avctx->cutoff);
+            avctx->cutoff = 0;
+        }
+    }
+
+    /* Channels may need to be reordered to match opus mapping. */
+    av_ret = libopus_validate_layout_and_get_channel_map(avctx, opus->opts.mapping_family,
+                                                         &opus->encoder_channel_map);
+    if (av_ret) {
+        return av_ret;
+    }
+
+    if (opus->opts.mapping_family == -1) {
+        /* By default, use mapping family 1 for the header but use the older
+         * libopus multistream API to avoid surround masking. */
+
+        /* Set the mapping family so that the value is correct in the header */
+        mapping_family = channels > 2 ? 1 : 0;
+        coupled_stream_count = opus_coupled_streams[channels - 1];
+        opus->stream_count   = channels - coupled_stream_count;
+        memcpy(libopus_channel_mapping,
+               opus_vorbis_channel_map[channels - 1],
+               channels * sizeof(*libopus_channel_mapping));
+
+        enc = opus_multistream_encoder_create(
+            avctx->sample_rate, channels, opus->stream_count,
+            coupled_stream_count,
+            libavcodec_libopus_channel_map[channels - 1],
+            opus->opts.application, &ret);
+    } else {
+        /* Use the newer multistream API. The encoder will set the channel
+         * mapping and coupled stream counts to its internal defaults and will
+         * use surround masking analysis to save bits. */
+        mapping_family = opus->opts.mapping_family;
+        enc = opus_multistream_surround_encoder_create(
+            avctx->sample_rate, channels, mapping_family,
+            &opus->stream_count, &coupled_stream_count, libopus_channel_mapping,
+            opus->opts.application, &ret);
+    }
+
+    if (ret != OPUS_OK) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Failed to create encoder: %s\n", opus_strerror(ret));
+        return ff_opus_error_to_averror(ret);
+    }
+
+    if (!avctx->bit_rate) {
+        /* Sane default copied from opusenc */
+        avctx->bit_rate = 64000 * opus->stream_count +
+                          32000 * coupled_stream_count;
+        av_log(avctx, AV_LOG_WARNING,
+               "No bit rate set. Defaulting to %"PRId64" bps.\n", avctx->bit_rate);
+    }
+
+    if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * channels) {
+        av_log(avctx, AV_LOG_ERROR, "The bit rate %"PRId64" bps is unsupported. "
+               "Please choose a value between 500 and %d.\n", avctx->bit_rate,
+               256000 * channels);
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    ret = libopus_configure_encoder(avctx, enc, &opus->opts);
+    if (ret != OPUS_OK) {
+        ret = ff_opus_error_to_averror(ret);
+        goto fail;
+    }
+
+    /* Header includes channel mapping table if and only if mapping family is NOT 0 */
+    header_size = 19 + (mapping_family == 0 ? 0 : 2 + channels);
+    avctx->extradata = av_malloc(header_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate extradata.\n");
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    avctx->extradata_size = header_size;
+
+    opus->samples = av_calloc(frame_size, channels *
+                               av_get_bytes_per_sample(avctx->sample_fmt));
+    if (!opus->samples) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate samples buffer.\n");
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = opus_multistream_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&avctx->initial_padding));
+    if (ret != OPUS_OK)
+        av_log(avctx, AV_LOG_WARNING,
+               "Unable to get number of lookahead samples: %s\n",
+               opus_strerror(ret));
+
+    libopus_write_header(avctx, opus->stream_count, coupled_stream_count,
+                         mapping_family, libopus_channel_mapping);
+
+    ff_af_queue_init(avctx, &opus->afq);
+
+    opus->enc = enc;
+
+    return 0;
+
+fail:
+    opus_multistream_encoder_destroy(enc);
+    return ret;
+}
+
+static void libopus_copy_samples_with_channel_map(
+    uint8_t *dst, const uint8_t *src, const uint8_t *channel_map,
+    int nb_channels, int nb_samples, int bytes_per_sample) {
+    int sample, channel;
+    for (sample = 0; sample < nb_samples; ++sample) {
+        for (channel = 0; channel < nb_channels; ++channel) {
+            const size_t src_pos = bytes_per_sample * (nb_channels * sample + channel);
+            const size_t dst_pos = bytes_per_sample * (nb_channels * sample + channel_map[channel]);
+
+            memcpy(&dst[dst_pos], &src[src_pos], bytes_per_sample);
+        }
+    }
+}
+
+static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
+                          const AVFrame *frame, int *got_packet_ptr)
+{
+    LibopusEncContext *opus = avctx->priv_data;
+    const int bytes_per_sample = av_get_bytes_per_sample(avctx->sample_fmt);
+    const int channels         = avctx->ch_layout.nb_channels;
+    const int sample_size      = channels * bytes_per_sample;
+    const uint8_t *audio;
+    int ret;
+    int discard_padding;
+
+    if (frame) {
+        ret = ff_af_queue_add(&opus->afq, frame);
+        if (ret < 0)
+            return ret;
+        if (opus->encoder_channel_map != NULL) {
+            audio = opus->samples;
+            libopus_copy_samples_with_channel_map(
+                opus->samples, frame->data[0], opus->encoder_channel_map,
+                channels, frame->nb_samples, bytes_per_sample);
+        } else if (frame->nb_samples < opus->opts.packet_size) {
+            audio = opus->samples;
+            memcpy(opus->samples, frame->data[0], frame->nb_samples * sample_size);
+        } else
+            audio = frame->data[0];
+    } else {
+        if (!opus->afq.remaining_samples || (!opus->afq.frame_alloc && !opus->afq.frame_count))
+            return 0;
+        audio = opus->samples;
+        memset(opus->samples, 0, opus->opts.packet_size * sample_size);
+    }
+
+    /* Maximum packet size taken from opusenc in opus-tools. 120ms packets
+     * consist of 6 frames in one packet. The maximum frame size is 1275
+     * bytes along with the largest possible packet header of 7 bytes. */
+    if ((ret = ff_alloc_packet(avctx, avpkt, (1275 * 6 + 7) * opus->stream_count)) < 0)
+        return ret;
+
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT)
+        ret = opus_multistream_encode_float(opus->enc, (const float *)audio,
+                                            opus->opts.packet_size,
+                                            avpkt->data, avpkt->size);
+    else
+        ret = opus_multistream_encode(opus->enc, (const opus_int16 *)audio,
+                                      opus->opts.packet_size,
+                                      avpkt->data, avpkt->size);
+
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Error encoding frame: %s\n", opus_strerror(ret));
+        return ff_opus_error_to_averror(ret);
+    }
+
+    av_shrink_packet(avpkt, ret);
+
+    ff_af_queue_remove(&opus->afq, opus->opts.packet_size,
+                       &avpkt->pts, &avpkt->duration);
+
+    discard_padding = opus->opts.packet_size - avpkt->duration;
+    // Check if subtraction resulted in an overflow
+    if ((discard_padding < opus->opts.packet_size) != (avpkt->duration > 0))
+        return AVERROR(EINVAL);
+    if (discard_padding > 0) {
+        uint8_t* side_data = av_packet_new_side_data(avpkt,
+                                                     AV_PKT_DATA_SKIP_SAMPLES,
+                                                     10);
+        if (!side_data)
+            return AVERROR(ENOMEM);
+        AV_WL32(side_data + 4, discard_padding);
+    }
+
+    *got_packet_ptr = 1;
+
+    return 0;
+}
+
+static av_cold int libopus_encode_close(AVCodecContext *avctx)
+{
+    LibopusEncContext *opus = avctx->priv_data;
+
+    opus_multistream_encoder_destroy(opus->enc);
+
+    ff_af_queue_close(&opus->afq);
+
+    av_freep(&opus->samples);
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(LibopusEncContext, opts.x)
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption libopus_options[] = {
+    { "application",    "Intended application type",           OFFSET(application),    AV_OPT_TYPE_INT,   { .i64 = OPUS_APPLICATION_AUDIO }, OPUS_APPLICATION_VOIP, OPUS_APPLICATION_RESTRICTED_LOWDELAY, FLAGS, "application" },
+        { "voip",           "Favor improved speech intelligibility",   0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_VOIP },                0, 0, FLAGS, "application" },
+        { "audio",          "Favor faithfulness to the input",         0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_AUDIO },               0, 0, FLAGS, "application" },
+        { "lowdelay",       "Restrict to only the lowest delay modes", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_RESTRICTED_LOWDELAY }, 0, 0, FLAGS, "application" },
+    { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS },
+    { "packet_loss",    "Expected packet loss percentage",     OFFSET(packet_loss),    AV_OPT_TYPE_INT,   { .i64 = 0 },    0,   100,  FLAGS },
+    { "fec",             "Enable inband FEC. Expected packet loss must be non-zero",     OFFSET(fec),    AV_OPT_TYPE_BOOL,   { .i64 = 0 }, 0, 1, FLAGS },
+    { "vbr",            "Variable bit rate mode",              OFFSET(vbr),            AV_OPT_TYPE_INT,   { .i64 = 1 },    0,   2,    FLAGS, "vbr" },
+        { "off",            "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" },
+        { "on",             "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" },
+        { "constrained",    "Use constrained VBR",   0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" },
+    { "mapping_family", "Channel Mapping Family",              OFFSET(mapping_family), AV_OPT_TYPE_INT,   { .i64 = -1 },   -1,  255,  FLAGS, "mapping_family" },
+    { "dtx", "Enable DTX", OFFSET(dtx), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+    { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+#endif
+    { NULL },
+};
+
+static const AVClass libopus_class = {
+    .class_name = "libopus",
+    .item_name  = av_default_item_name,
+    .option     = libopus_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const FFCodecDefault libopus_defaults[] = {
+    { "b",                 "0" },
+    { "compression_level", "10" },
+    { NULL },
+};
+
+static const int libopus_sample_rates[] = {
+    48000, 24000, 16000, 12000, 8000, 0,
+};
+
+const FFCodec ff_libopus_encoder = {
+    .p.name          = "libopus",
+    CODEC_LONG_NAME("libopus Opus"),
+    .p.type          = AVMEDIA_TYPE_AUDIO,
+    .p.id            = AV_CODEC_ID_OPUS,
+    .p.capabilities  = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+                       AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .caps_internal   = FF_CODEC_CAP_NOT_INIT_THREADSAFE,
+    .priv_data_size  = sizeof(LibopusEncContext),
+    .init            = libopus_encode_init,
+    FF_CODEC_ENCODE_CB(libopus_encode),
+    .close           = libopus_encode_close,
+    .p.sample_fmts   = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                      AV_SAMPLE_FMT_FLT,
+                                                      AV_SAMPLE_FMT_NONE },
+    .p.supported_samplerates = libopus_sample_rates,
+    .p.priv_class    = &libopus_class,
+    .defaults        = libopus_defaults,
+    .p.wrapper_name  = "libopus",
+};
diff --git a/media/ffvpx/libavcodec/libvorbisenc.c b/media/ffvpx/libavcodec/libvorbisenc.c
new file mode 100644
index 0000000000..6331cf0d79
--- /dev/null
+++ b/media/ffvpx/libavcodec/libvorbisenc.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <vorbis/vorbisenc.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/fifo.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "audio_frame_queue.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "version.h"
+#include "vorbis_parser.h"
+
+
+/* Number of samples the user should send in each call.
+ * This value is used because it is the LCD of all possible frame sizes, so
+ * an output packet will always start at the same point as one of the input
+ * packets.
+ */
+#define LIBVORBIS_FRAME_SIZE 64
+
+#define BUFFER_SIZE (1024 * 64)
+
+typedef struct LibvorbisEncContext {
+    AVClass *av_class;                  /**< class for AVOptions            */
+    vorbis_info vi;                     /**< vorbis_info used during init   */
+    vorbis_dsp_state vd;                /**< DSP state used for analysis    */
+    vorbis_block vb;                    /**< vorbis_block used for analysis */
+    AVFifo *pkt_fifo;                   /**< output packet buffer           */
+    int eof;                            /**< end-of-file flag               */
+    int dsp_initialized;                /**< vd has been initialized        */
+    vorbis_comment vc;                  /**< VorbisComment info             */
+    double iblock;                      /**< impulse block bias option      */
+    AVVorbisParseContext *vp;           /**< parse context to get durations */
+    AudioFrameQueue afq;                /**< frame queue for timestamps     */
+} LibvorbisEncContext;
+
+static const AVOption options[] = {
+    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisEncContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+    { NULL }
+};
+
+static const FFCodecDefault defaults[] = {
+    { "b",  "0" },
+    { NULL },
+};
+
+static const AVClass vorbis_class = {
+    .class_name = "libvorbis",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const uint8_t vorbis_encoding_channel_layout_offsets[8][8] = {
+    { 0 },
+    { 0, 1 },
+    { 0, 2, 1 },
+    { 0, 1, 2, 3 },
+    { 0, 2, 1, 3, 4 },
+    { 0, 2, 1, 4, 5, 3 },
+    { 0, 2, 1, 5, 6, 4, 3 },
+    { 0, 2, 1, 6, 7, 4, 5, 3 },
+};
+
+static int vorbis_error_to_averror(int ov_err)
+{
+    switch (ov_err) {
+    case OV_EFAULT: return AVERROR_BUG;
+    case OV_EINVAL: return AVERROR(EINVAL);
+    case OV_EIMPL:  return AVERROR(EINVAL);
+    default:        return AVERROR_UNKNOWN;
+    }
+}
+
+static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
+{
+    LibvorbisEncContext *s = avctx->priv_data;
+    int channels = avctx->ch_layout.nb_channels;
+    double cfreq;
+    int ret;
+
+    if (avctx->flags & AV_CODEC_FLAG_QSCALE || !avctx->bit_rate) {
+        /* variable bitrate
+         * NOTE: we use the oggenc range of -1 to 10 for global_quality for
+         *       user convenience, but libvorbis uses -0.1 to 1.0.
+         */
+        float q = avctx->global_quality / (float)FF_QP2LAMBDA;
+        /* default to 3 if the user did not set quality or bitrate */
+        if (!(avctx->flags & AV_CODEC_FLAG_QSCALE))
+            q = 3.0;
+        if ((ret = vorbis_encode_setup_vbr(vi, channels,
+                                           avctx->sample_rate,
+                                           q / 10.0)))
+            goto error;
+    } else {
+        int minrate = avctx->rc_min_rate > 0 ? avctx->rc_min_rate : -1;
+        int maxrate = avctx->rc_max_rate > 0 ? avctx->rc_max_rate : -1;
+
+        /* average bitrate */
+        if ((ret = vorbis_encode_setup_managed(vi, channels,
+                                               avctx->sample_rate, maxrate,
+                                               avctx->bit_rate, minrate)))
+            goto error;
+
+        /* variable bitrate by estimate, disable slow rate management */
+        if (minrate == -1 && maxrate == -1)
+            if ((ret = vorbis_encode_ctl(vi, OV_ECTL_RATEMANAGE2_SET, NULL)))
+                goto error; /* should not happen */
+    }
+
+    /* cutoff frequency */
+    if (avctx->cutoff > 0) {
+        cfreq = avctx->cutoff / 1000.0;
+        if ((ret = vorbis_encode_ctl(vi, OV_ECTL_LOWPASS_SET, &cfreq)))
+            goto error; /* should not happen */
+    }
+
+    /* impulse block bias */
+    if (s->iblock) {
+        if ((ret = vorbis_encode_ctl(vi, OV_ECTL_IBLOCK_SET, &s->iblock)))
+            goto error;
+    }
+
+    if ((channels == 3 &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_SURROUND)) ||
+        (channels == 4 &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_2_2) &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_QUAD)) ||
+        (channels == 5 &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT0) &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT0_BACK)) ||
+        (channels == 6 &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT1) &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT1_BACK)) ||
+        (channels == 7 &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_6POINT1)) ||
+        (channels == 8 &&
+         av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_7POINT1))) {
+        if (avctx->ch_layout.order != AV_CHANNEL_ORDER_UNSPEC) {
+            char name[32];
+            av_channel_layout_describe(&avctx->ch_layout, name, sizeof(name));
+            av_log(avctx, AV_LOG_ERROR, "%s not supported by Vorbis: "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n", name);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Vorbis channel layout for "
+                                               "%d channels.\n", channels);
+        }
+    }
+
+    if ((ret = vorbis_encode_setup_init(vi)))
+        goto error;
+
+    return 0;
+error:
+    return vorbis_error_to_averror(ret);
+}
+
+/* How many bytes are needed for a buffer of length 'l' */
+static int xiph_len(int l)
+{
+    return 1 + l / 255 + l;
+}
+
+static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
+{
+    LibvorbisEncContext *s = avctx->priv_data;
+
+    /* notify vorbisenc this is EOF */
+    if (s->dsp_initialized)
+        vorbis_analysis_wrote(&s->vd, 0);
+
+    vorbis_block_clear(&s->vb);
+    vorbis_dsp_clear(&s->vd);
+    vorbis_info_clear(&s->vi);
+
+    av_fifo_freep2(&s->pkt_fifo);
+    ff_af_queue_close(&s->afq);
+
+    av_vorbis_parse_free(&s->vp);
+
+    return 0;
+}
+
+static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
+{
+    LibvorbisEncContext *s = avctx->priv_data;
+    ogg_packet header, header_comm, header_code;
+    uint8_t *p;
+    unsigned int offset;
+    int ret;
+
+    vorbis_info_init(&s->vi);
+    if ((ret = libvorbis_setup(&s->vi, avctx))) {
+        av_log(avctx, AV_LOG_ERROR, "encoder setup failed\n");
+        goto error;
+    }
+    if ((ret = vorbis_analysis_init(&s->vd, &s->vi))) {
+        av_log(avctx, AV_LOG_ERROR, "analysis init failed\n");
+        ret = vorbis_error_to_averror(ret);
+        goto error;
+    }
+    s->dsp_initialized = 1;
+    if ((ret = vorbis_block_init(&s->vd, &s->vb))) {
+        av_log(avctx, AV_LOG_ERROR, "dsp init failed\n");
+        ret = vorbis_error_to_averror(ret);
+        goto error;
+    }
+
+    vorbis_comment_init(&s->vc);
+    if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT))
+        vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
+
+    if ((ret = vorbis_analysis_headerout(&s->vd, &s->vc, &header, &header_comm,
+                                         &header_code))) {
+        ret = vorbis_error_to_averror(ret);
+        goto error;
+    }
+
+    avctx->extradata_size = 1 + xiph_len(header.bytes)      +
+                                xiph_len(header_comm.bytes) +
+                                header_code.bytes;
+    p = avctx->extradata = av_malloc(avctx->extradata_size +
+                                     AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!p) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+    p[0]    = 2;
+    offset  = 1;
+    offset += av_xiphlacing(&p[offset], header.bytes);
+    offset += av_xiphlacing(&p[offset], header_comm.bytes);
+    memcpy(&p[offset], header.packet, header.bytes);
+    offset += header.bytes;
+    memcpy(&p[offset], header_comm.packet, header_comm.bytes);
+    offset += header_comm.bytes;
+    memcpy(&p[offset], header_code.packet, header_code.bytes);
+    offset += header_code.bytes;
+    av_assert0(offset == avctx->extradata_size);
+
+    s->vp = av_vorbis_parse_init(avctx->extradata, avctx->extradata_size);
+    if (!s->vp) {
+        av_log(avctx, AV_LOG_ERROR, "invalid extradata\n");
+        return ret;
+    }
+
+    vorbis_comment_clear(&s->vc);
+
+    avctx->frame_size = LIBVORBIS_FRAME_SIZE;
+    ff_af_queue_init(avctx, &s->afq);
+
+    s->pkt_fifo = av_fifo_alloc2(BUFFER_SIZE, 1, 0);
+    if (!s->pkt_fifo) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
+    return 0;
+error:
+    libvorbis_encode_close(avctx);
+    return ret;
+}
+
+static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                  const AVFrame *frame, int *got_packet_ptr)
+{
+    LibvorbisEncContext *s = avctx->priv_data;
+    ogg_packet op;
+    int ret, duration;
+
+    /* send samples to libvorbis */
+    if (frame) {
+        const int samples = frame->nb_samples;
+        float **buffer;
+        int c, channels = s->vi.channels;
+
+        buffer = vorbis_analysis_buffer(&s->vd, samples);
+        for (c = 0; c < channels; c++) {
+            int co = (channels > 8) ? c :
+                     vorbis_encoding_channel_layout_offsets[channels - 1][c];
+            memcpy(buffer[c], frame->extended_data[co],
+                   samples * sizeof(*buffer[c]));
+        }
+        if ((ret = vorbis_analysis_wrote(&s->vd, samples)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
+            return vorbis_error_to_averror(ret);
+        }
+        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
+            return ret;
+    } else {
+        if (!s->eof && s->afq.frame_alloc)
+            if ((ret = vorbis_analysis_wrote(&s->vd, 0)) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
+                return vorbis_error_to_averror(ret);
+            }
+        s->eof = 1;
+    }
+
+    /* retrieve available packets from libvorbis */
+    while ((ret = vorbis_analysis_blockout(&s->vd, &s->vb)) == 1) {
+        if ((ret = vorbis_analysis(&s->vb, NULL)) < 0)
+            break;
+        if ((ret = vorbis_bitrate_addblock(&s->vb)) < 0)
+            break;
+
+        /* add any available packets to the output packet buffer */
+        while ((ret = vorbis_bitrate_flushpacket(&s->vd, &op)) == 1) {
+            if (av_fifo_can_write(s->pkt_fifo) < sizeof(ogg_packet) + op.bytes) {
+                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small\n");
+                return AVERROR_BUG;
+            }
+            av_fifo_write(s->pkt_fifo, &op, sizeof(ogg_packet));
+            av_fifo_write(s->pkt_fifo, op.packet, op.bytes);
+        }
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "error getting available packets\n");
+            break;
+        }
+    }
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error getting available packets\n");
+        return vorbis_error_to_averror(ret);
+    }
+
+    /* Read an available packet if possible */
+    if (av_fifo_read(s->pkt_fifo, &op, sizeof(ogg_packet)) < 0)
+        return 0;
+
+    if ((ret = ff_get_encode_buffer(avctx, avpkt, op.bytes, 0)) < 0)
+        return ret;
+    av_fifo_read(s->pkt_fifo, avpkt->data, op.bytes);
+
+    avpkt->pts = ff_samples_to_time_base(avctx, op.granulepos);
+
+    duration = av_vorbis_parse_frame(s->vp, avpkt->data, avpkt->size);
+    if (duration > 0) {
+        /* we do not know encoder delay until we get the first packet from
+         * libvorbis, so we have to update the AudioFrameQueue counts */
+        if (!avctx->initial_padding && s->afq.frames) {
+            avctx->initial_padding    = duration;
+            av_assert0(!s->afq.remaining_delay);
+            s->afq.frames->duration  += duration;
+            if (s->afq.frames->pts != AV_NOPTS_VALUE)
+                s->afq.frames->pts       -= duration;
+            s->afq.remaining_samples += duration;
+        }
+        ff_af_queue_remove(&s->afq, duration, &avpkt->pts, &avpkt->duration);
+    }
+
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+const FFCodec ff_libvorbis_encoder = {
+    .p.name         = "libvorbis",
+    CODEC_LONG_NAME("libvorbis"),
+    .p.type         = AVMEDIA_TYPE_AUDIO,
+    .p.id           = AV_CODEC_ID_VORBIS,
+    .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+                      AV_CODEC_CAP_SMALL_LAST_FRAME,
+    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE,
+    .priv_data_size = sizeof(LibvorbisEncContext),
+    .init           = libvorbis_encode_init,
+    FF_CODEC_ENCODE_CB(libvorbis_encode_frame),
+    .close          = libvorbis_encode_close,
+    .p.sample_fmts  = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .p.priv_class   = &vorbis_class,
+    .defaults       = defaults,
+    .p.wrapper_name = "libvorbis",
+};
diff --git a/media/ffvpx/libavcodec/moz.build b/media/ffvpx/libavcodec/moz.build
index 0ba603d172..886fa7a2cb 100644
--- a/media/ffvpx/libavcodec/moz.build
+++ b/media/ffvpx/libavcodec/moz.build
@@ -20,6 +20,7 @@ LOCAL_INCLUDES += ['/modules/fdlibm/inexact-math-override']
 SharedLibrary('mozavcodec')
 SOURCES += [
     'allcodecs.c',
+    'audio_frame_queue.c',
     'avcodec.c',
     'avdct.c',
     'avfft.c',
@@ -47,7 +48,9 @@ SOURCES += [
     'jrevdct.c',
     'libopus.c',
     'libopusdec.c',
+    'libopusenc.c',
     'libvorbisdec.c',
+    'libvorbisenc.c',
     'log2_tab.c',
     'mpegaudio.c',
     'mpegaudiodata.c',
diff --git a/media/ffvpx/libavutil/avutil.symbols b/media/ffvpx/libavutil/avutil.symbols
index 0ad6fad9cd..5ee7afb855 100644
--- a/media/ffvpx/libavutil/avutil.symbols
+++ b/media/ffvpx/libavutil/avutil.symbols
@@ -92,6 +92,7 @@ av_fifo_alloc
 av_fifo_alloc2
 av_fifo_alloc_array
 av_fifo_can_read
+av_fifo_can_write
 av_fifo_drain
 av_fifo_drain2
 av_fifo_free
diff --git a/media/ffvpx/opusenc-dtx.patch b/media/ffvpx/opusenc-dtx.patch
new file mode 100644
index 0000000000..bf9fc9de87
--- /dev/null
+++ b/media/ffvpx/opusenc-dtx.patch
@@ -0,0 +1,63 @@
+diff --git a/media/ffvpx/libavcodec/libopusenc.c b/media/ffvpx/libavcodec/libopusenc.c
+--- a/media/ffvpx/libavcodec/libopusenc.c
++++ b/media/ffvpx/libavcodec/libopusenc.c
+@@ -37,16 +37,17 @@ typedef struct LibopusEncOpts {
+     int application;
+     int packet_loss;
+     int fec;
+     int complexity;
+     float frame_duration;
+     int packet_size;
+     int max_bandwidth;
+     int mapping_family;
++    int dtx;
+ #ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+     int apply_phase_inv;
+ #endif
+ } LibopusEncOpts;
+ 
+ typedef struct LibopusEncContext {
+     AVClass *class;
+     OpusMSEncoder *enc;
+@@ -154,16 +155,23 @@ static int libopus_configure_encoder(AVC
+ 
+     ret = opus_multistream_encoder_ctl(enc,
+                                        OPUS_SET_INBAND_FEC(opts->fec));
+     if (ret != OPUS_OK)
+         av_log(avctx, AV_LOG_WARNING,
+                "Unable to set inband FEC: %s\n",
+                opus_strerror(ret));
+ 
++    ret = opus_multistream_encoder_ctl(enc,
++                                       OPUS_SET_DTX(opts->dtx));
++    if (ret != OPUS_OK)
++        av_log(avctx, AV_LOG_WARNING,
++               "Unable to set DTX: %s\n",
++               opus_strerror(ret));
++
+     if (avctx->cutoff) {
+         ret = opus_multistream_encoder_ctl(enc,
+                                            OPUS_SET_MAX_BANDWIDTH(opts->max_bandwidth));
+         if (ret != OPUS_OK)
+             av_log(avctx, AV_LOG_WARNING,
+                    "Unable to set maximum bandwidth: %s\n", opus_strerror(ret));
+     }
+ 
+@@ -551,16 +559,17 @@ static const AVOption libopus_options[] 
+     { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS },
+     { "packet_loss",    "Expected packet loss percentage",     OFFSET(packet_loss),    AV_OPT_TYPE_INT,   { .i64 = 0 },    0,   100,  FLAGS },
+     { "fec",             "Enable inband FEC. Expected packet loss must be non-zero",     OFFSET(fec),    AV_OPT_TYPE_BOOL,   { .i64 = 0 }, 0, 1, FLAGS },
+     { "vbr",            "Variable bit rate mode",              OFFSET(vbr),            AV_OPT_TYPE_INT,   { .i64 = 1 },    0,   2,    FLAGS, "vbr" },
+         { "off",            "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" },
+         { "on",             "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" },
+         { "constrained",    "Use constrained VBR",   0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" },
+     { "mapping_family", "Channel Mapping Family",              OFFSET(mapping_family), AV_OPT_TYPE_INT,   { .i64 = -1 },   -1,  255,  FLAGS, "mapping_family" },
++    { "dtx", "Enable DTX", OFFSET(dtx), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+ #ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+     { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+ #endif
+     { NULL },
+ };
+ 
+ static const AVClass libopus_class = {
+     .class_name = "libopus",
diff --git a/media/libaom/0001-errno.patch b/media/libaom/0001-errno.patch
new file mode 100644
index 0000000000..6040c42e38
--- /dev/null
+++ b/media/libaom/0001-errno.patch
@@ -0,0 +1,22 @@
+diff --git a/aom_util/aom_pthread.h b/aom/aom_util/aom_pthread.h
+--- a/aom_util/aom_pthread.h
++++ b/aom_util/aom_pthread.h
+@@ -30,16 +30,18 @@ extern "C" {
+ #define WIN32_LEAN_AND_MEAN
+ #include <process.h>  // NOLINT
+ #include <stddef.h>   // NOLINT
+ #include <windows.h>  // NOLINT
+ typedef HANDLE pthread_t;
+ typedef int pthread_attr_t;
+ typedef CRITICAL_SECTION pthread_mutex_t;
+ 
++#include <errno.h>
++
+ #if _WIN32_WINNT < 0x0600
+ #error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+ #endif
+ typedef CONDITION_VARIABLE pthread_cond_t;
+ 
+ #ifndef WINAPI_FAMILY_PARTITION
+ #define WINAPI_PARTITION_DESKTOP 1
+ #define WINAPI_FAMILY_PARTITION(x) x
diff --git a/media/libaom/0002-mmloadusi64.patch b/media/libaom/0002-mmloadusi64.patch
new file mode 100644
index 0000000000..9d23c90f22
--- /dev/null
+++ b/media/libaom/0002-mmloadusi64.patch
@@ -0,0 +1,79 @@
+diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
+--- a/aom_dsp/x86/synonyms.h
++++ b/aom_dsp/x86/synonyms.h
+@@ -41,22 +41,35 @@ static INLINE __m128i xx_loadl_64(const 
+ static INLINE __m128i xx_load_128(const void *a) {
+   return _mm_load_si128((const __m128i *)a);
+ }
+ 
+ static INLINE __m128i xx_loadu_128(const void *a) {
+   return _mm_loadu_si128((const __m128i *)a);
+ }
+ 
++
++// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
++// manually on older compilers.
++#if !defined(__clang__) && __GNUC_MAJOR__ < 9
++static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
++  __m64 hi_, lo_;
++  memcpy(&hi_, hi, sizeof(hi_));
++  memcpy(&lo_, lo, sizeof(lo_));
++  return _mm_set_epi64(hi_, lo_);
++}
++#endif
++#else
+ // Load 64 bits from each of hi and low, and pack into an SSE register
+ // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
+ // the strict aliasing rule, this takes a different approach
+ static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+   return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
+ }
++#endif
+ 
+ static INLINE void xx_storel_32(void *const a, const __m128i v) {
+   const int val = _mm_cvtsi128_si32(v);
+   memcpy(a, &val, sizeof(val));
+ }
+ 
+ static INLINE void xx_storel_64(void *const a, const __m128i v) {
+   _mm_storel_epi64((__m128i *)a, v);
+diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
+--- a/aom_dsp/x86/synonyms_avx2.h
++++ b/aom_dsp/x86/synonyms_avx2.h
+@@ -66,21 +66,36 @@ static INLINE __m256i yy_set1_64_from_32
+ 
+ // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+ // therefore define an equivalent function using a different intrinsic.
+ // ([ hi ], [ lo ]) -> [ hi ][ lo ]
+ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+ }
+ 
++#define GCC_VERSION (__GNUC__ * 10000 \
++                     + __GNUC_MINOR__ * 100 \
++                     + __GNUC_PATCHLEVEL__)
++
++// _mm256_loadu2_m128i has been introduced in GCC 10.1
++#if !defined(__clang__) && GCC_VERSION < 101000
++static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
++  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
++  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
++  return _mm256_set_m128i(mhi, mlo);
++}
++#else
+ static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+   __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+   __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+   return yy_set_m128i(mhi, mlo);
+ }
++#endif
++
++#undef GCC_VERSION
+ 
+ static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+   _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
+   _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
+ }
+ 
+ static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+   const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
diff --git a/media/libaom/config/generic/config/aom_config.asm b/media/libaom/config/generic/config/aom_config.asm
index be0715562c..0f329a7df5 100644
--- a/media/libaom/config/generic/config/aom_config.asm
+++ b/media/libaom/config/generic/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 0
 HAVE_SSE4_2 equ 0
 HAVE_SSSE3 equ 0
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/generic/config/aom_config.h b/media/libaom/config/generic/config/aom_config.h
index a695b0b3e6..c89e1d755c 100644
--- a/media/libaom/config/generic/config/aom_config.h
+++ b/media/libaom/config/generic/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 0
 #define HAVE_SSSE3 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/generic/config/aom_dsp_rtcd.h b/media/libaom/config/generic/config/aom_dsp_rtcd.h
index 0418b3568e..a61dc47a47 100644
--- a/media/libaom/config/generic/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/generic/config/aom_dsp_rtcd.h
@@ -46,9 +46,15 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int
 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 #define aom_comp_mask_pred aom_comp_mask_pred_c
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+#define aom_compute_correlation aom_compute_correlation_c
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 #define aom_compute_flow_at_point aom_compute_flow_at_point_c
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+#define aom_compute_mean_stddev aom_compute_mean_stddev_c
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
@@ -4693,9 +4699,6 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const
 int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
 #define aom_vector_var aom_vector_var_c
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
 void aom_dsp_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/media/libaom/config/generic/config/aom_scale_rtcd.h b/media/libaom/config/generic/config/aom_scale_rtcd.h
index 733b2d9ea1..dd09c4e3a6 100644
--- a/media/libaom/config/generic/config/aom_scale_rtcd.h
+++ b/media/libaom/config/generic/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm
index 63034fd7e2..1ec673f263 100644
--- a/media/libaom/config/linux/arm/config/aom_config.asm
+++ b/media/libaom/config/linux/arm/config/aom_config.asm
@@ -53,6 +53,7 @@
 .equ CONFIG_OUTPUT_FRAME_SIZE, 0
 .equ CONFIG_PARTITION_SEARCH_ORDER, 0
 .equ CONFIG_PIC, 1
+.equ CONFIG_QUANT_MATRIX, 1
 .equ CONFIG_RATECTRL_LOG, 0
 .equ CONFIG_RD_COMMAND, 0
 .equ CONFIG_RD_DEBUG, 0
@@ -87,6 +88,7 @@
 .equ HAVE_SSE4_2, 0
 .equ HAVE_SSSE3, 0
 .equ HAVE_SVE, 0
+.equ HAVE_SVE2, 0
 .equ HAVE_VSX, 0
 .equ HAVE_WXWIDGETS, 0
 .equ STATIC_LINK_JXL, 0
diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h
index 3cbe7bf169..fb73e8431e 100644
--- a/media/libaom/config/linux/arm/config/aom_config.h
+++ b/media/libaom/config/linux/arm/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 1
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 0
 #define HAVE_SSSE3 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
index 50ee78932c..fffcc5a3e9 100644
--- a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
@@ -54,10 +54,16 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, in
 void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+#define aom_compute_correlation aom_compute_correlation_c
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+#define aom_compute_mean_stddev aom_compute_mean_stddev_c
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
@@ -6212,9 +6218,6 @@ int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
 void aom_dsp_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
index d296957f84..1024a666fe 100644
--- a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/ia32/config/aom_config.asm b/media/libaom/config/linux/ia32/config/aom_config.asm
index e75260cb09..4fd596e34b 100644
--- a/media/libaom/config/linux/ia32/config/aom_config.asm
+++ b/media/libaom/config/linux/ia32/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 1
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/linux/ia32/config/aom_config.h b/media/libaom/config/linux/ia32/config/aom_config.h
index b0e5b5cabc..256f556662 100644
--- a/media/libaom/config/linux/ia32/config/aom_config.h
+++ b/media/libaom/config/linux/ia32/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 1
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
index a19adf5f61..93472f0e92 100644
--- a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,6 +912,7 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
 
 unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
@@ -5130,7 +5140,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 RTCD_EXTERN unsigned int (*aom_sad16x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5466,7 +5477,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
 #define aom_sad_skip_16x4 aom_sad_skip_16x4_c
 
 void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5867,243 +5879,199 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui
 #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c
 
 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
@@ -6326,11 +6294,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
@@ -6360,14 +6323,19 @@ static void setup_rtcd_internal(void)
     aom_comp_mask_pred = aom_comp_mask_pred_c;
     if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
     if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
     aom_compute_flow_at_point = aom_compute_flow_at_point_c;
     if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
     aom_convolve8_horiz = aom_convolve8_horiz_c;
-    if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
     aom_convolve8_vert = aom_convolve8_vert_c;
-    if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
     if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
     aom_convolve_copy = aom_convolve_copy_c;
@@ -6768,6 +6736,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
     aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_c;
     if (flags & HAS_SSE2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
     aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_c;
     if (flags & HAS_SSE2) aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_sse2;
     aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
@@ -8526,6 +8495,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) aom_sad16x4 = aom_sad16x4_sse2;
     aom_sad16x4_avg = aom_sad16x4_avg_c;
     if (flags & HAS_SSE2) aom_sad16x4_avg = aom_sad16x4_avg_sse2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
     aom_sad16x4x4d = aom_sad16x4x4d_c;
     if (flags & HAS_SSE2) aom_sad16x4x4d = aom_sad16x4x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
@@ -8695,6 +8666,8 @@ static void setup_rtcd_internal(void)
     aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_c;
     if (flags & HAS_SSE2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
     aom_sad_skip_16x64 = aom_sad_skip_16x64_c;
     if (flags & HAS_SSE2) aom_sad_skip_16x64 = aom_sad_skip_16x64_sse2;
     aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_c;
@@ -8897,157 +8870,113 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
     if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
     aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
     aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
     aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
     aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
     aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
     aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
     aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
     aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
     aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
     aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
     aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
     aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
     aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
     aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
     aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
     aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
     aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
     aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
     aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
     aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
     aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
     aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
     aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
     aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
     aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
     aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
     aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
     aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
     aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
     aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
     aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
     aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
     aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
     aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
     aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
     aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
     aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
     aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
     aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
     aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
     aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
     aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
     aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
     aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
     aom_subtract_block = aom_subtract_block_c;
     if (flags & HAS_SSE2) aom_subtract_block = aom_subtract_block_sse2;
@@ -9172,9 +9101,6 @@ static void setup_rtcd_internal(void)
     aom_vector_var = aom_vector_var_c;
     if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
     if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
-    av1_compute_cross_correlation = av1_compute_cross_correlation_c;
-    if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
-    if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
 }
 #endif
 
diff --git a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/ia32/config/av1_rtcd.h b/media/libaom/config/linux/ia32/config/av1_rtcd.h
index 3f404f61c8..37716517bf 100644
--- a/media/libaom/config/linux/ia32/config/av1_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/av1_rtcd.h
@@ -265,7 +265,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
 #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
 
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -764,84 +763,72 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
 
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 
 void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 
 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 
 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -969,7 +956,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
     if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
     av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
-    if (flags & HAS_SSE2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
     if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
     if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
     av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_c;
@@ -1176,62 +1162,50 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
     if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
     cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
     if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
     cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
-    if (flags & HAS_SSE2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
     if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
     cdef_filter_16_0 = cdef_filter_16_0_c;
-    if (flags & HAS_SSE2) cdef_filter_16_0 = cdef_filter_16_0_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
     cdef_filter_16_1 = cdef_filter_16_1_c;
-    if (flags & HAS_SSE2) cdef_filter_16_1 = cdef_filter_16_1_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
     cdef_filter_16_2 = cdef_filter_16_2_c;
-    if (flags & HAS_SSE2) cdef_filter_16_2 = cdef_filter_16_2_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
     cdef_filter_16_3 = cdef_filter_16_3_c;
-    if (flags & HAS_SSE2) cdef_filter_16_3 = cdef_filter_16_3_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
     cdef_filter_8_0 = cdef_filter_8_0_c;
-    if (flags & HAS_SSE2) cdef_filter_8_0 = cdef_filter_8_0_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
     cdef_filter_8_1 = cdef_filter_8_1_c;
-    if (flags & HAS_SSE2) cdef_filter_8_1 = cdef_filter_8_1_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
     cdef_filter_8_2 = cdef_filter_8_2_c;
-    if (flags & HAS_SSE2) cdef_filter_8_2 = cdef_filter_8_2_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
     cdef_filter_8_3 = cdef_filter_8_3_c;
-    if (flags & HAS_SSE2) cdef_filter_8_3 = cdef_filter_8_3_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
     cdef_find_dir = cdef_find_dir_c;
-    if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
     if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
     if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
     cdef_find_dir_dual = cdef_find_dir_dual_c;
-    if (flags & HAS_SSE2) cdef_find_dir_dual = cdef_find_dir_dual_sse2;
     if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
     if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
diff --git a/media/libaom/config/linux/x64/config/aom_config.asm b/media/libaom/config/linux/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/linux/x64/config/aom_config.asm
+++ b/media/libaom/config/linux/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/linux/x64/config/aom_config.h b/media/libaom/config/linux/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/linux/x64/config/aom_config.h
+++ b/media/libaom/config/linux/x64/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
 
 unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 #define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 #define aom_sad16x4_avg aom_sad16x4_avg_sse2
 
 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
 #define aom_sad_skip_16x4 aom_sad_skip_16x4_c
 
 void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
 #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
 
 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
     aom_comp_mask_pred = aom_comp_mask_pred_c;
     if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
     if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
     aom_compute_flow_at_point = aom_compute_flow_at_point_c;
     if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
-    aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+    aom_convolve8_horiz = aom_convolve8_horiz_c;
     if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
-    aom_convolve8_vert = aom_convolve8_vert_sse2;
+    aom_convolve8_vert = aom_convolve8_vert_c;
     if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
     aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
     aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+    aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
     aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
     if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
     aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
     aom_sad16x32x4d = aom_sad16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
     aom_sad16x4x4d = aom_sad16x4x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
     aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
     aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
     aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
     aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
     aom_sse = aom_sse_c;
     if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
     if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
-    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
-    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
-    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
-    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
-    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
-    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
-    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
-    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
-    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
-    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
-    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
-    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
-    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
-    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
-    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
-    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
-    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
-    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
-    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
-    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
-    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
-    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
-    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
-    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
-    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
-    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
-    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
-    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
-    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
-    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
-    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
-    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
-    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
-    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
-    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
-    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
-    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
-    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
-    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
-    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
-    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
-    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
-    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
-    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
     aom_subtract_block = aom_subtract_block_sse2;
     if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
     aom_vector_var = aom_vector_var_c;
     if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
     if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
-    av1_compute_cross_correlation = av1_compute_cross_correlation_c;
-    if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
-    if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
 }
 #endif
 
diff --git a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/x64/config/av1_rtcd.h b/media/libaom/config/linux/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/linux/x64/config/av1_rtcd.h
+++ b/media/libaom/config/linux/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
 #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
 
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 
 void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
 
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 
 void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 
 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 
 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
     av1_convolve_y_sr = av1_convolve_y_sr_sse2;
     if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
-    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
     if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
     if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
     av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
     av1_inv_txfm_add = av1_inv_txfm_add_c;
     if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
     if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
-    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
     if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
     if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
     av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
     av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
     if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
-    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
-    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
-    cdef_filter_16_0 = cdef_filter_16_0_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+    cdef_filter_16_0 = cdef_filter_16_0_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
-    cdef_filter_16_1 = cdef_filter_16_1_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+    cdef_filter_16_1 = cdef_filter_16_1_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
-    cdef_filter_16_2 = cdef_filter_16_2_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+    cdef_filter_16_2 = cdef_filter_16_2_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
-    cdef_filter_16_3 = cdef_filter_16_3_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+    cdef_filter_16_3 = cdef_filter_16_3_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
-    cdef_filter_8_0 = cdef_filter_8_0_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+    cdef_filter_8_0 = cdef_filter_8_0_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
-    cdef_filter_8_1 = cdef_filter_8_1_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+    cdef_filter_8_1 = cdef_filter_8_1_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
-    cdef_filter_8_2 = cdef_filter_8_2_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+    cdef_filter_8_2 = cdef_filter_8_2_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
-    cdef_filter_8_3 = cdef_filter_8_3_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+    cdef_filter_8_3 = cdef_filter_8_3_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
-    cdef_find_dir = cdef_find_dir_sse2;
-    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    cdef_find_dir = cdef_find_dir_c;
     if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
-    cdef_find_dir_dual = cdef_find_dir_dual_sse2;
-    if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+    cdef_find_dir_dual = cdef_find_dir_dual_c;
     if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
     cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
diff --git a/media/libaom/config/mac/x64/config/aom_config.asm b/media/libaom/config/mac/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/mac/x64/config/aom_config.asm
+++ b/media/libaom/config/mac/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/mac/x64/config/aom_config.h b/media/libaom/config/mac/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/mac/x64/config/aom_config.h
+++ b/media/libaom/config/mac/x64/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
 
 unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 #define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 #define aom_sad16x4_avg aom_sad16x4_avg_sse2
 
 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
 #define aom_sad_skip_16x4 aom_sad_skip_16x4_c
 
 void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
 #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
 
 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
     aom_comp_mask_pred = aom_comp_mask_pred_c;
     if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
     if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
     aom_compute_flow_at_point = aom_compute_flow_at_point_c;
     if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
-    aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+    aom_convolve8_horiz = aom_convolve8_horiz_c;
     if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
-    aom_convolve8_vert = aom_convolve8_vert_sse2;
+    aom_convolve8_vert = aom_convolve8_vert_c;
     if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
     aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
     aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+    aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
     aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
     if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
     aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
     aom_sad16x32x4d = aom_sad16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
     aom_sad16x4x4d = aom_sad16x4x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
     aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
     aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
     aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
     aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
     aom_sse = aom_sse_c;
     if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
     if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
-    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
-    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
-    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
-    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
-    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
-    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
-    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
-    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
-    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
-    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
-    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
-    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
-    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
-    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
-    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
-    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
-    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
-    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
-    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
-    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
-    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
-    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
-    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
-    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
-    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
-    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
-    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
-    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
-    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
-    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
-    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
-    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
-    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
-    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
-    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
-    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
-    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
-    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
-    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
-    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
-    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
-    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
-    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
-    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
     aom_subtract_block = aom_subtract_block_sse2;
     if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
     aom_vector_var = aom_vector_var_c;
     if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
     if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
-    av1_compute_cross_correlation = av1_compute_cross_correlation_c;
-    if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
-    if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
 }
 #endif
 
diff --git a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/mac/x64/config/av1_rtcd.h b/media/libaom/config/mac/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/mac/x64/config/av1_rtcd.h
+++ b/media/libaom/config/mac/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
 #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
 
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 
 void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
 
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 
 void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 
 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 
 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
     av1_convolve_y_sr = av1_convolve_y_sr_sse2;
     if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
-    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
     if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
     if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
     av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
     av1_inv_txfm_add = av1_inv_txfm_add_c;
     if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
     if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
-    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
     if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
     if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
     av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
     av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
     if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
-    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
-    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
-    cdef_filter_16_0 = cdef_filter_16_0_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+    cdef_filter_16_0 = cdef_filter_16_0_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
-    cdef_filter_16_1 = cdef_filter_16_1_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+    cdef_filter_16_1 = cdef_filter_16_1_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
-    cdef_filter_16_2 = cdef_filter_16_2_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+    cdef_filter_16_2 = cdef_filter_16_2_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
-    cdef_filter_16_3 = cdef_filter_16_3_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+    cdef_filter_16_3 = cdef_filter_16_3_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
-    cdef_filter_8_0 = cdef_filter_8_0_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+    cdef_filter_8_0 = cdef_filter_8_0_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
-    cdef_filter_8_1 = cdef_filter_8_1_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+    cdef_filter_8_1 = cdef_filter_8_1_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
-    cdef_filter_8_2 = cdef_filter_8_2_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+    cdef_filter_8_2 = cdef_filter_8_2_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
-    cdef_filter_8_3 = cdef_filter_8_3_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+    cdef_filter_8_3 = cdef_filter_8_3_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
-    cdef_find_dir = cdef_find_dir_sse2;
-    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    cdef_find_dir = cdef_find_dir_c;
     if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
-    cdef_find_dir_dual = cdef_find_dir_dual_sse2;
-    if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+    cdef_find_dir_dual = cdef_find_dir_dual_c;
     if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
     cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
diff --git a/media/libaom/config/win/ia32/config/aom_config.asm b/media/libaom/config/win/ia32/config/aom_config.asm
index af78328283..8f6c3592fa 100644
--- a/media/libaom/config/win/ia32/config/aom_config.asm
+++ b/media/libaom/config/win/ia32/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/win/ia32/config/aom_config.h b/media/libaom/config/win/ia32/config/aom_config.h
index dba805b1b6..7d1ce61373 100644
--- a/media/libaom/config/win/ia32/config/aom_config.h
+++ b/media/libaom/config/win/ia32/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
index a19adf5f61..93472f0e92 100644
--- a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,6 +912,7 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
 
 unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
@@ -5130,7 +5140,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 RTCD_EXTERN unsigned int (*aom_sad16x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5466,7 +5477,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
 #define aom_sad_skip_16x4 aom_sad_skip_16x4_c
 
 void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5867,243 +5879,199 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui
 #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c
 
 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
@@ -6326,11 +6294,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
@@ -6360,14 +6323,19 @@ static void setup_rtcd_internal(void)
     aom_comp_mask_pred = aom_comp_mask_pred_c;
     if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
     if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
     aom_compute_flow_at_point = aom_compute_flow_at_point_c;
     if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
     aom_convolve8_horiz = aom_convolve8_horiz_c;
-    if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
     aom_convolve8_vert = aom_convolve8_vert_c;
-    if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
     if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
     aom_convolve_copy = aom_convolve_copy_c;
@@ -6768,6 +6736,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
     aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_c;
     if (flags & HAS_SSE2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
     aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_c;
     if (flags & HAS_SSE2) aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_sse2;
     aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
@@ -8526,6 +8495,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) aom_sad16x4 = aom_sad16x4_sse2;
     aom_sad16x4_avg = aom_sad16x4_avg_c;
     if (flags & HAS_SSE2) aom_sad16x4_avg = aom_sad16x4_avg_sse2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
     aom_sad16x4x4d = aom_sad16x4x4d_c;
     if (flags & HAS_SSE2) aom_sad16x4x4d = aom_sad16x4x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
@@ -8695,6 +8666,8 @@ static void setup_rtcd_internal(void)
     aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_c;
     if (flags & HAS_SSE2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
     aom_sad_skip_16x64 = aom_sad_skip_16x64_c;
     if (flags & HAS_SSE2) aom_sad_skip_16x64 = aom_sad_skip_16x64_sse2;
     aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_c;
@@ -8897,157 +8870,113 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
     if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
     aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
     aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
     aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
     aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
     aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
     aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
     aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
     aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
     aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
     aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
     aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
     aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
     aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
     aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
     aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
     aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
     aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
     aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
     aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
     aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
     aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
     aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
     aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
     aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
     aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
     aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
     aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
     aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
     aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
     aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
     aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
     aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
     aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
     aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
     aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
     aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
     aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
     aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
     aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
     aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
     aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
     aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
     aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
     aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
     aom_subtract_block = aom_subtract_block_c;
     if (flags & HAS_SSE2) aom_subtract_block = aom_subtract_block_sse2;
@@ -9172,9 +9101,6 @@ static void setup_rtcd_internal(void)
     aom_vector_var = aom_vector_var_c;
     if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
     if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
-    av1_compute_cross_correlation = av1_compute_cross_correlation_c;
-    if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
-    if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
 }
 #endif
 
diff --git a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
+++ b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/win/ia32/config/av1_rtcd.h b/media/libaom/config/win/ia32/config/av1_rtcd.h
index 3f404f61c8..37716517bf 100644
--- a/media/libaom/config/win/ia32/config/av1_rtcd.h
+++ b/media/libaom/config/win/ia32/config/av1_rtcd.h
@@ -265,7 +265,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
 #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
 
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -764,84 +763,72 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
 
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 
 void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 
 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 
 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -969,7 +956,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
     if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
     av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
-    if (flags & HAS_SSE2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
     if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
     if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
     av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_c;
@@ -1176,62 +1162,50 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
     if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
     cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
-    if (flags & HAS_SSE2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
     if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
     cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
-    if (flags & HAS_SSE2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
     if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
     cdef_filter_16_0 = cdef_filter_16_0_c;
-    if (flags & HAS_SSE2) cdef_filter_16_0 = cdef_filter_16_0_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
     cdef_filter_16_1 = cdef_filter_16_1_c;
-    if (flags & HAS_SSE2) cdef_filter_16_1 = cdef_filter_16_1_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
     cdef_filter_16_2 = cdef_filter_16_2_c;
-    if (flags & HAS_SSE2) cdef_filter_16_2 = cdef_filter_16_2_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
     cdef_filter_16_3 = cdef_filter_16_3_c;
-    if (flags & HAS_SSE2) cdef_filter_16_3 = cdef_filter_16_3_sse2;
     if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
     cdef_filter_8_0 = cdef_filter_8_0_c;
-    if (flags & HAS_SSE2) cdef_filter_8_0 = cdef_filter_8_0_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
     cdef_filter_8_1 = cdef_filter_8_1_c;
-    if (flags & HAS_SSE2) cdef_filter_8_1 = cdef_filter_8_1_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
     cdef_filter_8_2 = cdef_filter_8_2_c;
-    if (flags & HAS_SSE2) cdef_filter_8_2 = cdef_filter_8_2_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
     cdef_filter_8_3 = cdef_filter_8_3_c;
-    if (flags & HAS_SSE2) cdef_filter_8_3 = cdef_filter_8_3_sse2;
     if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
     if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
     cdef_find_dir = cdef_find_dir_c;
-    if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
     if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
     if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
     cdef_find_dir_dual = cdef_find_dir_dual_c;
-    if (flags & HAS_SSE2) cdef_find_dir_dual = cdef_find_dir_dual_sse2;
     if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
     if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
diff --git a/media/libaom/config/win/x64/config/aom_config.asm b/media/libaom/config/win/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/win/x64/config/aom_config.asm
+++ b/media/libaom/config/win/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/win/x64/config/aom_config.h b/media/libaom/config/win/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/win/x64/config/aom_config.h
+++ b/media/libaom/config/win/x64/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
 
 unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 
 unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
 #define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 #define aom_sad16x4_avg aom_sad16x4_avg_sse2
 
 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
 #define aom_sad_skip_16x4 aom_sad_skip_16x4_c
 
 void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
 #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
 
 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
 uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
     aom_comp_mask_pred = aom_comp_mask_pred_c;
     if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
     if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
     aom_compute_flow_at_point = aom_compute_flow_at_point_c;
     if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
-    aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+    aom_convolve8_horiz = aom_convolve8_horiz_c;
     if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
-    aom_convolve8_vert = aom_convolve8_vert_sse2;
+    aom_convolve8_vert = aom_convolve8_vert_c;
     if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
     aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
     aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+    aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
     aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
     if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
     aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
     aom_sad16x32x4d = aom_sad16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
     aom_sad16x4x4d = aom_sad16x4x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
     aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
     aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
     aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
     aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
     aom_sse = aom_sse_c;
     if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
     if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
-    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
-    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
-    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
-    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
-    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
-    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
-    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
-    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
-    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
-    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
-    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
-    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
-    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
-    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
-    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
-    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
-    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
-    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
-    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
-    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
-    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
-    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
-    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
-    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
-    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
-    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
-    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
-    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
-    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
-    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
-    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
-    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
-    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
-    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
-    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
-    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
-    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
-    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
-    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
-    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
-    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
-    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
-    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
-    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
     if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
     aom_subtract_block = aom_subtract_block_sse2;
     if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
     aom_vector_var = aom_vector_var_c;
     if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
     if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
-    av1_compute_cross_correlation = av1_compute_cross_correlation_c;
-    if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
-    if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
 }
 #endif
 
diff --git a/media/libaom/config/win/x64/config/aom_scale_rtcd.h b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/win/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/win/x64/config/av1_rtcd.h b/media/libaom/config/win/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/win/x64/config/av1_rtcd.h
+++ b/media/libaom/config/win/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
 #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
 
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 
 void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
 
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 
 void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 
 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 
 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 
 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
     av1_convolve_y_sr = av1_convolve_y_sr_sse2;
     if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
-    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+    av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
     if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
     if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
     av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
     av1_inv_txfm_add = av1_inv_txfm_add_c;
     if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
     if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
-    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+    av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
     if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
     if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
     av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
     av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
     if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
-    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+    cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
-    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
-    if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+    cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
     if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
     if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
-    cdef_filter_16_0 = cdef_filter_16_0_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+    cdef_filter_16_0 = cdef_filter_16_0_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
-    cdef_filter_16_1 = cdef_filter_16_1_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+    cdef_filter_16_1 = cdef_filter_16_1_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
-    cdef_filter_16_2 = cdef_filter_16_2_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+    cdef_filter_16_2 = cdef_filter_16_2_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
-    cdef_filter_16_3 = cdef_filter_16_3_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+    cdef_filter_16_3 = cdef_filter_16_3_c;
     if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
-    cdef_filter_8_0 = cdef_filter_8_0_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+    cdef_filter_8_0 = cdef_filter_8_0_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
-    cdef_filter_8_1 = cdef_filter_8_1_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+    cdef_filter_8_1 = cdef_filter_8_1_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
-    cdef_filter_8_2 = cdef_filter_8_2_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+    cdef_filter_8_2 = cdef_filter_8_2_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
-    cdef_filter_8_3 = cdef_filter_8_3_sse2;
-    if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+    cdef_filter_8_3 = cdef_filter_8_3_c;
     if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
     if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
-    cdef_find_dir = cdef_find_dir_sse2;
-    if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+    cdef_find_dir = cdef_find_dir_c;
     if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
-    cdef_find_dir_dual = cdef_find_dir_dual_sse2;
-    if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+    cdef_find_dir_dual = cdef_find_dir_dual_c;
     if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
     if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
     cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
diff --git a/media/libaom/moz.yaml b/media/libaom/moz.yaml
index b06ee5115a..a37ab1e904 100644
--- a/media/libaom/moz.yaml
+++ b/media/libaom/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 11631186b36e96afce18808ebebb17cc23a010ef (Fri Jan 19 23:29:34 2024 +0000).
+  release: 879d14159441796c92f3bbba7f8965e1bcf320ca (Tue Apr 02 21:57:54 2024 +0000).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 11631186b36e96afce18808ebebb17cc23a010ef
+  revision: 879d14159441796c92f3bbba7f8965e1bcf320ca
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -54,3 +54,7 @@ vendoring:
     - action: run-script
       script: '{yaml_dir}/generate_sources_mozbuild.sh'
       cwd: '{yaml_dir}'
+
+  patches:
+    - 0001-errno.patch
+    - 0002-mmloadusi64.patch
diff --git a/media/libaom/sources.mozbuild b/media/libaom/sources.mozbuild
index b29ddd5c97..187bf97f8a 100644
--- a/media/libaom/sources.mozbuild
+++ b/media/libaom/sources.mozbuild
@@ -506,6 +506,7 @@ files = {
     '../../third_party/aom/aom_dsp/flow_estimation/ransac.c',
     '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c',
     '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c',
+    '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c',
     '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c',
     '../../third_party/aom/aom_dsp/fwd_txfm.c',
     '../../third_party/aom/aom_dsp/grain_table.c',
@@ -533,11 +534,8 @@ files = {
     '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/aom_quantize_avx.c',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
-    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
-    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm',
-    '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm',
     '../../third_party/aom/aom_dsp/x86/avg_intrin_avx2.c',
     '../../third_party/aom/aom_dsp/x86/avg_intrin_sse2.c',
@@ -599,7 +597,7 @@ files = {
     '../../third_party/aom/aom_dsp/x86/sad_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/sse_avx2.c',
     '../../third_party/aom/aom_dsp/x86/sse_sse4.c',
-    '../../third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm',
     '../../third_party/aom/aom_dsp/x86/subtract_avx2.c',
     '../../third_party/aom/aom_dsp/x86/subtract_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/sum_squares_avx2.c',
@@ -658,7 +656,6 @@ files = {
     '../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c',
     '../../third_party/aom/av1/common/x86/av1_txfm_sse4.c',
     '../../third_party/aom/av1/common/x86/cdef_block_avx2.c',
-    '../../third_party/aom/av1/common/x86/cdef_block_sse2.c',
     '../../third_party/aom/av1/common/x86/cdef_block_sse4.c',
     '../../third_party/aom/av1/common/x86/cdef_block_ssse3.c',
     '../../third_party/aom/av1/common/x86/cfl_avx2.c',
@@ -859,6 +856,7 @@ files = {
     '../../third_party/aom/aom_dsp/flow_estimation/ransac.c',
     '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c',
     '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c',
+    '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c',
     '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c',
     '../../third_party/aom/aom_dsp/fwd_txfm.c',
     '../../third_party/aom/aom_dsp/grain_table.c',
@@ -886,11 +884,8 @@ files = {
     '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/aom_quantize_avx.c',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
-    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
-    '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm',
-    '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm',
     '../../third_party/aom/aom_dsp/x86/avg_intrin_avx2.c',
     '../../third_party/aom/aom_dsp/x86/avg_intrin_sse2.c',
@@ -955,7 +950,7 @@ files = {
     '../../third_party/aom/aom_dsp/x86/sse_avx2.c',
     '../../third_party/aom/aom_dsp/x86/sse_sse4.c',
     '../../third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm',
-    '../../third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm',
+    '../../third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm',
     '../../third_party/aom/aom_dsp/x86/subtract_avx2.c',
     '../../third_party/aom/aom_dsp/x86/subtract_sse2.asm',
     '../../third_party/aom/aom_dsp/x86/sum_squares_avx2.c',
@@ -1014,9 +1009,7 @@ files = {
     '../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c',
     '../../third_party/aom/av1/common/x86/av1_txfm_sse4.c',
     '../../third_party/aom/av1/common/x86/cdef_block_avx2.c',
-    '../../third_party/aom/av1/common/x86/cdef_block_sse2.c',
     '../../third_party/aom/av1/common/x86/cdef_block_sse4.c',
-    '../../third_party/aom/av1/common/x86/cdef_block_ssse3.c',
     '../../third_party/aom/av1/common/x86/cfl_avx2.c',
     '../../third_party/aom/av1/common/x86/cfl_sse2.c',
     '../../third_party/aom/av1/common/x86/cfl_ssse3.c',
diff --git a/media/libcubeb/0004-audiounit-ios-compile-fixes.patch b/media/libcubeb/0004-audiounit-ios-compile-fixes.patch
new file mode 100644
index 0000000000..465ae0f98a
--- /dev/null
+++ b/media/libcubeb/0004-audiounit-ios-compile-fixes.patch
@@ -0,0 +1,1415 @@
+diff --git a/src/cubeb_audiounit.cpp b/src/cubeb_audiounit.cpp
+--- a/src/cubeb_audiounit.cpp
++++ b/src/cubeb_audiounit.cpp
+@@ -36,16 +36,25 @@
+ #include <vector>
+ 
+ using namespace std;
+ 
+ #if MAC_OS_X_VERSION_MIN_REQUIRED < 101000
+ typedef UInt32 AudioFormatFlags;
+ #endif
+ 
++#if TARGET_OS_IPHONE
++typedef UInt32 AudioDeviceID;
++typedef UInt32 AudioObjectID;
++const UInt32 kAudioObjectUnknown = 0;
++
++#define AudioGetCurrentHostTime mach_absolute_time
++
++#endif
++
+ #define AU_OUT_BUS 0
+ #define AU_IN_BUS 1
+ 
+ const char * DISPATCH_QUEUE_LABEL = "org.mozilla.cubeb";
+ const char * PRIVATE_AGGREGATE_DEVICE_NAME = "CubebAggregateDevice";
+ 
+ #ifdef ALOGV
+ #undef ALOGV
+@@ -60,45 +69,47 @@ const char * PRIVATE_AGGREGATE_DEVICE_NA
+ #undef ALOG
+ #endif
+ #define ALOG(msg, ...)                                                         \
+   dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_HIGH, 0),   \
+                  ^{                                                            \
+                    LOG(msg, ##__VA_ARGS__);                                    \
+                  })
+ 
++#if !TARGET_OS_IPHONE
+ /* Testing empirically, some headsets report a minimal latency that is very
+  * low, but this does not work in practice. Lie and say the minimum is 256
+  * frames. */
+ const uint32_t SAFE_MIN_LATENCY_FRAMES = 128;
+ const uint32_t SAFE_MAX_LATENCY_FRAMES = 512;
+ 
+ const AudioObjectPropertyAddress DEFAULT_INPUT_DEVICE_PROPERTY_ADDRESS = {
+     kAudioHardwarePropertyDefaultInputDevice, kAudioObjectPropertyScopeGlobal,
+-    kAudioObjectPropertyElementMaster};
++    kAudioObjectPropertyElementMain};
+ 
+ const AudioObjectPropertyAddress DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS = {
+     kAudioHardwarePropertyDefaultOutputDevice, kAudioObjectPropertyScopeGlobal,
+-    kAudioObjectPropertyElementMaster};
++    kAudioObjectPropertyElementMain};
+ 
+ const AudioObjectPropertyAddress DEVICE_IS_ALIVE_PROPERTY_ADDRESS = {
+     kAudioDevicePropertyDeviceIsAlive, kAudioObjectPropertyScopeGlobal,
+-    kAudioObjectPropertyElementMaster};
++    kAudioObjectPropertyElementMain};
+ 
+ const AudioObjectPropertyAddress DEVICES_PROPERTY_ADDRESS = {
+     kAudioHardwarePropertyDevices, kAudioObjectPropertyScopeGlobal,
+-    kAudioObjectPropertyElementMaster};
++    kAudioObjectPropertyElementMain};
+ 
+ const AudioObjectPropertyAddress INPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
+     kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeInput,
+-    kAudioObjectPropertyElementMaster};
++    kAudioObjectPropertyElementMain};
+ 
+ const AudioObjectPropertyAddress OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
+     kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeOutput,
+-    kAudioObjectPropertyElementMaster};
++    kAudioObjectPropertyElementMain};
++#endif
+ 
+ typedef uint32_t device_flags_value;
+ 
+ enum device_flags {
+   DEV_UNKNOWN = 0x00,        /* Unknown */
+   DEV_INPUT = 0x01,          /* Record device like mic */
+   DEV_OUTPUT = 0x02,         /* Playback device like speakers */
+   DEV_SYSTEM_DEFAULT = 0x04, /* System default device */
+@@ -109,49 +120,51 @@ enum device_flags {
+ void
+ audiounit_stream_stop_internal(cubeb_stream * stm);
+ static int
+ audiounit_stream_start_internal(cubeb_stream * stm);
+ static void
+ audiounit_close_stream(cubeb_stream * stm);
+ static int
+ audiounit_setup_stream(cubeb_stream * stm);
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_devices_of_type(cubeb_device_type devtype);
+ static UInt32
+ audiounit_get_device_presentation_latency(AudioObjectID devid,
+                                           AudioObjectPropertyScope scope);
+-
+-#if !TARGET_OS_IPHONE
+ static AudioObjectID
+ audiounit_get_default_device_id(cubeb_device_type type);
+ static int
+ audiounit_uninstall_device_changed_callback(cubeb_stream * stm);
+ static int
+ audiounit_uninstall_system_changed_callback(cubeb_stream * stm);
++#endif
++
+ static void
+ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags);
+-#endif
+ 
+ extern cubeb_ops const audiounit_ops;
+ 
+ struct cubeb {
+   cubeb_ops const * ops = &audiounit_ops;
+   owned_critical_section mutex;
+   int active_streams = 0;
+   uint32_t global_latency_frames = 0;
+   cubeb_device_collection_changed_callback input_collection_changed_callback =
+       nullptr;
+   void * input_collection_changed_user_ptr = nullptr;
+   cubeb_device_collection_changed_callback output_collection_changed_callback =
+       nullptr;
+   void * output_collection_changed_user_ptr = nullptr;
++  #if !TARGET_OS_IPHONE
+   // Store list of devices to detect changes
+   vector<AudioObjectID> input_device_array;
+   vector<AudioObjectID> output_device_array;
++  #endif
+   // The queue should be released when it’s no longer needed.
+   dispatch_queue_t serial_queue =
+       dispatch_queue_create(DISPATCH_QUEUE_LABEL, DISPATCH_QUEUE_SERIAL);
+   // Current used channel layout
+   atomic<cubeb_channel_layout> layout{CUBEB_LAYOUT_UNDEFINED};
+   uint32_t channels = 0;
+ };
+ 
+@@ -181,29 +194,31 @@ to_string(io_side side)
+   }
+ }
+ 
+ struct device_info {
+   AudioDeviceID id = kAudioObjectUnknown;
+   device_flags_value flags = DEV_UNKNOWN;
+ };
+ 
++#if !TARGET_OS_IPHONE
+ struct property_listener {
+   AudioDeviceID device_id;
+   const AudioObjectPropertyAddress * property_address;
+   AudioObjectPropertyListenerProc callback;
+   cubeb_stream * stream;
+ 
+   property_listener(AudioDeviceID id,
+                     const AudioObjectPropertyAddress * address,
+                     AudioObjectPropertyListenerProc proc, cubeb_stream * stm)
+       : device_id(id), property_address(address), callback(proc), stream(stm)
+   {
+   }
+ };
++#endif
+ 
+ struct cubeb_stream {
+   explicit cubeb_stream(cubeb * context);
+ 
+   /* Note: Must match cubeb_stream layout in cubeb.c. */
+   cubeb * context;
+   void * user_ptr = nullptr;
+   /**/
+@@ -252,32 +267,36 @@ struct cubeb_stream {
+   /* Latency requested by the user. */
+   uint32_t latency_frames = 0;
+   atomic<uint32_t> current_latency_frames{0};
+   atomic<uint32_t> total_output_latency_frames{0};
+   unique_ptr<cubeb_resampler, decltype(&cubeb_resampler_destroy)> resampler;
+   /* This is true if a device change callback is currently running.  */
+   atomic<bool> switching_device{false};
+   atomic<bool> buffer_size_change_state{false};
++  #if !TARGET_OS_IPHONE
+   AudioDeviceID aggregate_device_id =
+       kAudioObjectUnknown; // the aggregate device id
+   AudioObjectID plugin_id =
+       kAudioObjectUnknown; // used to create aggregate device
++  #endif
+   /* Mixer interface */
+   unique_ptr<cubeb_mixer, decltype(&cubeb_mixer_destroy)> mixer;
+   /* Buffer where remixing/resampling will occur when upmixing is required */
+   /* Only accessed from callback thread */
+   unique_ptr<uint8_t[]> temp_buffer;
+   size_t temp_buffer_size = 0; // size in bytes.
++  #if !TARGET_OS_IPHONE
+   /* Listeners indicating what system events are monitored. */
+   unique_ptr<property_listener> default_input_listener;
+   unique_ptr<property_listener> default_output_listener;
+   unique_ptr<property_listener> input_alive_listener;
+   unique_ptr<property_listener> input_source_listener;
+   unique_ptr<property_listener> output_source_listener;
++  #endif
+ };
+ 
+ bool
+ has_input(cubeb_stream * stm)
+ {
+   return stm->input_stream_params.rate != 0;
+ }
+ 
+@@ -381,24 +400,16 @@ bool
+ is_common_sample_rate(Float64 sample_rate)
+ {
+   /* Some commonly used sample rates and their multiples and divisors. */
+   return sample_rate == 8000 || sample_rate == 16000 || sample_rate == 22050 ||
+          sample_rate == 32000 || sample_rate == 44100 || sample_rate == 48000 ||
+          sample_rate == 88200 || sample_rate == 96000;
+ }
+ 
+-#if TARGET_OS_IPHONE
+-typedef UInt32 AudioDeviceID;
+-typedef UInt32 AudioObjectID;
+-
+-#define AudioGetCurrentHostTime mach_absolute_time
+-
+-#endif
+-
+ uint64_t
+ ConvertHostTimeToNanos(uint64_t host_time)
+ {
+   static struct mach_timebase_info timebase_info;
+   static bool initialized = false;
+   if (!initialized) {
+     mach_timebase_info(&timebase_info);
+     initialized = true;
+@@ -756,23 +767,23 @@ audiounit_init(cubeb ** context, char co
+ }
+ 
+ static char const *
+ audiounit_get_backend_id(cubeb * /* ctx */)
+ {
+   return "audiounit";
+ }
+ 
+-#if !TARGET_OS_IPHONE
+ 
+ static int
+ audiounit_stream_get_volume(cubeb_stream * stm, float * volume);
+ static int
+ audiounit_stream_set_volume(cubeb_stream * stm, float volume);
+ 
++#if !TARGET_OS_IPHONE
+ static int
+ audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side)
+ {
+   assert(stm);
+ 
+   device_info * info = nullptr;
+   cubeb_device_type type = CUBEB_DEVICE_TYPE_UNKNOWN;
+ 
+@@ -806,42 +817,47 @@ audiounit_set_device_info(cubeb_stream *
+   }
+ 
+   assert(info->id);
+   assert(info->flags & DEV_INPUT && !(info->flags & DEV_OUTPUT) ||
+          !(info->flags & DEV_INPUT) && info->flags & DEV_OUTPUT);
+ 
+   return CUBEB_OK;
+ }
++#endif
+ 
+ static int
+ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
+ {
+   auto_lock context_lock(stm->context->mutex);
+   assert((flags & DEV_INPUT && stm->input_unit) ||
+          (flags & DEV_OUTPUT && stm->output_unit));
+   if (!stm->shutdown) {
+     audiounit_stream_stop_internal(stm);
+   }
+ 
+-  int r = audiounit_uninstall_device_changed_callback(stm);
++  int r;
++#if !TARGET_OS_IPHONE
++  r = audiounit_uninstall_device_changed_callback(stm);
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not uninstall all device change listeners.", stm);
+   }
++#endif
+ 
+   {
+     auto_lock lock(stm->mutex);
+     float volume = 0.0;
+     int vol_rv = CUBEB_ERROR;
+     if (stm->output_unit) {
+       vol_rv = audiounit_stream_get_volume(stm, &volume);
+     }
+ 
+     audiounit_close_stream(stm);
+ 
++    #if !TARGET_OS_IPHONE
+     /* Reinit occurs in one of the following case:
+      * - When the device is not alive any more
+      * - When the default system device change.
+      * - The bluetooth device changed from A2DP to/from HFP/HSP profile
+      * We first attempt to re-use the same device id, should that fail we will
+      * default to the (potentially new) default device. */
+     AudioDeviceID input_device =
+         flags & DEV_INPUT ? stm->input_device.id : kAudioObjectUnknown;
+@@ -861,29 +877,33 @@ audiounit_reinit_stream(cubeb_stream * s
+     r = audiounit_set_device_info(stm, kAudioObjectUnknown, io_side::OUTPUT);
+     if (r != CUBEB_OK) {
+       LOG("(%p) Set output device info failed. This can happen when last media "
+           "device is unplugged",
+           stm);
+       return CUBEB_ERROR;
+     }
+ 
++    #endif
++
+     if (audiounit_setup_stream(stm) != CUBEB_OK) {
+       LOG("(%p) Stream reinit failed.", stm);
++      #if !TARGET_OS_IPHONE
+       if (flags & DEV_INPUT && input_device != kAudioObjectUnknown) {
+         // Attempt to re-use the same device-id failed, so attempt again with
+         // default input device.
+         audiounit_close_stream(stm);
+         if (audiounit_set_device_info(stm, kAudioObjectUnknown,
+                                       io_side::INPUT) != CUBEB_OK ||
+             audiounit_setup_stream(stm) != CUBEB_OK) {
+           LOG("(%p) Second stream reinit failed.", stm);
+           return CUBEB_ERROR;
+         }
+       }
++      #endif
+     }
+ 
+     if (vol_rv == CUBEB_OK) {
+       audiounit_stream_set_volume(stm, volume);
+     }
+ 
+     // If the stream was running, start it again.
+     if (!stm->shutdown) {
+@@ -909,27 +929,30 @@ audiounit_reinit_stream_async(cubeb_stre
+   // Get/SetProperties method from inside notify callback
+   dispatch_async(stm->context->serial_queue, ^() {
+     if (stm->destroy_pending) {
+       ALOG("(%p) stream pending destroy, cancelling reinit task", stm);
+       return;
+     }
+ 
+     if (audiounit_reinit_stream(stm, flags) != CUBEB_OK) {
++      #if !TARGET_OS_IPHONE
+       if (audiounit_uninstall_system_changed_callback(stm) != CUBEB_OK) {
+         LOG("(%p) Could not uninstall system changed callback", stm);
+       }
++      #endif
+       stm->state_callback(stm, stm->user_ptr, CUBEB_STATE_ERROR);
+       LOG("(%p) Could not reopen the stream after switching.", stm);
+     }
+     stm->switching_device = false;
+     stm->reinit_pending = false;
+   });
+ }
+ 
++#if !TARGET_OS_IPHONE
+ static char const *
+ event_addr_to_string(AudioObjectPropertySelector selector)
+ {
+   switch (selector) {
+   case kAudioHardwarePropertyDefaultOutputDevice:
+     return "kAudioHardwarePropertyDefaultOutputDevice";
+   case kAudioHardwarePropertyDefaultInputDevice:
+     return "kAudioHardwarePropertyDefaultInputDevice";
+@@ -1091,16 +1114,17 @@ audiounit_install_device_changed_callbac
+           rv, stm->input_device.id);
+       r = CUBEB_ERROR;
+     }
+   }
+ 
+   return r;
+ }
+ 
++#if !TARGET_OS_IPHONE
+ static int
+ audiounit_install_system_changed_callback(cubeb_stream * stm)
+ {
+   OSStatus r;
+ 
+   if (stm->output_unit) {
+     /* This event will notify us when the default audio device changes,
+      * for example when the user plugs in a USB headset and the system chooses
+@@ -1131,16 +1155,17 @@ audiounit_install_system_changed_callbac
+           "kAudioHardwarePropertyDefaultInputDevice rv=%d",
+           r);
+       return CUBEB_ERROR;
+     }
+   }
+ 
+   return CUBEB_OK;
+ }
++#endif
+ 
+ static int
+ audiounit_uninstall_device_changed_callback(cubeb_stream * stm)
+ {
+   OSStatus rv;
+   // Failing to uninstall listeners is not a fatal error.
+   int r = CUBEB_OK;
+ 
+@@ -1207,17 +1232,17 @@ audiounit_uninstall_system_changed_callb
+ static int
+ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range)
+ {
+   UInt32 size;
+   OSStatus r;
+   AudioDeviceID output_device_id;
+   AudioObjectPropertyAddress output_device_buffer_size_range = {
+       kAudioDevicePropertyBufferFrameSizeRange, kAudioDevicePropertyScopeOutput,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+ 
+   output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+   if (output_device_id == kAudioObjectUnknown) {
+     LOG("Could not get default output device id.");
+     return CUBEB_ERROR;
+   }
+ 
+   /* Get the buffer size range this device supports */
+@@ -1228,17 +1253,16 @@ audiounit_get_acceptable_latency_range(A
+                                  &size, latency_range);
+   if (r != noErr) {
+     LOG("AudioObjectGetPropertyData/buffer size range rv=%d", r);
+     return CUBEB_ERROR;
+   }
+ 
+   return CUBEB_OK;
+ }
+-#endif /* !TARGET_OS_IPHONE */
+ 
+ static AudioObjectID
+ audiounit_get_default_device_id(cubeb_device_type type)
+ {
+   const AudioObjectPropertyAddress * adr;
+   if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
+     adr = &DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS;
+   } else if (type == CUBEB_DEVICE_TYPE_INPUT) {
+@@ -1251,31 +1275,32 @@ audiounit_get_default_device_id(cubeb_de
+   UInt32 size = sizeof(AudioDeviceID);
+   if (AudioObjectGetPropertyData(kAudioObjectSystemObject, adr, 0, NULL, &size,
+                                  &devid) != noErr) {
+     return kAudioObjectUnknown;
+   }
+ 
+   return devid;
+ }
++#endif /* !TARGET_OS_IPHONE */
+ 
+ int
+ audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels)
+ {
+ #if TARGET_OS_IPHONE
+   // TODO: [[AVAudioSession sharedInstance] maximumOutputNumberOfChannels]
+   *max_channels = 2;
+ #else
+   UInt32 size;
+   OSStatus r;
+   AudioDeviceID output_device_id;
+   AudioStreamBasicDescription stream_format;
+   AudioObjectPropertyAddress stream_format_address = {
+       kAudioDevicePropertyStreamFormat, kAudioDevicePropertyScopeOutput,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+ 
+   assert(ctx && max_channels);
+ 
+   output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+   if (output_device_id == kAudioObjectUnknown) {
+     return CUBEB_ERROR;
+   }
+ 
+@@ -1304,52 +1329,52 @@ audiounit_get_min_latency(cubeb * /* ctx
+   AudioValueRange latency_range;
+   if (audiounit_get_acceptable_latency_range(&latency_range) != CUBEB_OK) {
+     LOG("Could not get acceptable latency range.");
+     return CUBEB_ERROR;
+   }
+ 
+   *latency_frames =
+       max<uint32_t>(latency_range.mMinimum, SAFE_MIN_LATENCY_FRAMES);
++  return CUBEB_OK;
+ #endif
+-
+-  return CUBEB_OK;
+ }
+ 
+ static int
+ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
+ {
+ #if TARGET_OS_IPHONE
+-  // TODO
+-  return CUBEB_ERROR_NOT_SUPPORTED;
++  *rate = 44100;
++  return CUBEB_OK;
+ #else
+   UInt32 size;
+   OSStatus r;
+   Float64 fsamplerate;
+   AudioDeviceID output_device_id;
+   AudioObjectPropertyAddress samplerate_address = {
+       kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+ 
+   output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+   if (output_device_id == kAudioObjectUnknown) {
+     return CUBEB_ERROR;
+   }
+ 
+   size = sizeof(fsamplerate);
+   r = AudioObjectGetPropertyData(output_device_id, &samplerate_address, 0, NULL,
+                                  &size, &fsamplerate);
+ 
+   if (r != noErr) {
+     return CUBEB_ERROR;
+   }
+ 
+   *rate = static_cast<uint32_t>(fsamplerate);
++
++  return CUBEB_OK;
+ #endif
+-  return CUBEB_OK;
+ }
+ 
+ static cubeb_channel_layout
+ audiounit_convert_channel_layout(AudioChannelLayout * layout)
+ {
+   // When having one or two channel, force mono or stereo. Some devices (namely,
+   // Bose QC35, mark 1 and 2), expose a single channel mapped to the right for
+   // some reason.
+@@ -1380,16 +1405,19 @@ audiounit_convert_channel_layout(AudioCh
+   }
+ 
+   return cl;
+ }
+ 
+ static cubeb_channel_layout
+ audiounit_get_preferred_channel_layout(AudioUnit output_unit)
+ {
++  #if TARGET_OS_IPHONE
++    return CUBEB_LAYOUT_STEREO;
++  #else
+   OSStatus rv = noErr;
+   UInt32 size = 0;
+   rv = AudioUnitGetPropertyInfo(
+       output_unit, kAudioDevicePropertyPreferredChannelLayout,
+       kAudioUnitScope_Output, AU_OUT_BUS, &size, nullptr);
+   if (rv != noErr) {
+     LOG("AudioUnitGetPropertyInfo/kAudioDevicePropertyPreferredChannelLayout "
+         "rv=%d",
+@@ -1404,16 +1432,17 @@ audiounit_get_preferred_channel_layout(A
+       kAudioUnitScope_Output, AU_OUT_BUS, layout.get(), &size);
+   if (rv != noErr) {
+     LOG("AudioUnitGetProperty/kAudioDevicePropertyPreferredChannelLayout rv=%d",
+         rv);
+     return CUBEB_LAYOUT_UNDEFINED;
+   }
+ 
+   return audiounit_convert_channel_layout(layout.get());
++  #endif
+ }
+ 
+ static cubeb_channel_layout
+ audiounit_get_current_channel_layout(AudioUnit output_unit)
+ {
+   OSStatus rv = noErr;
+   UInt32 size = 0;
+   rv = AudioUnitGetPropertyInfo(
+@@ -1437,18 +1466,20 @@ audiounit_get_current_channel_layout(Aud
+   }
+ 
+   return audiounit_convert_channel_layout(layout.get());
+ }
+ 
+ static int
+ audiounit_create_unit(AudioUnit * unit, device_info * device);
+ 
++#if !TARGET_OS_IPHONE
+ static OSStatus
+ audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype);
++#endif
+ 
+ static void
+ audiounit_destroy(cubeb * ctx)
+ {
+   {
+     auto_lock lock(ctx->mutex);
+ 
+     // Disabling this assert for bug 1083664 -- we seem to leak a stream
+@@ -1460,23 +1491,25 @@ audiounit_destroy(cubeb * ctx)
+ 
+     // Destroying a cubeb context with device collection callbacks registered
+     // is misuse of the API, assert then attempt to clean up.
+     assert(!ctx->input_collection_changed_callback &&
+            !ctx->input_collection_changed_user_ptr &&
+            !ctx->output_collection_changed_callback &&
+            !ctx->output_collection_changed_user_ptr);
+ 
++    #if !TARGET_OS_IPHONE
+     /* Unregister the callback if necessary. */
+     if (ctx->input_collection_changed_callback) {
+       audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_INPUT);
+     }
+     if (ctx->output_collection_changed_callback) {
+       audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_OUTPUT);
+     }
++    #endif
+   }
+ 
+   dispatch_release(ctx->serial_queue);
+ 
+   delete ctx;
+ }
+ 
+ static void
+@@ -1594,23 +1627,24 @@ audiounit_layout_init(cubeb_stream * stm
+   }
+ 
+   stm->context->layout = audiounit_get_current_channel_layout(stm->output_unit);
+ 
+   audiounit_set_channel_layout(stm->output_unit, io_side::OUTPUT,
+                                stm->context->layout);
+ }
+ 
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_sub_devices(AudioDeviceID device_id)
+ {
+   vector<AudioDeviceID> sub_devices;
+   AudioObjectPropertyAddress property_address = {
+       kAudioAggregateDevicePropertyActiveSubDeviceList,
+-      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
+   UInt32 size = 0;
+   OSStatus rv = AudioObjectGetPropertyDataSize(device_id, &property_address, 0,
+                                                nullptr, &size);
+ 
+   if (rv != noErr) {
+     sub_devices.push_back(device_id);
+     return sub_devices;
+   }
+@@ -1629,17 +1663,17 @@ audiounit_get_sub_devices(AudioDeviceID 
+ }
+ 
+ static int
+ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id,
+                                         AudioDeviceID * aggregate_device_id)
+ {
+   AudioObjectPropertyAddress address_plugin_bundle_id = {
+       kAudioHardwarePropertyPlugInForBundleID, kAudioObjectPropertyScopeGlobal,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+   UInt32 size = 0;
+   OSStatus r = AudioObjectGetPropertyDataSize(
+       kAudioObjectSystemObject, &address_plugin_bundle_id, 0, NULL, &size);
+   if (r != noErr) {
+     LOG("AudioObjectGetPropertyDataSize/"
+         "kAudioHardwarePropertyPlugInForBundleID, rv=%d",
+         r);
+     return CUBEB_ERROR;
+@@ -1659,17 +1693,17 @@ audiounit_create_blank_aggregate_device(
+     LOG("AudioObjectGetPropertyData/kAudioHardwarePropertyPlugInForBundleID, "
+         "rv=%d",
+         r);
+     return CUBEB_ERROR;
+   }
+ 
+   AudioObjectPropertyAddress create_aggregate_device_address = {
+       kAudioPlugInCreateAggregateDevice, kAudioObjectPropertyScopeGlobal,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+   r = AudioObjectGetPropertyDataSize(
+       *plugin_id, &create_aggregate_device_address, 0, nullptr, &size);
+   if (r != noErr) {
+     LOG("AudioObjectGetPropertyDataSize/kAudioPlugInCreateAggregateDevice, "
+         "rv=%d",
+         r);
+     return CUBEB_ERROR;
+   }
+@@ -1731,17 +1765,17 @@ audiounit_create_blank_aggregate_device(
+ // object is increased.
+ static CFStringRef
+ get_device_name(AudioDeviceID id)
+ {
+   UInt32 size = sizeof(CFStringRef);
+   CFStringRef UIname = nullptr;
+   AudioObjectPropertyAddress address_uuid = {kAudioDevicePropertyDeviceUID,
+                                              kAudioObjectPropertyScopeGlobal,
+-                                             kAudioObjectPropertyElementMaster};
++                                             kAudioObjectPropertyElementMain};
+   OSStatus err =
+       AudioObjectGetPropertyData(id, &address_uuid, 0, nullptr, &size, &UIname);
+   return (err == noErr) ? UIname : NULL;
+ }
+ 
+ static int
+ audiounit_set_aggregate_sub_device_list(AudioDeviceID aggregate_device_id,
+                                         AudioDeviceID input_device_id,
+@@ -1774,17 +1808,17 @@ audiounit_set_aggregate_sub_device_list(
+       return CUBEB_ERROR;
+     }
+     CFArrayAppendValue(aggregate_sub_devices_array, ref);
+     CFRelease(ref);
+   }
+ 
+   AudioObjectPropertyAddress aggregate_sub_device_list = {
+       kAudioAggregateDevicePropertyFullSubDeviceList,
+-      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
+   UInt32 size = sizeof(CFMutableArrayRef);
+   OSStatus rv = AudioObjectSetPropertyData(
+       aggregate_device_id, &aggregate_sub_device_list, 0, nullptr, size,
+       &aggregate_sub_devices_array);
+   CFRelease(aggregate_sub_devices_array);
+   if (rv != noErr) {
+     LOG("AudioObjectSetPropertyData/"
+         "kAudioAggregateDevicePropertyFullSubDeviceList, rv=%d",
+@@ -1796,17 +1830,17 @@ audiounit_set_aggregate_sub_device_list(
+ }
+ 
+ static int
+ audiounit_set_master_aggregate_device(const AudioDeviceID aggregate_device_id)
+ {
+   assert(aggregate_device_id != kAudioObjectUnknown);
+   AudioObjectPropertyAddress master_aggregate_sub_device = {
+       kAudioAggregateDevicePropertyMasterSubDevice,
+-      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
+ 
+   // Master become the 1st output sub device
+   AudioDeviceID output_device_id =
+       audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+   const vector<AudioDeviceID> output_sub_devices =
+       audiounit_get_sub_devices(output_device_id);
+   CFStringRef master_sub_device = get_device_name(output_sub_devices[0]);
+ 
+@@ -1829,17 +1863,17 @@ audiounit_set_master_aggregate_device(co
+ 
+ static int
+ audiounit_activate_clock_drift_compensation(
+     const AudioDeviceID aggregate_device_id)
+ {
+   assert(aggregate_device_id != kAudioObjectUnknown);
+   AudioObjectPropertyAddress address_owned = {
+       kAudioObjectPropertyOwnedObjects, kAudioObjectPropertyScopeGlobal,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+ 
+   UInt32 qualifier_data_size = sizeof(AudioObjectID);
+   AudioClassID class_id = kAudioSubDeviceClassID;
+   void * qualifier_data = &class_id;
+   UInt32 size = 0;
+   OSStatus rv = AudioObjectGetPropertyDataSize(
+       aggregate_device_id, &address_owned, qualifier_data_size, qualifier_data,
+       &size);
+@@ -1861,17 +1895,17 @@ audiounit_activate_clock_drift_compensat
+   if (rv != noErr) {
+     LOG("AudioObjectGetPropertyData/kAudioObjectPropertyOwnedObjects, rv=%d",
+         rv);
+     return CUBEB_ERROR;
+   }
+ 
+   AudioObjectPropertyAddress address_drift = {
+       kAudioSubDevicePropertyDriftCompensation, kAudioObjectPropertyScopeGlobal,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+ 
+   // Start from the second device since the first is the master clock
+   for (UInt32 i = 1; i < subdevices_num; ++i) {
+     UInt32 drift_compensation_value = 1;
+     rv = AudioObjectSetPropertyData(sub_devices[i], &address_drift, 0, nullptr,
+                                     sizeof(UInt32), &drift_compensation_value);
+     if (rv != noErr) {
+       LOG("AudioObjectSetPropertyData/"
+@@ -1930,17 +1964,17 @@ audiounit_workaround_for_airpod(cubeb_st
+         &output_min_rate, &output_max_rate, &output_nominal_rate);
+     LOG("(%p) Output device %u, name: %s, min: %u, max: %u, nominal rate: %u",
+         stm, stm->output_device.id, output_device_info.friendly_name,
+         output_min_rate, output_max_rate, output_nominal_rate);
+ 
+     Float64 rate = input_nominal_rate;
+     AudioObjectPropertyAddress addr = {kAudioDevicePropertyNominalSampleRate,
+                                        kAudioObjectPropertyScopeGlobal,
+-                                       kAudioObjectPropertyElementMaster};
++                                       kAudioObjectPropertyElementMain};
+ 
+     OSStatus rv = AudioObjectSetPropertyData(stm->aggregate_device_id, &addr, 0,
+                                              nullptr, sizeof(Float64), &rate);
+     if (rv != noErr) {
+       LOG("Non fatal error, "
+           "AudioObjectSetPropertyData/kAudioDevicePropertyNominalSampleRate, "
+           "rv=%d",
+           rv);
+@@ -2014,17 +2048,17 @@ audiounit_create_aggregate_device(cubeb_
+ static int
+ audiounit_destroy_aggregate_device(AudioObjectID plugin_id,
+                                    AudioDeviceID * aggregate_device_id)
+ {
+   assert(aggregate_device_id && *aggregate_device_id != kAudioDeviceUnknown &&
+          plugin_id != kAudioObjectUnknown);
+   AudioObjectPropertyAddress destroy_aggregate_device_addr = {
+       kAudioPlugInDestroyAggregateDevice, kAudioObjectPropertyScopeGlobal,
+-      kAudioObjectPropertyElementMaster};
++      kAudioObjectPropertyElementMain};
+   UInt32 size;
+   OSStatus rv = AudioObjectGetPropertyDataSize(
+       plugin_id, &destroy_aggregate_device_addr, 0, NULL, &size);
+   if (rv != noErr) {
+     LOG("AudioObjectGetPropertyDataSize/kAudioPlugInDestroyAggregateDevice, "
+         "rv=%d",
+         rv);
+     return CUBEB_ERROR;
+@@ -2037,16 +2071,17 @@ audiounit_destroy_aggregate_device(Audio
+         rv);
+     return CUBEB_ERROR;
+   }
+ 
+   LOG("Destroyed aggregate device %d", *aggregate_device_id);
+   *aggregate_device_id = kAudioObjectUnknown;
+   return CUBEB_OK;
+ }
++#endif
+ 
+ static int
+ audiounit_new_unit_instance(AudioUnit * unit, device_info * device)
+ {
+   AudioComponentDescription desc;
+   AudioComponent comp;
+   OSStatus rv;
+ 
+@@ -2173,16 +2208,19 @@ audiounit_init_input_linear_buffer(cubeb
+   assert(stream->input_linear_buffer->length() == 0);
+ 
+   return CUBEB_OK;
+ }
+ 
+ static uint32_t
+ audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames)
+ {
++  #if TARGET_OS_IPHONE
++  return latency_frames;
++  #else
+   // For the 1st stream set anything within safe min-max
+   assert(audiounit_active_streams(stm->context) > 0);
+   if (audiounit_active_streams(stm->context) == 1) {
+     return max(min<uint32_t>(latency_frames, SAFE_MAX_LATENCY_FRAMES),
+                SAFE_MIN_LATENCY_FRAMES);
+   }
+   assert(stm->output_unit);
+ 
+@@ -2233,18 +2271,20 @@ audiounit_clamp_latency(cubeb_stream * s
+   } else if (output_buffer_size != 0) {
+     upper_latency_limit = output_buffer_size;
+   } else {
+     upper_latency_limit = SAFE_MAX_LATENCY_FRAMES;
+   }
+ 
+   return max(min<uint32_t>(latency_frames, upper_latency_limit),
+              SAFE_MIN_LATENCY_FRAMES);
++  #endif
+ }
+ 
++#if !TARGET_OS_IPHONE
+ /*
+  * Change buffer size is prone to deadlock thus we change it
+  * following the steps:
+  * - register a listener for the buffer size property
+  * - change the property
+  * - wait until the listener is executed
+  * - property has changed, remove the listener
+  * */
+@@ -2285,21 +2325,25 @@ buffer_size_changed_callback(void * inCl
+           "= %d for scope %d",
+           stm, au_type, new_buffer_size, inScope);
+     }
+     stm->buffer_size_change_state = true;
+     break;
+   }
+   }
+ }
++#endif
+ 
+ static int
+ audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames,
+                           io_side side)
+ {
++  #if TARGET_OS_IPHONE
++  return CUBEB_OK;
++  #else
+   AudioUnit au = stm->output_unit;
+   AudioUnitScope au_scope = kAudioUnitScope_Input;
+   AudioUnitElement au_element = AU_OUT_BUS;
+ 
+   if (side == io_side::INPUT) {
+     au = stm->input_unit;
+     au_scope = kAudioUnitScope_Output;
+     au_element = AU_IN_BUS;
+@@ -2377,16 +2421,17 @@ audiounit_set_buffer_size(cubeb_stream *
+   if (!stm->buffer_size_change_state && count >= 30) {
+     LOG("(%p) Error, did not get buffer size change callback ...", stm);
+     return CUBEB_ERROR;
+   }
+ 
+   LOG("(%p) %s buffer size changed to %u frames.", stm, to_string(side),
+       new_size_frames);
+   return CUBEB_OK;
++  #endif
+ }
+ 
+ static int
+ audiounit_configure_input(cubeb_stream * stm)
+ {
+   assert(stm && stm->input_unit);
+ 
+   int r = 0;
+@@ -2593,16 +2638,17 @@ audiounit_setup_stream(cubeb_stream * st
+     return CUBEB_ERROR_NOT_SUPPORTED;
+   }
+ 
+   int r = 0;
+ 
+   device_info in_dev_info = stm->input_device;
+   device_info out_dev_info = stm->output_device;
+ 
++  #if !TARGET_OS_IPHONE
+   if (has_input(stm) && has_output(stm) &&
+       stm->input_device.id != stm->output_device.id) {
+     r = audiounit_create_aggregate_device(stm);
+     if (r != CUBEB_OK) {
+       stm->aggregate_device_id = kAudioObjectUnknown;
+       LOG("(%p) Create aggregate devices failed.", stm);
+       // !!!NOTE: It is not necessary to return here. If it does not
+       // return it will fallback to the old implementation. The intention
+@@ -2610,16 +2656,20 @@ audiounit_setup_stream(cubeb_stream * st
+       // it after a couple of weeks.
+       return r;
+     } else {
+       in_dev_info.id = out_dev_info.id = stm->aggregate_device_id;
+       in_dev_info.flags = DEV_INPUT;
+       out_dev_info.flags = DEV_OUTPUT;
+     }
+   }
++  #else
++  in_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_INPUT;
++  out_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_OUTPUT;
++  #endif
+ 
+   if (has_input(stm)) {
+     r = audiounit_create_unit(&stm->input_unit, &in_dev_info);
+     if (r != CUBEB_OK) {
+       LOG("(%p) AudioUnit creation for input failed.", stm);
+       return r;
+     }
+   }
+@@ -2751,18 +2801,20 @@ audiounit_setup_stream(cubeb_stream * st
+ 
+   if (stm->output_unit != NULL) {
+     r = AudioUnitInitialize(stm->output_unit);
+     if (r != noErr) {
+       LOG("AudioUnitInitialize/output rv=%d", r);
+       return CUBEB_ERROR;
+     }
+ 
++    #if !TARGET_OS_IPHONE
+     stm->current_latency_frames = audiounit_get_device_presentation_latency(
+         stm->output_device.id, kAudioDevicePropertyScopeOutput);
++    #endif
+ 
+     Float64 unit_s;
+     UInt32 size = sizeof(unit_s);
+     if (AudioUnitGetProperty(stm->output_unit, kAudioUnitProperty_Latency,
+                              kAudioUnitScope_Global, 0, &unit_s,
+                              &size) == noErr) {
+       stm->current_latency_frames +=
+           static_cast<uint32_t>(unit_s * stm->output_desc.mSampleRate);
+@@ -2772,20 +2824,22 @@ audiounit_setup_stream(cubeb_stream * st
+   if (stm->input_unit && stm->output_unit) {
+     // According to the I/O hardware rate it is expected a specific pattern of
+     // callbacks for example is input is 44100 and output is 48000 we expected
+     // no more than 2 out callback in a row.
+     stm->expected_output_callbacks_in_a_row =
+         ceilf(stm->output_hw_rate / stm->input_hw_rate);
+   }
+ 
++  #if !TARGET_OS_IPHONE
+   r = audiounit_install_device_changed_callback(stm);
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not install all device change callback.", stm);
+   }
++  #endif
+ 
+   return CUBEB_OK;
+ }
+ 
+ cubeb_stream::cubeb_stream(cubeb * context)
+     : context(context), resampler(nullptr, cubeb_resampler_destroy),
+       mixer(nullptr, cubeb_mixer_destroy)
+ {
+@@ -2823,51 +2877,57 @@ audiounit_stream_init(cubeb * context, c
+   stm->latency_frames = latency_frames;
+ 
+   if ((input_device && !input_stream_params) ||
+       (output_device && !output_stream_params)) {
+     return CUBEB_ERROR_INVALID_PARAMETER;
+   }
+   if (input_stream_params) {
+     stm->input_stream_params = *input_stream_params;
++  #if !TARGET_OS_IPHONE
+     r = audiounit_set_device_info(
+         stm.get(), reinterpret_cast<uintptr_t>(input_device), io_side::INPUT);
+     if (r != CUBEB_OK) {
+       LOG("(%p) Fail to set device info for input.", stm.get());
+       return r;
+     }
++  #endif
+   }
+   if (output_stream_params) {
+     stm->output_stream_params = *output_stream_params;
++  #if !TARGET_OS_IPHONE
+     r = audiounit_set_device_info(
+         stm.get(), reinterpret_cast<uintptr_t>(output_device), io_side::OUTPUT);
+     if (r != CUBEB_OK) {
+       LOG("(%p) Fail to set device info for output.", stm.get());
+       return r;
+     }
++  #endif
+   }
+ 
+   {
+     // It's not critical to lock here, because no other thread has been started
+     // yet, but it allows to assert that the lock has been taken in
+     // `audiounit_setup_stream`.
+     auto_lock lock(stm->mutex);
+     r = audiounit_setup_stream(stm.get());
+   }
+ 
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not setup the audiounit stream.", stm.get());
+     return r;
+   }
+ 
++  #if !TARGET_OS_IPHONE
+   r = audiounit_install_system_changed_callback(stm.get());
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not install the device change callback.", stm.get());
+     return r;
+   }
++  #endif
+ 
+   *stream = stm.release();
+   LOG("(%p) Cubeb stream init successful.", *stream);
+   return CUBEB_OK;
+ }
+ 
+ static void
+ audiounit_close_stream(cubeb_stream * stm)
+@@ -2886,54 +2946,60 @@ audiounit_close_stream(cubeb_stream * st
+     AudioUnitUninitialize(stm->output_unit);
+     AudioComponentInstanceDispose(stm->output_unit);
+     stm->output_unit = nullptr;
+   }
+ 
+   stm->resampler.reset();
+   stm->mixer.reset();
+ 
++  #if !TARGET_OS_IPHONE
+   if (stm->aggregate_device_id != kAudioObjectUnknown) {
+     audiounit_destroy_aggregate_device(stm->plugin_id,
+                                        &stm->aggregate_device_id);
+     stm->aggregate_device_id = kAudioObjectUnknown;
+   }
++  #endif
+ }
+ 
+ static void
+ audiounit_stream_destroy_internal(cubeb_stream * stm)
+ {
+   stm->context->mutex.assert_current_thread_owns();
+ 
++#if !TARGET_OS_IPHONE
+   int r = audiounit_uninstall_system_changed_callback(stm);
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not uninstall the device changed callback", stm);
+   }
+   r = audiounit_uninstall_device_changed_callback(stm);
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not uninstall all device change listeners", stm);
+   }
++#endif
+ 
+   auto_lock lock(stm->mutex);
+   audiounit_close_stream(stm);
+   assert(audiounit_active_streams(stm->context) >= 1);
+   audiounit_decrement_active_streams(stm->context);
+ }
+ 
+ static void
+ audiounit_stream_destroy(cubeb_stream * stm)
+ {
++  #if !TARGET_OS_IPHONE
+   int r = audiounit_uninstall_system_changed_callback(stm);
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not uninstall the device changed callback", stm);
+   }
+   r = audiounit_uninstall_device_changed_callback(stm);
+   if (r != CUBEB_OK) {
+     LOG("(%p) Could not uninstall all device change listeners", stm);
+   }
++  #endif
+ 
+   if (!stm->shutdown.load()) {
+     auto_lock context_lock(stm->context->mutex);
+     audiounit_stream_stop_internal(stm);
+     stm->shutdown = true;
+   }
+ 
+   stm->destroy_pending = true;
+@@ -3081,16 +3147,17 @@ convert_uint32_into_string(UInt32 data)
+   // Reverse 0xWXYZ into 0xZYXW.
+   str[0] = (char)(data >> 24);
+   str[1] = (char)(data >> 16);
+   str[2] = (char)(data >> 8);
+   str[3] = (char)(data);
+   return str;
+ }
+ 
++#if !TARGET_OS_IPHONE
+ int
+ audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
+ {
+   AudioDeviceID id = audiounit_get_default_device_id(type);
+   if (id == kAudioObjectUnknown) {
+     return CUBEB_ERROR;
+   }
+ 
+@@ -3102,38 +3169,43 @@ audiounit_get_default_device_datasource(
+                                       : &OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS,
+       0, NULL, &size, data);
+   if (r != noErr) {
+     *data = 0;
+   }
+ 
+   return CUBEB_OK;
+ }
++#endif
+ 
+ int
+ audiounit_get_default_device_name(cubeb_stream * stm,
+                                   cubeb_device * const device,
+                                   cubeb_device_type type)
+ {
++#if TARGET_OS_IPHONE
++  return CUBEB_ERROR_NOT_SUPPORTED;
++#else
+   assert(stm);
+   assert(device);
+ 
+   UInt32 data;
+   int r = audiounit_get_default_device_datasource(type, &data);
+   if (r != CUBEB_OK) {
+     return r;
+   }
+   char ** name = type == CUBEB_DEVICE_TYPE_INPUT ? &device->input_name
+                                                  : &device->output_name;
+   *name = convert_uint32_into_string(data).release();
+   if (!strlen(*name)) { // empty string.
+     LOG("(%p) name of %s device is empty!", stm,
+         type == CUBEB_DEVICE_TYPE_INPUT ? "input" : "output");
+   }
+   return CUBEB_OK;
++  #endif
+ }
+ 
+ int
+ audiounit_stream_get_current_device(cubeb_stream * stm,
+                                     cubeb_device ** const device)
+ {
+ #if TARGET_OS_IPHONE
+   // TODO
+@@ -3178,16 +3250,17 @@ audiounit_stream_register_device_changed
+   auto_lock dev_cb_lock(stream->device_changed_callback_lock);
+   /* Note: second register without unregister first causes 'nope' error.
+    * Current implementation requires unregister before register a new cb. */
+   assert(!device_changed_callback || !stream->device_changed_callback);
+   stream->device_changed_callback = device_changed_callback;
+   return CUBEB_OK;
+ }
+ 
++#if !TARGET_OS_IPHONE
+ static char *
+ audiounit_strref_to_cstr_utf8(CFStringRef strref)
+ {
+   CFIndex len, size;
+   char * ret;
+   if (strref == NULL) {
+     return NULL;
+   }
+@@ -3199,22 +3272,24 @@ audiounit_strref_to_cstr_utf8(CFStringRe
+ 
+   if (!CFStringGetCString(strref, ret, size, kCFStringEncodingUTF8)) {
+     delete[] ret;
+     ret = NULL;
+   }
+ 
+   return ret;
+ }
+-
++#endif
++
++#if !TARGET_OS_IPHONE
+ static uint32_t
+ audiounit_get_channel_count(AudioObjectID devid, AudioObjectPropertyScope scope)
+ {
+   AudioObjectPropertyAddress adr = {0, scope,
+-                                    kAudioObjectPropertyElementMaster};
++                                    kAudioObjectPropertyElementMain};
+   UInt32 size = 0;
+   uint32_t i, ret = 0;
+ 
+   adr.mSelector = kAudioDevicePropertyStreamConfiguration;
+ 
+   if (AudioObjectGetPropertyDataSize(devid, &adr, 0, NULL, &size) == noErr &&
+       size > 0) {
+     AudioBufferList * list = static_cast<AudioBufferList *>(alloca(size));
+@@ -3230,17 +3305,17 @@ audiounit_get_channel_count(AudioObjectI
+ 
+ static void
+ audiounit_get_available_samplerate(AudioObjectID devid,
+                                    AudioObjectPropertyScope scope,
+                                    uint32_t * min, uint32_t * max,
+                                    uint32_t * def)
+ {
+   AudioObjectPropertyAddress adr = {0, scope,
+-                                    kAudioObjectPropertyElementMaster};
++                                    kAudioObjectPropertyElementMain};
+ 
+   adr.mSelector = kAudioDevicePropertyNominalSampleRate;
+   if (AudioObjectHasProperty(devid, &adr)) {
+     UInt32 size = sizeof(Float64);
+     Float64 fvalue = 0.0;
+     if (AudioObjectGetPropertyData(devid, &adr, 0, NULL, &size, &fvalue) ==
+         noErr) {
+       *def = fvalue;
+@@ -3272,17 +3347,17 @@ audiounit_get_available_samplerate(Audio
+   }
+ }
+ 
+ static UInt32
+ audiounit_get_device_presentation_latency(AudioObjectID devid,
+                                           AudioObjectPropertyScope scope)
+ {
+   AudioObjectPropertyAddress adr = {0, scope,
+-                                    kAudioObjectPropertyElementMaster};
++                                    kAudioObjectPropertyElementMain};
+   UInt32 size, dev, stream = 0;
+   AudioStreamID sid[1];
+ 
+   adr.mSelector = kAudioDevicePropertyLatency;
+   size = sizeof(UInt32);
+   if (AudioObjectGetPropertyData(devid, &adr, 0, NULL, &size, &dev) != noErr) {
+     dev = 0;
+   }
+@@ -3297,28 +3372,32 @@ audiounit_get_device_presentation_latenc
+ 
+   return dev + stream;
+ }
+ 
+ static int
+ audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
+                                    AudioObjectID devid, cubeb_device_type type)
+ {
+-  AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMaster};
++  AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMain};
+   UInt32 size;
+ 
+   if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
+     adr.mScope = kAudioDevicePropertyScopeOutput;
+   } else if (type == CUBEB_DEVICE_TYPE_INPUT) {
+     adr.mScope = kAudioDevicePropertyScopeInput;
+   } else {
+     return CUBEB_ERROR;
+   }
+ 
++  #if TARGET_OS_IPHONE
++  UINT32 ch = 2;
++  #else
+   UInt32 ch = audiounit_get_channel_count(devid, adr.mScope);
++  #endif
+   if (ch == 0) {
+     return CUBEB_ERROR;
+   }
+ 
+   PodZero(dev_info, 1);
+ 
+   CFStringRef device_id_str = nullptr;
+   size = sizeof(CFStringRef);
+@@ -3412,17 +3491,26 @@ audiounit_create_device_from_hwdev(cubeb
+ 
+ bool
+ is_aggregate_device(cubeb_device_info * device_info)
+ {
+   assert(device_info->friendly_name);
+   return !strncmp(device_info->friendly_name, PRIVATE_AGGREGATE_DEVICE_NAME,
+                   strlen(PRIVATE_AGGREGATE_DEVICE_NAME));
+ }
+-
++#endif
++
++#if TARGET_OS_IPHONE
++static int
++audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
++                            cubeb_device_collection * collection)
++{
++  return CUBEB_ERROR_NOT_SUPPORTED;
++}
++#else
+ static int
+ audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
+                             cubeb_device_collection * collection)
+ {
+   vector<AudioObjectID> input_devs;
+   vector<AudioObjectID> output_devs;
+ 
+   // Count number of input and output devices.  This is not
+@@ -3478,29 +3566,35 @@ audiounit_enumerate_devices(cubeb * /* c
+ 
+ static void
+ audiounit_device_destroy(cubeb_device_info * device)
+ {
+   delete[] device->device_id;
+   delete[] device->friendly_name;
+   delete[] device->vendor_name;
+ }
++#endif
+ 
+ static int
+ audiounit_device_collection_destroy(cubeb * /* context */,
+                                     cubeb_device_collection * collection)
+ {
++  #if TARGET_OS_IPHONE
++  return CUBEB_ERROR_NOT_SUPPORTED;
++  #else
+   for (size_t i = 0; i < collection->count; i++) {
+     audiounit_device_destroy(&collection->device[i]);
+   }
+   delete[] collection->device;
+ 
+   return CUBEB_OK;
++  #endif
+ }
+ 
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_devices_of_type(cubeb_device_type devtype)
+ {
+   UInt32 size = 0;
+   OSStatus ret = AudioObjectGetPropertyDataSize(
+       kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS, 0, NULL, &size);
+   if (ret != noErr) {
+     return vector<AudioObjectID>();
+@@ -3653,17 +3747,28 @@ audiounit_remove_device_listener(cubeb *
+       context->output_collection_changed_callback) {
+     return noErr;
+   }
+   /* Note: unregister a non registered cb is not a problem, not checking. */
+   return AudioObjectRemovePropertyListener(
+       kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS,
+       audiounit_collection_changed_callback, context);
+ }
+-
++#endif
++
++#if TARGET_OS_IPHONE
++int
++audiounit_register_device_collection_changed(
++    cubeb * context, cubeb_device_type devtype,
++    cubeb_device_collection_changed_callback collection_changed_callback,
++    void * user_ptr)
++{
++  return CUBEB_ERROR_NOT_SUPPORTED;
++}
++#else
+ int
+ audiounit_register_device_collection_changed(
+     cubeb * context, cubeb_device_type devtype,
+     cubeb_device_collection_changed_callback collection_changed_callback,
+     void * user_ptr)
+ {
+   if (devtype == CUBEB_DEVICE_TYPE_UNKNOWN) {
+     return CUBEB_ERROR_INVALID_PARAMETER;
+@@ -3673,16 +3778,17 @@ audiounit_register_device_collection_cha
+   if (collection_changed_callback) {
+     ret = audiounit_add_device_listener(context, devtype,
+                                         collection_changed_callback, user_ptr);
+   } else {
+     ret = audiounit_remove_device_listener(context, devtype);
+   }
+   return (ret == noErr) ? CUBEB_OK : CUBEB_ERROR;
+ }
++#endif
+ 
+ cubeb_ops const audiounit_ops = {
+     /*.init =*/audiounit_init,
+     /*.get_backend_id =*/audiounit_get_backend_id,
+     /*.get_max_channel_count =*/audiounit_get_max_channel_count,
+     /*.get_min_latency =*/audiounit_get_min_latency,
+     /*.get_preferred_sample_rate =*/audiounit_get_preferred_sample_rate,
+     /*.get_supported_input_processing_params =*/NULL,
diff --git a/media/libcubeb/0005-aaudio-timing-fix.patch b/media/libcubeb/0005-aaudio-timing-fix.patch
new file mode 100644
index 0000000000..aabaec9c50
--- /dev/null
+++ b/media/libcubeb/0005-aaudio-timing-fix.patch
@@ -0,0 +1,57 @@
+From 19fcbefe1a9c5e22f8111af251df27b41658bc77 Mon Sep 17 00:00:00 2001
+From: John Lin <jolin@mozilla.com>
+Date: Mon, 29 Apr 2024 13:46:57 -0700
+Subject: [PATCH] Invalidate timing info buffers when destorying AAudio stream.
+
+aaudio_stream_get_position() returns incorrect result because
+aaudio_stream_init() recycled destroyed stream where the
+timing_info buffers contain stale data.
+---
+ src/cubeb_aaudio.cpp        | 2 ++
+ src/cubeb_triple_buffer.h   | 7 +++++++
+ test/test_triple_buffer.cpp | 3 +++
+ 3 files changed, 12 insertions(+)
+
+diff --git a/src/cubeb_aaudio.cpp b/src/cubeb_aaudio.cpp
+index cfae2d6f..8b5eb231 100644
+--- a/src/cubeb_aaudio.cpp
++++ b/src/cubeb_aaudio.cpp
+@@ -1049,6 +1049,8 @@ aaudio_stream_destroy_locked(cubeb_stream * stm, lock_guard<mutex> & lock)
+     stm->istream = nullptr;
+   }
+ 
++  stm->timing_info.invalidate();
++
+   if (stm->resampler) {
+     cubeb_resampler_destroy(stm->resampler);
+     stm->resampler = nullptr;
+diff --git a/src/cubeb_triple_buffer.h b/src/cubeb_triple_buffer.h
+index a5a5978f..759b92e6 100644
+--- a/src/cubeb_triple_buffer.h
++++ b/src/cubeb_triple_buffer.h
+@@ -42,6 +42,13 @@ template <typename T> class triple_buffer {
+   {
+     return (shared_state.load(std::memory_order_relaxed) & BACK_DIRTY_BIT) != 0;
+   }
++  // Reset state and indices to initial values.
++  void invalidate()
++  {
++    shared_state.store(0, std::memory_order_release);
++    input_idx = 1;
++    output_idx = 2;
++  }
+ 
+ private:
+   // Publish a value to the consumer. Returns true if the data was overwritten
+diff --git a/test/test_triple_buffer.cpp b/test/test_triple_buffer.cpp
+index a6e0049b..d463c07e 100644
+--- a/test/test_triple_buffer.cpp
++++ b/test/test_triple_buffer.cpp
+@@ -64,4 +64,7 @@ TEST(cubeb, triple_buffer)
+   }
+ 
+   t.join();
++
++  buffer.invalidate();
++  ASSERT_FALSE(buffer.updated());
+ }
diff --git a/media/libcubeb/moz.yaml b/media/libcubeb/moz.yaml
index d79e64b5eb..3444bdb1d6 100644
--- a/media/libcubeb/moz.yaml
+++ b/media/libcubeb/moz.yaml
@@ -20,6 +20,8 @@ vendoring:
     - 0001-disable-aaudio-before-android-31.patch
     - 0002-disable-crash-reporter-death-test.patch
     - 0003-Only-build-duplex_collection_change_no_unregister-wh.patch
+    - 0004-audiounit-ios-compile-fixes.patch
+    - 0005-aaudio-timing-fix.patch
   skip-vendoring-steps:
     - update-moz-build
   exclude:
diff --git a/media/libcubeb/src/cubeb_aaudio.cpp b/media/libcubeb/src/cubeb_aaudio.cpp
index df19602cd6..c2441bbeef 100644
--- a/media/libcubeb/src/cubeb_aaudio.cpp
+++ b/media/libcubeb/src/cubeb_aaudio.cpp
@@ -1039,6 +1039,8 @@ aaudio_stream_destroy_locked(cubeb_stream * stm, lock_guard<mutex> & lock)
     stm->istream = nullptr;
   }
 
+  stm->timing_info.invalidate();
+
   if (stm->resampler) {
     cubeb_resampler_destroy(stm->resampler);
     stm->resampler = nullptr;
diff --git a/media/libcubeb/src/cubeb_audiounit.cpp b/media/libcubeb/src/cubeb_audiounit.cpp
index d823e80ff8..fb15790159 100644
--- a/media/libcubeb/src/cubeb_audiounit.cpp
+++ b/media/libcubeb/src/cubeb_audiounit.cpp
@@ -41,6 +41,15 @@ using namespace std;
 typedef UInt32 AudioFormatFlags;
 #endif
 
+#if TARGET_OS_IPHONE
+typedef UInt32 AudioDeviceID;
+typedef UInt32 AudioObjectID;
+const UInt32 kAudioObjectUnknown = 0;
+
+#define AudioGetCurrentHostTime mach_absolute_time
+
+#endif
+
 #define AU_OUT_BUS 0
 #define AU_IN_BUS 1
 
@@ -65,6 +74,7 @@ const char * PRIVATE_AGGREGATE_DEVICE_NAME = "CubebAggregateDevice";
                    LOG(msg, ##__VA_ARGS__);                                    \
                  })
 
+#if !TARGET_OS_IPHONE
 /* Testing empirically, some headsets report a minimal latency that is very
  * low, but this does not work in practice. Lie and say the minimum is 256
  * frames. */
@@ -73,27 +83,28 @@ const uint32_t SAFE_MAX_LATENCY_FRAMES = 512;
 
 const AudioObjectPropertyAddress DEFAULT_INPUT_DEVICE_PROPERTY_ADDRESS = {
     kAudioHardwarePropertyDefaultInputDevice, kAudioObjectPropertyScopeGlobal,
-    kAudioObjectPropertyElementMaster};
+    kAudioObjectPropertyElementMain};
 
 const AudioObjectPropertyAddress DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS = {
     kAudioHardwarePropertyDefaultOutputDevice, kAudioObjectPropertyScopeGlobal,
-    kAudioObjectPropertyElementMaster};
+    kAudioObjectPropertyElementMain};
 
 const AudioObjectPropertyAddress DEVICE_IS_ALIVE_PROPERTY_ADDRESS = {
     kAudioDevicePropertyDeviceIsAlive, kAudioObjectPropertyScopeGlobal,
-    kAudioObjectPropertyElementMaster};
+    kAudioObjectPropertyElementMain};
 
 const AudioObjectPropertyAddress DEVICES_PROPERTY_ADDRESS = {
     kAudioHardwarePropertyDevices, kAudioObjectPropertyScopeGlobal,
-    kAudioObjectPropertyElementMaster};
+    kAudioObjectPropertyElementMain};
 
 const AudioObjectPropertyAddress INPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
     kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeInput,
-    kAudioObjectPropertyElementMaster};
+    kAudioObjectPropertyElementMain};
 
 const AudioObjectPropertyAddress OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
     kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeOutput,
-    kAudioObjectPropertyElementMaster};
+    kAudioObjectPropertyElementMain};
+#endif
 
 typedef uint32_t device_flags_value;
 
@@ -114,22 +125,22 @@ static void
 audiounit_close_stream(cubeb_stream * stm);
 static int
 audiounit_setup_stream(cubeb_stream * stm);
+#if !TARGET_OS_IPHONE
 static vector<AudioObjectID>
 audiounit_get_devices_of_type(cubeb_device_type devtype);
 static UInt32
 audiounit_get_device_presentation_latency(AudioObjectID devid,
                                           AudioObjectPropertyScope scope);
-
-#if !TARGET_OS_IPHONE
 static AudioObjectID
 audiounit_get_default_device_id(cubeb_device_type type);
 static int
 audiounit_uninstall_device_changed_callback(cubeb_stream * stm);
 static int
 audiounit_uninstall_system_changed_callback(cubeb_stream * stm);
+#endif
+
 static void
 audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags);
-#endif
 
 extern cubeb_ops const audiounit_ops;
 
@@ -144,9 +155,11 @@ struct cubeb {
   cubeb_device_collection_changed_callback output_collection_changed_callback =
       nullptr;
   void * output_collection_changed_user_ptr = nullptr;
+  #if !TARGET_OS_IPHONE
   // Store list of devices to detect changes
   vector<AudioObjectID> input_device_array;
   vector<AudioObjectID> output_device_array;
+  #endif
   // The queue should be released when it’s no longer needed.
   dispatch_queue_t serial_queue =
       dispatch_queue_create(DISPATCH_QUEUE_LABEL, DISPATCH_QUEUE_SERIAL);
@@ -186,6 +199,7 @@ struct device_info {
   device_flags_value flags = DEV_UNKNOWN;
 };
 
+#if !TARGET_OS_IPHONE
 struct property_listener {
   AudioDeviceID device_id;
   const AudioObjectPropertyAddress * property_address;
@@ -199,6 +213,7 @@ struct property_listener {
   {
   }
 };
+#endif
 
 struct cubeb_stream {
   explicit cubeb_stream(cubeb * context);
@@ -257,22 +272,26 @@ struct cubeb_stream {
   /* This is true if a device change callback is currently running.  */
   atomic<bool> switching_device{false};
   atomic<bool> buffer_size_change_state{false};
+  #if !TARGET_OS_IPHONE
   AudioDeviceID aggregate_device_id =
       kAudioObjectUnknown; // the aggregate device id
   AudioObjectID plugin_id =
       kAudioObjectUnknown; // used to create aggregate device
+  #endif
   /* Mixer interface */
   unique_ptr<cubeb_mixer, decltype(&cubeb_mixer_destroy)> mixer;
   /* Buffer where remixing/resampling will occur when upmixing is required */
   /* Only accessed from callback thread */
   unique_ptr<uint8_t[]> temp_buffer;
   size_t temp_buffer_size = 0; // size in bytes.
+  #if !TARGET_OS_IPHONE
   /* Listeners indicating what system events are monitored. */
   unique_ptr<property_listener> default_input_listener;
   unique_ptr<property_listener> default_output_listener;
   unique_ptr<property_listener> input_alive_listener;
   unique_ptr<property_listener> input_source_listener;
   unique_ptr<property_listener> output_source_listener;
+  #endif
 };
 
 bool
@@ -386,14 +405,6 @@ is_common_sample_rate(Float64 sample_rate)
          sample_rate == 88200 || sample_rate == 96000;
 }
 
-#if TARGET_OS_IPHONE
-typedef UInt32 AudioDeviceID;
-typedef UInt32 AudioObjectID;
-
-#define AudioGetCurrentHostTime mach_absolute_time
-
-#endif
-
 uint64_t
 ConvertHostTimeToNanos(uint64_t host_time)
 {
@@ -761,13 +772,13 @@ audiounit_get_backend_id(cubeb * /* ctx */)
   return "audiounit";
 }
 
-#if !TARGET_OS_IPHONE
 
 static int
 audiounit_stream_get_volume(cubeb_stream * stm, float * volume);
 static int
 audiounit_stream_set_volume(cubeb_stream * stm, float volume);
 
+#if !TARGET_OS_IPHONE
 static int
 audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side)
 {
@@ -811,6 +822,7 @@ audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side)
 
   return CUBEB_OK;
 }
+#endif
 
 static int
 audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
@@ -822,10 +834,13 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
     audiounit_stream_stop_internal(stm);
   }
 
-  int r = audiounit_uninstall_device_changed_callback(stm);
+  int r;
+#if !TARGET_OS_IPHONE
+  r = audiounit_uninstall_device_changed_callback(stm);
   if (r != CUBEB_OK) {
     LOG("(%p) Could not uninstall all device change listeners.", stm);
   }
+#endif
 
   {
     auto_lock lock(stm->mutex);
@@ -837,6 +852,7 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
 
     audiounit_close_stream(stm);
 
+    #if !TARGET_OS_IPHONE
     /* Reinit occurs in one of the following case:
      * - When the device is not alive any more
      * - When the default system device change.
@@ -866,8 +882,11 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
       return CUBEB_ERROR;
     }
 
+    #endif
+
     if (audiounit_setup_stream(stm) != CUBEB_OK) {
       LOG("(%p) Stream reinit failed.", stm);
+      #if !TARGET_OS_IPHONE
       if (flags & DEV_INPUT && input_device != kAudioObjectUnknown) {
         // Attempt to re-use the same device-id failed, so attempt again with
         // default input device.
@@ -879,6 +898,7 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
           return CUBEB_ERROR;
         }
       }
+      #endif
     }
 
     if (vol_rv == CUBEB_OK) {
@@ -914,9 +934,11 @@ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags)
     }
 
     if (audiounit_reinit_stream(stm, flags) != CUBEB_OK) {
+      #if !TARGET_OS_IPHONE
       if (audiounit_uninstall_system_changed_callback(stm) != CUBEB_OK) {
         LOG("(%p) Could not uninstall system changed callback", stm);
       }
+      #endif
       stm->state_callback(stm, stm->user_ptr, CUBEB_STATE_ERROR);
       LOG("(%p) Could not reopen the stream after switching.", stm);
     }
@@ -925,6 +947,7 @@ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags)
   });
 }
 
+#if !TARGET_OS_IPHONE
 static char const *
 event_addr_to_string(AudioObjectPropertySelector selector)
 {
@@ -1096,6 +1119,7 @@ audiounit_install_device_changed_callback(cubeb_stream * stm)
   return r;
 }
 
+#if !TARGET_OS_IPHONE
 static int
 audiounit_install_system_changed_callback(cubeb_stream * stm)
 {
@@ -1136,6 +1160,7 @@ audiounit_install_system_changed_callback(cubeb_stream * stm)
 
   return CUBEB_OK;
 }
+#endif
 
 static int
 audiounit_uninstall_device_changed_callback(cubeb_stream * stm)
@@ -1212,7 +1237,7 @@ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range)
   AudioDeviceID output_device_id;
   AudioObjectPropertyAddress output_device_buffer_size_range = {
       kAudioDevicePropertyBufferFrameSizeRange, kAudioDevicePropertyScopeOutput,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
 
   output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
   if (output_device_id == kAudioObjectUnknown) {
@@ -1233,7 +1258,6 @@ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range)
 
   return CUBEB_OK;
 }
-#endif /* !TARGET_OS_IPHONE */
 
 static AudioObjectID
 audiounit_get_default_device_id(cubeb_device_type type)
@@ -1256,6 +1280,7 @@ audiounit_get_default_device_id(cubeb_device_type type)
 
   return devid;
 }
+#endif /* !TARGET_OS_IPHONE */
 
 int
 audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels)
@@ -1270,7 +1295,7 @@ audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels)
   AudioStreamBasicDescription stream_format;
   AudioObjectPropertyAddress stream_format_address = {
       kAudioDevicePropertyStreamFormat, kAudioDevicePropertyScopeOutput,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
 
   assert(ctx && max_channels);
 
@@ -1309,17 +1334,16 @@ audiounit_get_min_latency(cubeb * /* ctx */, cubeb_stream_params /* params */,
 
   *latency_frames =
       max<uint32_t>(latency_range.mMinimum, SAFE_MIN_LATENCY_FRAMES);
-#endif
-
   return CUBEB_OK;
+#endif
 }
 
 static int
 audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
 {
 #if TARGET_OS_IPHONE
-  // TODO
-  return CUBEB_ERROR_NOT_SUPPORTED;
+  *rate = 44100;
+  return CUBEB_OK;
 #else
   UInt32 size;
   OSStatus r;
@@ -1327,7 +1351,7 @@ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
   AudioDeviceID output_device_id;
   AudioObjectPropertyAddress samplerate_address = {
       kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
 
   output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
   if (output_device_id == kAudioObjectUnknown) {
@@ -1343,8 +1367,9 @@ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
   }
 
   *rate = static_cast<uint32_t>(fsamplerate);
-#endif
+
   return CUBEB_OK;
+#endif
 }
 
 static cubeb_channel_layout
@@ -1385,6 +1410,9 @@ audiounit_convert_channel_layout(AudioChannelLayout * layout)
 static cubeb_channel_layout
 audiounit_get_preferred_channel_layout(AudioUnit output_unit)
 {
+  #if TARGET_OS_IPHONE
+    return CUBEB_LAYOUT_STEREO;
+  #else
   OSStatus rv = noErr;
   UInt32 size = 0;
   rv = AudioUnitGetPropertyInfo(
@@ -1409,6 +1437,7 @@ audiounit_get_preferred_channel_layout(AudioUnit output_unit)
   }
 
   return audiounit_convert_channel_layout(layout.get());
+  #endif
 }
 
 static cubeb_channel_layout
@@ -1442,8 +1471,10 @@ audiounit_get_current_channel_layout(AudioUnit output_unit)
 static int
 audiounit_create_unit(AudioUnit * unit, device_info * device);
 
+#if !TARGET_OS_IPHONE
 static OSStatus
 audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype);
+#endif
 
 static void
 audiounit_destroy(cubeb * ctx)
@@ -1465,6 +1496,7 @@ audiounit_destroy(cubeb * ctx)
            !ctx->output_collection_changed_callback &&
            !ctx->output_collection_changed_user_ptr);
 
+    #if !TARGET_OS_IPHONE
     /* Unregister the callback if necessary. */
     if (ctx->input_collection_changed_callback) {
       audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_INPUT);
@@ -1472,6 +1504,7 @@ audiounit_destroy(cubeb * ctx)
     if (ctx->output_collection_changed_callback) {
       audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_OUTPUT);
     }
+    #endif
   }
 
   dispatch_release(ctx->serial_queue);
@@ -1599,13 +1632,14 @@ audiounit_layout_init(cubeb_stream * stm, io_side side)
                                stm->context->layout);
 }
 
+#if !TARGET_OS_IPHONE
 static vector<AudioObjectID>
 audiounit_get_sub_devices(AudioDeviceID device_id)
 {
   vector<AudioDeviceID> sub_devices;
   AudioObjectPropertyAddress property_address = {
       kAudioAggregateDevicePropertyActiveSubDeviceList,
-      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
   UInt32 size = 0;
   OSStatus rv = AudioObjectGetPropertyDataSize(device_id, &property_address, 0,
                                                nullptr, &size);
@@ -1634,7 +1668,7 @@ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id,
 {
   AudioObjectPropertyAddress address_plugin_bundle_id = {
       kAudioHardwarePropertyPlugInForBundleID, kAudioObjectPropertyScopeGlobal,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
   UInt32 size = 0;
   OSStatus r = AudioObjectGetPropertyDataSize(
       kAudioObjectSystemObject, &address_plugin_bundle_id, 0, NULL, &size);
@@ -1664,7 +1698,7 @@ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id,
 
   AudioObjectPropertyAddress create_aggregate_device_address = {
       kAudioPlugInCreateAggregateDevice, kAudioObjectPropertyScopeGlobal,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
   r = AudioObjectGetPropertyDataSize(
       *plugin_id, &create_aggregate_device_address, 0, nullptr, &size);
   if (r != noErr) {
@@ -1736,7 +1770,7 @@ get_device_name(AudioDeviceID id)
   CFStringRef UIname = nullptr;
   AudioObjectPropertyAddress address_uuid = {kAudioDevicePropertyDeviceUID,
                                              kAudioObjectPropertyScopeGlobal,
-                                             kAudioObjectPropertyElementMaster};
+                                             kAudioObjectPropertyElementMain};
   OSStatus err =
       AudioObjectGetPropertyData(id, &address_uuid, 0, nullptr, &size, &UIname);
   return (err == noErr) ? UIname : NULL;
@@ -1779,7 +1813,7 @@ audiounit_set_aggregate_sub_device_list(AudioDeviceID aggregate_device_id,
 
   AudioObjectPropertyAddress aggregate_sub_device_list = {
       kAudioAggregateDevicePropertyFullSubDeviceList,
-      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
   UInt32 size = sizeof(CFMutableArrayRef);
   OSStatus rv = AudioObjectSetPropertyData(
       aggregate_device_id, &aggregate_sub_device_list, 0, nullptr, size,
@@ -1801,7 +1835,7 @@ audiounit_set_master_aggregate_device(const AudioDeviceID aggregate_device_id)
   assert(aggregate_device_id != kAudioObjectUnknown);
   AudioObjectPropertyAddress master_aggregate_sub_device = {
       kAudioAggregateDevicePropertyMasterSubDevice,
-      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
 
   // Master become the 1st output sub device
   AudioDeviceID output_device_id =
@@ -1834,7 +1868,7 @@ audiounit_activate_clock_drift_compensation(
   assert(aggregate_device_id != kAudioObjectUnknown);
   AudioObjectPropertyAddress address_owned = {
       kAudioObjectPropertyOwnedObjects, kAudioObjectPropertyScopeGlobal,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
 
   UInt32 qualifier_data_size = sizeof(AudioObjectID);
   AudioClassID class_id = kAudioSubDeviceClassID;
@@ -1866,7 +1900,7 @@ audiounit_activate_clock_drift_compensation(
 
   AudioObjectPropertyAddress address_drift = {
       kAudioSubDevicePropertyDriftCompensation, kAudioObjectPropertyScopeGlobal,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
 
   // Start from the second device since the first is the master clock
   for (UInt32 i = 1; i < subdevices_num; ++i) {
@@ -1935,7 +1969,7 @@ audiounit_workaround_for_airpod(cubeb_stream * stm)
     Float64 rate = input_nominal_rate;
     AudioObjectPropertyAddress addr = {kAudioDevicePropertyNominalSampleRate,
                                        kAudioObjectPropertyScopeGlobal,
-                                       kAudioObjectPropertyElementMaster};
+                                       kAudioObjectPropertyElementMain};
 
     OSStatus rv = AudioObjectSetPropertyData(stm->aggregate_device_id, &addr, 0,
                                              nullptr, sizeof(Float64), &rate);
@@ -2019,7 +2053,7 @@ audiounit_destroy_aggregate_device(AudioObjectID plugin_id,
          plugin_id != kAudioObjectUnknown);
   AudioObjectPropertyAddress destroy_aggregate_device_addr = {
       kAudioPlugInDestroyAggregateDevice, kAudioObjectPropertyScopeGlobal,
-      kAudioObjectPropertyElementMaster};
+      kAudioObjectPropertyElementMain};
   UInt32 size;
   OSStatus rv = AudioObjectGetPropertyDataSize(
       plugin_id, &destroy_aggregate_device_addr, 0, NULL, &size);
@@ -2042,6 +2076,7 @@ audiounit_destroy_aggregate_device(AudioObjectID plugin_id,
   *aggregate_device_id = kAudioObjectUnknown;
   return CUBEB_OK;
 }
+#endif
 
 static int
 audiounit_new_unit_instance(AudioUnit * unit, device_info * device)
@@ -2178,6 +2213,9 @@ audiounit_init_input_linear_buffer(cubeb_stream * stream, uint32_t capacity)
 static uint32_t
 audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames)
 {
+  #if TARGET_OS_IPHONE
+  return latency_frames;
+  #else
   // For the 1st stream set anything within safe min-max
   assert(audiounit_active_streams(stm->context) > 0);
   if (audiounit_active_streams(stm->context) == 1) {
@@ -2238,8 +2276,10 @@ audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames)
 
   return max(min<uint32_t>(latency_frames, upper_latency_limit),
              SAFE_MIN_LATENCY_FRAMES);
+  #endif
 }
 
+#if !TARGET_OS_IPHONE
 /*
  * Change buffer size is prone to deadlock thus we change it
  * following the steps:
@@ -2290,11 +2330,15 @@ buffer_size_changed_callback(void * inClientData, AudioUnit inUnit,
   }
   }
 }
+#endif
 
 static int
 audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames,
                           io_side side)
 {
+  #if TARGET_OS_IPHONE
+  return CUBEB_OK;
+  #else
   AudioUnit au = stm->output_unit;
   AudioUnitScope au_scope = kAudioUnitScope_Input;
   AudioUnitElement au_element = AU_OUT_BUS;
@@ -2382,6 +2426,7 @@ audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames,
   LOG("(%p) %s buffer size changed to %u frames.", stm, to_string(side),
       new_size_frames);
   return CUBEB_OK;
+  #endif
 }
 
 static int
@@ -2598,6 +2643,7 @@ audiounit_setup_stream(cubeb_stream * stm)
   device_info in_dev_info = stm->input_device;
   device_info out_dev_info = stm->output_device;
 
+  #if !TARGET_OS_IPHONE
   if (has_input(stm) && has_output(stm) &&
       stm->input_device.id != stm->output_device.id) {
     r = audiounit_create_aggregate_device(stm);
@@ -2615,6 +2661,10 @@ audiounit_setup_stream(cubeb_stream * stm)
       out_dev_info.flags = DEV_OUTPUT;
     }
   }
+  #else
+  in_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_INPUT;
+  out_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_OUTPUT;
+  #endif
 
   if (has_input(stm)) {
     r = audiounit_create_unit(&stm->input_unit, &in_dev_info);
@@ -2756,8 +2806,10 @@ audiounit_setup_stream(cubeb_stream * stm)
       return CUBEB_ERROR;
     }
 
+    #if !TARGET_OS_IPHONE
     stm->current_latency_frames = audiounit_get_device_presentation_latency(
         stm->output_device.id, kAudioDevicePropertyScopeOutput);
+    #endif
 
     Float64 unit_s;
     UInt32 size = sizeof(unit_s);
@@ -2777,10 +2829,12 @@ audiounit_setup_stream(cubeb_stream * stm)
         ceilf(stm->output_hw_rate / stm->input_hw_rate);
   }
 
+  #if !TARGET_OS_IPHONE
   r = audiounit_install_device_changed_callback(stm);
   if (r != CUBEB_OK) {
     LOG("(%p) Could not install all device change callback.", stm);
   }
+  #endif
 
   return CUBEB_OK;
 }
@@ -2828,21 +2882,25 @@ audiounit_stream_init(cubeb * context, cubeb_stream ** stream,
   }
   if (input_stream_params) {
     stm->input_stream_params = *input_stream_params;
+  #if !TARGET_OS_IPHONE
     r = audiounit_set_device_info(
         stm.get(), reinterpret_cast<uintptr_t>(input_device), io_side::INPUT);
     if (r != CUBEB_OK) {
       LOG("(%p) Fail to set device info for input.", stm.get());
       return r;
     }
+  #endif
   }
   if (output_stream_params) {
     stm->output_stream_params = *output_stream_params;
+  #if !TARGET_OS_IPHONE
     r = audiounit_set_device_info(
         stm.get(), reinterpret_cast<uintptr_t>(output_device), io_side::OUTPUT);
     if (r != CUBEB_OK) {
       LOG("(%p) Fail to set device info for output.", stm.get());
       return r;
     }
+  #endif
   }
 
   {
@@ -2858,11 +2916,13 @@ audiounit_stream_init(cubeb * context, cubeb_stream ** stream,
     return r;
   }
 
+  #if !TARGET_OS_IPHONE
   r = audiounit_install_system_changed_callback(stm.get());
   if (r != CUBEB_OK) {
     LOG("(%p) Could not install the device change callback.", stm.get());
     return r;
   }
+  #endif
 
   *stream = stm.release();
   LOG("(%p) Cubeb stream init successful.", *stream);
@@ -2891,11 +2951,13 @@ audiounit_close_stream(cubeb_stream * stm)
   stm->resampler.reset();
   stm->mixer.reset();
 
+  #if !TARGET_OS_IPHONE
   if (stm->aggregate_device_id != kAudioObjectUnknown) {
     audiounit_destroy_aggregate_device(stm->plugin_id,
                                        &stm->aggregate_device_id);
     stm->aggregate_device_id = kAudioObjectUnknown;
   }
+  #endif
 }
 
 static void
@@ -2903,6 +2965,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)
 {
   stm->context->mutex.assert_current_thread_owns();
 
+#if !TARGET_OS_IPHONE
   int r = audiounit_uninstall_system_changed_callback(stm);
   if (r != CUBEB_OK) {
     LOG("(%p) Could not uninstall the device changed callback", stm);
@@ -2911,6 +2974,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)
   if (r != CUBEB_OK) {
     LOG("(%p) Could not uninstall all device change listeners", stm);
   }
+#endif
 
   auto_lock lock(stm->mutex);
   audiounit_close_stream(stm);
@@ -2921,6 +2985,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)
 static void
 audiounit_stream_destroy(cubeb_stream * stm)
 {
+  #if !TARGET_OS_IPHONE
   int r = audiounit_uninstall_system_changed_callback(stm);
   if (r != CUBEB_OK) {
     LOG("(%p) Could not uninstall the device changed callback", stm);
@@ -2929,6 +2994,7 @@ audiounit_stream_destroy(cubeb_stream * stm)
   if (r != CUBEB_OK) {
     LOG("(%p) Could not uninstall all device change listeners", stm);
   }
+  #endif
 
   if (!stm->shutdown.load()) {
     auto_lock context_lock(stm->context->mutex);
@@ -3086,6 +3152,7 @@ convert_uint32_into_string(UInt32 data)
   return str;
 }
 
+#if !TARGET_OS_IPHONE
 int
 audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
 {
@@ -3107,12 +3174,16 @@ audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
 
   return CUBEB_OK;
 }
+#endif
 
 int
 audiounit_get_default_device_name(cubeb_stream * stm,
                                   cubeb_device * const device,
                                   cubeb_device_type type)
 {
+#if TARGET_OS_IPHONE
+  return CUBEB_ERROR_NOT_SUPPORTED;
+#else
   assert(stm);
   assert(device);
 
@@ -3129,6 +3200,7 @@ audiounit_get_default_device_name(cubeb_stream * stm,
         type == CUBEB_DEVICE_TYPE_INPUT ? "input" : "output");
   }
   return CUBEB_OK;
+  #endif
 }
 
 int
@@ -3183,6 +3255,7 @@ audiounit_stream_register_device_changed_callback(
   return CUBEB_OK;
 }
 
+#if !TARGET_OS_IPHONE
 static char *
 audiounit_strref_to_cstr_utf8(CFStringRef strref)
 {
@@ -3204,12 +3277,14 @@ audiounit_strref_to_cstr_utf8(CFStringRef strref)
 
   return ret;
 }
+#endif
 
+#if !TARGET_OS_IPHONE
 static uint32_t
 audiounit_get_channel_count(AudioObjectID devid, AudioObjectPropertyScope scope)
 {
   AudioObjectPropertyAddress adr = {0, scope,
-                                    kAudioObjectPropertyElementMaster};
+                                    kAudioObjectPropertyElementMain};
   UInt32 size = 0;
   uint32_t i, ret = 0;
 
@@ -3235,7 +3310,7 @@ audiounit_get_available_samplerate(AudioObjectID devid,
                                    uint32_t * def)
 {
   AudioObjectPropertyAddress adr = {0, scope,
-                                    kAudioObjectPropertyElementMaster};
+                                    kAudioObjectPropertyElementMain};
 
   adr.mSelector = kAudioDevicePropertyNominalSampleRate;
   if (AudioObjectHasProperty(devid, &adr)) {
@@ -3277,7 +3352,7 @@ audiounit_get_device_presentation_latency(AudioObjectID devid,
                                           AudioObjectPropertyScope scope)
 {
   AudioObjectPropertyAddress adr = {0, scope,
-                                    kAudioObjectPropertyElementMaster};
+                                    kAudioObjectPropertyElementMain};
   UInt32 size, dev, stream = 0;
   AudioStreamID sid[1];
 
@@ -3302,7 +3377,7 @@ static int
 audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
                                    AudioObjectID devid, cubeb_device_type type)
 {
-  AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMaster};
+  AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMain};
   UInt32 size;
 
   if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
@@ -3313,7 +3388,11 @@ audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
     return CUBEB_ERROR;
   }
 
+  #if TARGET_OS_IPHONE
+  UINT32 ch = 2;
+  #else
   UInt32 ch = audiounit_get_channel_count(devid, adr.mScope);
+  #endif
   if (ch == 0) {
     return CUBEB_ERROR;
   }
@@ -3417,7 +3496,16 @@ is_aggregate_device(cubeb_device_info * device_info)
   return !strncmp(device_info->friendly_name, PRIVATE_AGGREGATE_DEVICE_NAME,
                   strlen(PRIVATE_AGGREGATE_DEVICE_NAME));
 }
+#endif
 
+#if TARGET_OS_IPHONE
+static int
+audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
+                            cubeb_device_collection * collection)
+{
+  return CUBEB_ERROR_NOT_SUPPORTED;
+}
+#else
 static int
 audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
                             cubeb_device_collection * collection)
@@ -3483,19 +3571,25 @@ audiounit_device_destroy(cubeb_device_info * device)
   delete[] device->friendly_name;
   delete[] device->vendor_name;
 }
+#endif
 
 static int
 audiounit_device_collection_destroy(cubeb * /* context */,
                                     cubeb_device_collection * collection)
 {
+  #if TARGET_OS_IPHONE
+  return CUBEB_ERROR_NOT_SUPPORTED;
+  #else
   for (size_t i = 0; i < collection->count; i++) {
     audiounit_device_destroy(&collection->device[i]);
   }
   delete[] collection->device;
 
   return CUBEB_OK;
+  #endif
 }
 
+#if !TARGET_OS_IPHONE
 static vector<AudioObjectID>
 audiounit_get_devices_of_type(cubeb_device_type devtype)
 {
@@ -3658,7 +3752,18 @@ audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype)
       kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS,
       audiounit_collection_changed_callback, context);
 }
+#endif
 
+#if TARGET_OS_IPHONE
+int
+audiounit_register_device_collection_changed(
+    cubeb * context, cubeb_device_type devtype,
+    cubeb_device_collection_changed_callback collection_changed_callback,
+    void * user_ptr)
+{
+  return CUBEB_ERROR_NOT_SUPPORTED;
+}
+#else
 int
 audiounit_register_device_collection_changed(
     cubeb * context, cubeb_device_type devtype,
@@ -3678,6 +3783,7 @@ audiounit_register_device_collection_changed(
   }
   return (ret == noErr) ? CUBEB_OK : CUBEB_ERROR;
 }
+#endif
 
 cubeb_ops const audiounit_ops = {
     /*.init =*/audiounit_init,
diff --git a/media/libcubeb/src/cubeb_triple_buffer.h b/media/libcubeb/src/cubeb_triple_buffer.h
index a5a5978fb4..759b92e62b 100644
--- a/media/libcubeb/src/cubeb_triple_buffer.h
+++ b/media/libcubeb/src/cubeb_triple_buffer.h
@@ -42,6 +42,13 @@ public:
   {
     return (shared_state.load(std::memory_order_relaxed) & BACK_DIRTY_BIT) != 0;
   }
+  // Reset state and indices to initial values.
+  void invalidate()
+  {
+    shared_state.store(0, std::memory_order_release);
+    input_idx = 1;
+    output_idx = 2;
+  }
 
 private:
   // Publish a value to the consumer. Returns true if the data was overwritten
diff --git a/media/libcubeb/src/moz.build b/media/libcubeb/src/moz.build
index d7d05b5867..46a89c4063 100644
--- a/media/libcubeb/src/moz.build
+++ b/media/libcubeb/src/moz.build
@@ -74,8 +74,8 @@ if CONFIG['MOZ_AUDIOUNIT_RUST']:
         SOURCES += [
             'cubeb_osx_run_loop.c',
         ]
+        DEFINES['USE_AUDIOUNIT_RUST'] = True
     DEFINES['USE_AUDIOUNIT'] = True
-    DEFINES['USE_AUDIOUNIT_RUST'] = True
 
 if CONFIG['MOZ_WASAPI']:
     SOURCES += [
diff --git a/media/libcubeb/test/test_triple_buffer.cpp b/media/libcubeb/test/test_triple_buffer.cpp
index a6e0049b79..d463c07e03 100644
--- a/media/libcubeb/test/test_triple_buffer.cpp
+++ b/media/libcubeb/test/test_triple_buffer.cpp
@@ -64,4 +64,7 @@ TEST(cubeb, triple_buffer)
   }
 
   t.join();
+
+  buffer.invalidate();
+  ASSERT_FALSE(buffer.updated());
 }
diff --git a/media/libdav1d/config.h b/media/libdav1d/config.h
index 218c8ae7f4..c7bdc7defc 100644
--- a/media/libdav1d/config.h
+++ b/media/libdav1d/config.h
@@ -46,7 +46,10 @@
 // Those values are copied from the auto generated
 // config file produced by stand alone dav1d build.
 #  define HAVE_AS_FUNC 0
+// Build with <sys/auxv.h> header only on Linux-specific systems.
+#if defined(__linux__)
 #  define HAVE_GETAUXVAL 1
+#endif
 #  define PIC 3
 #endif
 
diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml
index 22994fc7bf..ca526ea688 100644
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 4796b59fc0a459588183dc2ea199ba1074befc67 (2024-02-18T15:37:04.000+01:00).
+  release: 8e08426468a76d8a667e8a79d92bafd85d7411ac (2024-03-18T20:50:37.000+00:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 4796b59fc0a459588183dc2ea199ba1074befc67
+  revision: 8e08426468a76d8a667e8a79d92bafd85d7411ac
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index 1ac3f3ded3..af1770d5bd 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "4796b59fc0a459588183dc2ea199ba1074befc67"
+#define DAV1D_VERSION "8e08426468a76d8a667e8a79d92bafd85d7411ac"
diff --git a/media/libjxl/moz.yaml b/media/libjxl/moz.yaml
index 7b8d187ff4..ddf34a3dc9 100644
--- a/media/libjxl/moz.yaml
+++ b/media/libjxl/moz.yaml
@@ -10,9 +10,9 @@ origin:
 
   url: https://github.com/libjxl/libjxl
 
-  release: f06a34c77b1bd11bafbe82989241e68c756ccca2 (2024-03-11T15:14:53Z).
+  release: a5e4aa1fc1fe5bee252225a2616dccde7fd35da0 (2024-04-01T20:09:39Z).
 
-  revision: f06a34c77b1bd11bafbe82989241e68c756ccca2
+  revision: a5e4aa1fc1fe5bee252225a2616dccde7fd35da0
 
   license: Apache-2.0
 
diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c
index 06a53435b8..6785121ac9 100644
--- a/media/libopus/celt/arm/armcpu.c
+++ b/media/libopus/celt/arm/armcpu.c
@@ -96,7 +96,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
 /* Linux based */
 #include <stdio.h>
 
-opus_uint32 opus_cpu_capabilities(void)
+static opus_uint32 opus_cpu_capabilities(void)
 {
   opus_uint32 flags = 0;
   FILE *cpuinfo;
@@ -169,7 +169,7 @@ opus_uint32 opus_cpu_capabilities(void)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 
-opus_uint32 opus_cpu_capabilities(void)
+static opus_uint32 opus_cpu_capabilities(void)
 {
   opus_uint32 flags = 0;
 
@@ -191,6 +191,54 @@ opus_uint32 opus_cpu_capabilities(void)
   return flags;
 }
 
+#elif defined(__FreeBSD__)
+#include <sys/auxv.h>
+
+static opus_uint32 opus_cpu_capabilities(void)
+{
+  long hwcap = 0;
+  opus_uint32 flags = 0;
+
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+  /* FreeBSD requires armv6+, which always supports media instructions */
+  flags |= OPUS_CPU_ARM_MEDIA_FLAG;
+# endif
+
+  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
+
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  ifdef HWCAP_EDSP
+  if (hwcap & HWCAP_EDSP)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG;
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#   ifdef HWCAP_NEON
+  if (hwcap & HWCAP_NEON)
+    flags |= OPUS_CPU_ARM_NEON_FLAG;
+#   elif defined(HWCAP_ASIMD)
+  if (hwcap & HWCAP_ASIMD)
+    flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+#   endif
+#  endif
+#  if defined(OPUS_ARM_MAY_HAVE_DOTPROD) && defined(HWCAP_ASIMDDP)
+  if (hwcap & HWCAP_ASIMDDP)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+#  endif
+# endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
+  return (flags);
+}
+
 #else
 /* The feature registers which can tell us what the processor supports are
  * accessible in priveleged modes only, so we can't have a general user-space
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 8ae9be8d8f..1e5b6a4cb3 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -68,8 +68,22 @@ int opus_select_arch(void);
   Use this to work around those restrictions (which should hopefully all get
    optimized to a single MOVD instruction).
   GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
-  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
-#  if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754
+  LLVM implemented _mm_loadu_si32() since Clang 8.0, however the
+   __clang_major__ version number macro is unreliable, as vendors
+   (specifically, Apple) will use different numbering schemes than upstream.
+  Clang's advice is "use feature detection", but they do not provide feature
+   detection support for specific SIMD functions.
+  We follow the approach from the SIMDe project and instead detect unrelated
+   features that should be available in the version we want (see
+   <https://github.com/simd-everywhere/simde/blob/master/simde/simde-detect-clang.h>).*/
+#  if defined(__clang__)
+#   if __has_warning("-Wextra-semi-stmt") || \
+ __has_builtin(__builtin_rotateleft32)
+#    define OPUS_CLANG_8 (1)
+#   endif
+#  endif
+#  if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !defined(OPUS_CLANG_8)
 #   include <string.h>
 #   include <emmintrin.h>
 
diff --git a/media/libopus/moz.build b/media/libopus/moz.build
index 44c0ab7c90..c5b2021ba7 100644
--- a/media/libopus/moz.build
+++ b/media/libopus/moz.build
@@ -21,7 +21,7 @@ FINAL_LIBRARY = "gkcodecs"
 NoVisibilityFlags()
 
 DEFINES["OPUS_BUILD"] = True
-DEFINES["OPUS_VERSION"] = "ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa"
+DEFINES["OPUS_VERSION"] = "fdb198e88660721e289df94c29e91f70caff787e"
 DEFINES["USE_ALLOCA"] = True
 DEFINES["ENABLE_HARDENING"] = True
 
diff --git a/media/libopus/moz.yaml b/media/libopus/moz.yaml
index ed76d36d1f..7728a66c41 100644
--- a/media/libopus/moz.yaml
+++ b/media/libopus/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa (2024-03-04T11:53:07.000-05:00).
+  release: fdb198e88660721e289df94c29e91f70caff787e (2024-04-09T14:29:12.000-04:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa
+  revision: fdb198e88660721e289df94c29e91f70caff787e
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/media/libopus/silk/x86/NSQ_del_dec_avx2.c b/media/libopus/silk/x86/NSQ_del_dec_avx2.c
index 43485871a4..21f00c2dad 100644
--- a/media/libopus/silk/x86/NSQ_del_dec_avx2.c
+++ b/media/libopus/silk/x86/NSQ_del_dec_avx2.c
@@ -73,7 +73,6 @@ static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
 /* Intrinsics not defined on MSVC */
 #ifdef _MSC_VER
 #include <Intsafe.h>
-#define __m128i_u __m128i
 static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
 {
     *res = a+b;
@@ -959,7 +958,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
     {
         __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
         x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
-        _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
+        _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
     }
 
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
@@ -985,8 +984,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
         /* Scale long-term shaping state */
         for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
         {
-            __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i];
-            *p = silk_mm_smulww_epi32(*p, gain_adj_Q16);
+	    opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
+            _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16));
         }
 
         /* Scale long-term prediction state */
@@ -1041,13 +1040,13 @@ static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
         /* Allowing wrap around so that two wraps can cancel each other. The rare
            cases where the result wraps around can only be triggered by invalid streams*/
 
-        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8]));
-        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&      B[0]));
+        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-8]));
+        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&      B[0]));
         __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
         if (order > 10)
         {
-            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16]));
-            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B       [8]));
+            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-16]));
+            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&B       [8]));
             B_v  = silk_mm256_reverse_epi32(B_v);
         }
         else
diff --git a/media/libopus/src/opus_private.h b/media/libopus/src/opus_private.h
index 364c21cebc..279f5f95f6 100644
--- a/media/libopus/src/opus_private.h
+++ b/media/libopus/src/opus_private.h
@@ -214,7 +214,7 @@ int opus_multistream_decode_native(
 
 opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions);
 
-opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data  *extensions, int nb_extensions, int pad);
+opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data  *extensions, opus_int32 nb_extensions, int pad);
 
 opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len);
 
diff --git a/media/libopus/src/repacketizer.c b/media/libopus/src/repacketizer.c
index 6a7a8b3d8e..79798b0217 100644
--- a/media/libopus/src/repacketizer.c
+++ b/media/libopus/src/repacketizer.c
@@ -155,7 +155,8 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int
    /* incorporate any extensions from the repacketizer padding */
    for (i=begin;i<end;i++)
    {
-      int frame_ext_count, j;
+      int j;
+      opus_int32 frame_ext_count;
       frame_ext_count = total_ext_count - ext_count;
       int ret = opus_packet_extensions_parse(rp->paddings[i], rp->padding_len[i],
          &all_extensions[ext_count], &frame_ext_count);
diff --git a/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch
new file mode 100644
index 0000000000..4788b3996a
--- /dev/null
+++ b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch
@@ -0,0 +1,41 @@
+# HG changeset patch
+# User Chun-Min Chang <chun.m.chang@gmail.com>
+
+Bug 1888772 - Allow ARM CPU runtime detection code to build on OpenBSD
+
+diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c
+--- a/vpx_ports/aarch64_cpudetect.c
++++ b/vpx_ports/aarch64_cpudetect.c
+@@ -10,30 +10,30 @@
+
+ #include "./vpx_config.h"
+ #include "arm_cpudetect.h"
+
+ #if defined(__APPLE__)
+ #include <sys/sysctl.h>
+ #endif
+
+-#if !CONFIG_RUNTIME_CPU_DETECT
++#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
+
+ static int arm_get_cpu_caps(void) {
+   // This function should actually be a no-op. There is no way to adjust any of
+   // these because the RTCD tables do not exist: the functions are called
+   // statically.
+   int flags = 0;
+ #if HAVE_NEON
+   flags |= HAS_NEON;
+ #endif  // HAVE_NEON
+   return flags;
+ }
+
+-#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT
++#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
+
+ // sysctlbyname() parameter documentation for instruction set characteristics:
+ // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+ static INLINE int64_t have_feature(const char *feature) {
+   int64_t feature_present = 0;
+   size_t size = sizeof(feature_present);
+   if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) {
+     return 0;
diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm
index 47243ad198..7a1aaf999a 100644
--- a/media/libvpx/config/generic/vpx_config.asm
+++ b/media/libvpx/config/generic/vpx_config.asm
@@ -13,6 +13,7 @@
 .equ HAVE_NEON_DOTPROD ,  0
 .equ HAVE_NEON_I8MM ,  0
 .equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
 .equ HAVE_MIPS32 ,  0
 .equ HAVE_DSPR2 ,  0
 .equ HAVE_MSA ,  0
diff --git a/media/libvpx/config/generic/vpx_config.c b/media/libvpx/config/generic/vpx_config.c
index d1c3d1acd7..922edd1ea2 100644
--- a/media/libvpx/config/generic/vpx_config.c
+++ b/media/libvpx/config/generic/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512";
+static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --log=/home/cm/Work/gecko-dev/media/libvpx/config/generic/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h
index 774a531ed9..c885bb399a 100644
--- a/media/libvpx/config/generic/vpx_config.h
+++ b/media/libvpx/config/generic/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm
index ee43d0f922..6be2a7f7a2 100644
--- a/media/libvpx/config/linux/arm/vpx_config.asm
+++ b/media/libvpx/config/linux/arm/vpx_config.asm
@@ -13,6 +13,7 @@
 .equ HAVE_NEON_DOTPROD ,  0
 .equ HAVE_NEON_I8MM ,  0
 .equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
 .equ HAVE_MIPS32 ,  0
 .equ HAVE_DSPR2 ,  0
 .equ HAVE_MSA ,  0
diff --git a/media/libvpx/config/linux/arm/vpx_config.c b/media/libvpx/config/linux/arm/vpx_config.c
index c885d910c0..c634e2af66 100644
--- a/media/libvpx/config/linux/arm/vpx_config.c
+++ b/media/libvpx/config/linux/arm/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only";
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h
index bfd2c04e07..99a55f0ea9 100644
--- a/media/libvpx/config/linux/arm/vpx_config.h
+++ b/media/libvpx/config/linux/arm/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h
index 738de4f9f4..b7d828d446 100644
--- a/media/libvpx/config/linux/arm64/vp9_rtcd.h
+++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h
@@ -35,11 +35,13 @@ extern "C" {
 
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_neon
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
 int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_neon
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
 int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
@@ -96,6 +98,10 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_block_error = vp9_block_error_neon;
+    if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve;
+    vp9_block_error_fp = vp9_block_error_fp_neon;
+    if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve;
 }
 #endif
 
diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm
index 499c16202c..c51a76b3f6 100644
--- a/media/libvpx/config/linux/arm64/vpx_config.asm
+++ b/media/libvpx/config/linux/arm64/vpx_config.asm
@@ -13,6 +13,7 @@
 .equ HAVE_NEON_DOTPROD ,  1
 .equ HAVE_NEON_I8MM ,  1
 .equ HAVE_SVE ,  1
+.equ HAVE_SVE2 ,  1
 .equ HAVE_MIPS32 ,  0
 .equ HAVE_DSPR2 ,  0
 .equ HAVE_MSA ,  0
diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c
index 74baa0689c..c0d714503f 100644
--- a/media/libvpx/config/linux/arm64/vpx_config.c
+++ b/media/libvpx/config/linux/arm64/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only";
+static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm64/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h
index 3c5f2e33ca..12251ee0c1 100644
--- a/media/libvpx/config/linux/arm64/vpx_config.h
+++ b/media/libvpx/config/linux/arm64/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 1
 #define HAVE_NEON_I8MM 1
 #define HAVE_SVE 1
+#define HAVE_SVE2 1
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
index 5a9b05ca14..2c31ee4ef9 100644
--- a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
@@ -916,7 +916,8 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t di
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size);
-#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_neon
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size);
+RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
 
 void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
@@ -1148,6 +1149,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod;
     vpx_sse = vpx_sse_neon;
     if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod;
+    vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon;
+    if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve;
     vpx_variance16x16 = vpx_variance16x16_neon;
     if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod;
     vpx_variance16x32 = vpx_variance16x32_neon;
diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm
index eaa3950d37..5a92abf939 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.asm
+++ b/media/libvpx/config/linux/ia32/vpx_config.asm
@@ -10,6 +10,7 @@
 %define HAVE_NEON_DOTPROD 0
 %define HAVE_NEON_I8MM 0
 %define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/ia32/vpx_config.c b/media/libvpx/config/linux/ia32/vpx_config.c
index 6805ab62a8..7024ca989f 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.c
+++ b/media/libvpx/config/linux/ia32/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/ia32/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h
index 69fd63bf02..b4cc10a906 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.h
+++ b/media/libvpx/config/linux/ia32/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm
index 8715768a2e..148a894979 100644
--- a/media/libvpx/config/linux/x64/vpx_config.asm
+++ b/media/libvpx/config/linux/x64/vpx_config.asm
@@ -10,6 +10,7 @@
 %define HAVE_NEON_DOTPROD 0
 %define HAVE_NEON_I8MM 0
 %define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/x64/vpx_config.c b/media/libvpx/config/linux/x64/vpx_config.c
index e4dcb394c3..f38bd16290 100644
--- a/media/libvpx/config/linux/x64/vpx_config.c
+++ b/media/libvpx/config/linux/x64/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/x64/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h
index ab4439aaf4..d91509ad10 100644
--- a/media/libvpx/config/linux/x64/vpx_config.h
+++ b/media/libvpx/config/linux/x64/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm
index eaa3950d37..5a92abf939 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.asm
+++ b/media/libvpx/config/mac/ia32/vpx_config.asm
@@ -10,6 +10,7 @@
 %define HAVE_NEON_DOTPROD 0
 %define HAVE_NEON_I8MM 0
 %define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/ia32/vpx_config.c b/media/libvpx/config/mac/ia32/vpx_config.c
index 3e5d3ec0f3..2ee9d0ebb0 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.c
+++ b/media/libvpx/config/mac/ia32/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/ia32/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h
index 69fd63bf02..b4cc10a906 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.h
+++ b/media/libvpx/config/mac/ia32/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm
index 8715768a2e..148a894979 100644
--- a/media/libvpx/config/mac/x64/vpx_config.asm
+++ b/media/libvpx/config/mac/x64/vpx_config.asm
@@ -10,6 +10,7 @@
 %define HAVE_NEON_DOTPROD 0
 %define HAVE_NEON_I8MM 0
 %define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/x64/vpx_config.c b/media/libvpx/config/mac/x64/vpx_config.c
index 9a06646fdc..51fceeb6e3 100644
--- a/media/libvpx/config/mac/x64/vpx_config.c
+++ b/media/libvpx/config/mac/x64/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/x64/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h
index ab4439aaf4..d91509ad10 100644
--- a/media/libvpx/config/mac/x64/vpx_config.h
+++ b/media/libvpx/config/mac/x64/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm
index 24eb1a8cba..32d700f1bb 100644
--- a/media/libvpx/config/win/aarch64/vpx_config.asm
+++ b/media/libvpx/config/win/aarch64/vpx_config.asm
@@ -12,7 +12,8 @@
 .equ HAVE_NEON ,  1
 .equ HAVE_NEON_DOTPROD ,  1
 .equ HAVE_NEON_I8MM ,  1
-.equ HAVE_SVE ,  1
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
 .equ HAVE_MIPS32 ,  0
 .equ HAVE_DSPR2 ,  0
 .equ HAVE_MSA ,  0
diff --git a/media/libvpx/config/win/aarch64/vpx_config.c b/media/libvpx/config/win/aarch64/vpx_config.c
index 13cc13a95d..b8f4ec8754 100644
--- a/media/libvpx/config/win/aarch64/vpx_config.c
+++ b/media/libvpx/config/win/aarch64/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only";
+static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --disable-sve --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/aarch64/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h
index c3cc860f18..a81f868053 100644
--- a/media/libvpx/config/win/aarch64/vpx_config.h
+++ b/media/libvpx/config/win/aarch64/vpx_config.h
@@ -21,7 +21,8 @@
 #define HAVE_NEON 1
 #define HAVE_NEON_DOTPROD 1
 #define HAVE_NEON_I8MM 1
-#define HAVE_SVE 1
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm
index cb1aa7ce6a..9c7e3ce2c2 100755
--- a/media/libvpx/config/win/ia32/vpx_config.asm
+++ b/media/libvpx/config/win/ia32/vpx_config.asm
@@ -10,6 +10,7 @@
 %define HAVE_NEON_DOTPROD 0
 %define HAVE_NEON_I8MM 0
 %define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
diff --git a/media/libvpx/config/win/ia32/vpx_config.c b/media/libvpx/config/win/ia32/vpx_config.c
index 33c836213b..8cdd6c30b2 100644
--- a/media/libvpx/config/win/ia32/vpx_config.c
+++ b/media/libvpx/config/win/ia32/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/ia32/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h
index 9fe256f4ad..b62188c71c 100644
--- a/media/libvpx/config/win/ia32/vpx_config.h
+++ b/media/libvpx/config/win/ia32/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm
index a1d34d6d37..d5f5f3968e 100644
--- a/media/libvpx/config/win/x64/vpx_config.asm
+++ b/media/libvpx/config/win/x64/vpx_config.asm
@@ -10,6 +10,7 @@
 %define HAVE_NEON_DOTPROD 0
 %define HAVE_NEON_I8MM 0
 %define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c
index 8c04c1a3cf..57904c7dc6 100644
--- a/media/libvpx/config/win/x64/vpx_config.c
+++ b/media/libvpx/config/win/x64/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/x64/config.log";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h
index 068c6d2a99..448f13e4a1 100644
--- a/media/libvpx/config/win/x64/vpx_config.h
+++ b/media/libvpx/config/win/x64/vpx_config.h
@@ -22,6 +22,7 @@
 #define HAVE_NEON_DOTPROD 0
 #define HAVE_NEON_I8MM 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh
index ef9bc696f3..4efcb54aa1 100755
--- a/media/libvpx/generate_sources_mozbuild.sh
+++ b/media/libvpx/generate_sources_mozbuild.sh
@@ -169,7 +169,8 @@ function gen_rtcd_header {
 # $1 - Header file directory.
 # $2 - Config command line.
 function gen_config_files {
-  ./configure $2 > /dev/null
+  ./configure $2 --log=$BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log > /dev/null
+  echo "Log file: $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log"
 
   # Disable HAVE_UNISTD_H.
   ( echo '/HAVE_UNISTD_H'; echo 'd' ; echo 'w' ; echo 'q' ) | ed -s vpx_config.h
@@ -203,6 +204,7 @@ all_platforms="${all_platforms} --disable-avx512"
 x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm"
 arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only"
 arm64_platforms="--enable-realtime-only"
+disable_sve="--disable-sve" # Bug 1885585
 
 gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}"
 gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}"
@@ -213,7 +215,7 @@ gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platfor
 
 gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}"
 gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}"
-gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms}"
+gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms} ${disable_sve}" # Bug 1885585
 
 gen_config_files generic "--target=generic-gnu ${all_platforms}"
 
@@ -236,7 +238,7 @@ gen_rtcd_header win/ia32 x86
 
 gen_rtcd_header linux/arm armv7
 gen_rtcd_header linux/arm64 arm64
-gen_rtcd_header win/aarch64 arm64
+gen_rtcd_header win/aarch64 arm64 $disable_sve # Bug 1885585
 
 gen_rtcd_header generic generic
 
@@ -275,6 +277,7 @@ config=$(print_config linux/arm64)
 make_clean
 make libvpx_srcs.txt target=libs $config > /dev/null
 convert_srcs_to_project_files libvpx_srcs.txt ARM64
+# Bug 1885585: The sve files will be excluded from the win/aarch64 build in moz.build.
 
 echo "Generate generic source list."
 config=$(print_config generic)
diff --git a/media/libvpx/input_frame_validation.patch b/media/libvpx/input_frame_validation.patch
index 1cb33e192f..37f755e022 100644
--- a/media/libvpx/input_frame_validation.patch
+++ b/media/libvpx/input_frame_validation.patch
@@ -8,15 +8,15 @@ MozReview-Commit-ID: BxDCnJe0mzs
 diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
 --- a/vp8/vp8_cx_iface.c
 +++ b/vp8/vp8_cx_iface.c
-@@ -921,20 +921,29 @@ static vpx_codec_err_t vp8e_encode(vpx_c
-     dst_time_stamp =
-         pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
-     dst_end_time_stamp = (pts_val + (int64_t)duration) *
-                          ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+@@ -989,20 +989,29 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+             &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+             "conversion of relative pts + duration to ticks would overflow");
+       }
+       dst_end_time_stamp =
+           pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
 
-     if (img != NULL) {
        res = image2yuvconfig(img, &sd);
- 
+
 -      if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
 -                                dst_time_stamp, dst_end_time_stamp)) {
 -        VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap
index bb0ddd95b2..7206b5ebec 100644
--- a/media/libvpx/libvpx/.mailmap
+++ b/media/libvpx/libvpx/.mailmap
@@ -20,6 +20,7 @@ Hui Su <huisu@google.com>
 Jacky Chen <jackychen@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
 Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
 Johann <johann@duck.com> <johann.koenig@gmail.com>
@@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <Yaowu Xu>
 Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
 Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiwei Gu <guxiwei-hf@loongson.cn>
diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS
index 2db4a113e4..5515e26589 100644
--- a/media/libvpx/libvpx/AUTHORS
+++ b/media/libvpx/libvpx/AUTHORS
@@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com>
 Angie Chen <yunqi@google.com>
 Angie Chiang <angiebird@google.com>
 Anton Venema <anton.venema@liveswitch.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
 Birk Magnussen <birk.magnussen@googlemail.com>
@@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
 Cheng Chen <chengchen@google.com>
+Chen Wang <wangchen20@iscas.ac.cn>
+Cherma Rajan A <cherma.rajan@ittiam.com>
 Chi Yo Tsai <chiyotsai@google.com>
 chm <chm@rock-chips.com>
 Chris Cunningham <chcunningham@chromium.org>
@@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com>
 Fyodor Kyslov <kyslov@google.com>
 Gabriel Marin <gmx@chromium.org>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+George Steed <george.steed@arm.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
 Geza Lore <gezalore@gmail.com>
 Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
@@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn>
 Jingning Han <jingning@google.com>
 Joel Fernandes <joelaf@google.com>
 Joey Parrish <joeyparrish@google.com>
+Johann <johann@duck.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
@@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net>
 Konstantinos Margaritis <konma@vectorcamp.gr>
 Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
+L. E. Segovia <amy@amyspark.me>
 Linfeng Zhang <linfengz@google.com>
 Liu Peng <pengliu.mail@gmail.com>
 Lou Quillio <louquillio@google.com>
@@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com>
 Moriyoshi Koizumi <mozo@mozo.jp>
 Morton Jonuschat <yabawock@gmail.com>
 Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
 Neil Birkbeck <neil.birkbeck@gmail.com>
 Nico Weber <thakis@chromium.org>
 Niveditha Rau <niveditha.rau@gmail.com>
@@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org>
 Vlad Tsyrklevich <vtsyrklevich@chromium.org>
 Wan-Teh Chang <wtc@google.com>
 Wonkap Jang <wonkap@google.com>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiahong Bao <xiahong.bao@nxp.com>
+Xiwei Gu <guxiwei-hf@loongson.cn>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG
index 21070785ed..87f0d7f708 100644
--- a/media/libvpx/libvpx/CHANGELOG
+++ b/media/libvpx/libvpx/CHANGELOG
@@ -1,7 +1,79 @@
-20yy-mm-dd v1.14.0 "V Duck"
+2024-01-02 v1.14.0 "Venetian Duck"
   This release drops support for old C compilers, such as Visual Studio 2012
   and older, that disallow mixing variable declarations and statements (a C99
-  feature).
+  feature). It adds support for run-time CPU feature detection for Arm
+  platforms, as well as support for darwin23 (macOS 14).
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+
+    Various new features for rate control library for real-time: SVC parallel
+    encoding, loopfilter level, support for frame dropping, and screen content.
+
+    New callback function send_tpl_gop_stats for vp9 external rate control
+    library, which can be used to transmit TPL stats for a group of pictures. A
+    public header vpx_tpl.h is added for the definition of TPL stats used in
+    this callback.
+
+    libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c.
+
+  - Enhancement:
+    Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8,
+    68%-151% speed up for high bitdepth.
+
+    Improvements on AVX2 and SSE optimizations.
+    Improvements on LSX optimizations for LoongArch.
+    42-49% speedup on speed 0 VoD encoding.
+    Android API level predicates.
+
+  - Bug fixes:
+    Fix to missing prototypes from the rtcd header.
+    Fix to segfault when total size is enlarged but width is smaller.
+    Fix to the build for arm64ec using MSVC.
+    Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic.
+    Fix to -Wshadow warnings.
+    Fix to heap overflow in vpx_get4x4sse_cs_neon.
+    Fix to buffer overrun in highbd Neon subpel variance filters.
+    Added bitexact encode test script.
+    Fix to -Wl,-z,defs with Clang's sanitizers.
+    Fix to decoder stability after error & continued decoding.
+    Fix to mismatch of VP9 encode with NEON intrinsics with C only version.
+    Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon.
+    Fix to fragments count before use.
+    Fix to a case where target bandwidth is 0 for SVC.
+    Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob.
+    Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr.
+    Fix to integer overflow in vp8,ratectrl.c.
+    Fix to integer overflow in vp9 svc.
+    Fix to avg_frame_bandwidth overflow.
+    Fix to per frame qp for temporal layers.
+    Fix to unsigned integer overflow in sse computation.
+    Fix to uninitialized mesh feature for BEST mode.
+    Fix to overflow in highbd temporal_filter.
+    Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon.
+    Skip arm64_neon.h workaround w/VS >= 2019.
+    Fix to c vs avx mismatch of diamond_search_sad().
+    Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function.
+    Fix to a bug in vpx_hadamard_32x32_neon().
+    Fix to Clang -Wunreachable-code-aggressive warnings.
+    Fix to a bug in vpx_highbd_hadamard_32x32_neon().
+    Fix to -Wunreachable-code in mfqe_partition.
+    Force mode search on 64x64 if no mode is selected.
+    Fix to ubsan failure caused by left shift of negative.
+    Fix to integer overflow in calc_pframe_target_size.
+    Fix to float-cast-overflow in vp8_change_config().
+    Fix to a null ptr before use.
+    Conditionally skip using inter frames in speed features.
+    Remove invalid reference frames.
+    Disable intra mode search speed features conditionally.
+    Set nonrd keyframe under dynamic change of deadline for rtc.
+    Fix to scaled reference offsets.
+    Set skip_recode=0 in nonrd_pick_sb_modes.
+    Fix to an edge case when downsizing to one.
+    Fix to a bug in frame scaling.
+    Fix to pred buffer stride.
+    Fix to a bug in simple motion search.
+    Update frame size in actual encoding.
 
 2023-09-29 v1.13.1 "Ugly Duckling"
   This release contains two security related fixes. One each for VP8 and VP9.
diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README
index 4c25b15d81..6dbd164c34 100644
--- a/media/libvpx/libvpx/README
+++ b/media/libvpx/libvpx/README
@@ -1,5 +1,3 @@
-v1.13.1 Ugly Duckling
-
 Welcome to the WebM VP8/VP9 Codec SDK!
 
 COMPILING THE APPLICATIONS/LIBRARIES:
@@ -183,6 +181,44 @@ CODE STYLE:
 
   See also: http://clang.llvm.org/docs/ClangFormat.html
 
+PROFILE GUIDED OPTIMIZATION (PGO)
+  Profile Guided Optimization can be enabled for Clang builds using the
+  commands:
+
+  $ export CC=clang
+  $ export CXX=clang++
+  $ ../libvpx/configure  --enable-profile
+  $ make
+
+  Generate one or multiple PGO profile files by running vpxdec or vpxenc. For
+  example:
+
+  $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \
+    -o - > /dev/null
+
+  To convert and merge the raw profile files, use the llvm-profdata tool:
+
+  $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw
+
+  Then, rebuild the project with the new profile file:
+
+  $ make clean
+  $ ../libvpx/configure --use-profile=perf.profdata
+  $ make
+
+  Note: Always use the llvm-profdata from the toolchain that is used for
+  compiling the PGO-enabled binary.
+
+  To observe the improvements from a PGO-enabled build, enable and compare the
+  list of failed optimizations by using the -Rpass-missed compiler flag. For
+  example, to list the failed loop vectorizations:
+
+  $ ../libvpx/configure --use-profile=perf.profdata \
+    --extra-cflags=-Rpass-missed=loop-vectorize
+
+  For guidance on utilizing PGO files to identify potential optimization
+  opportunities, see: tools/README.pgo.md
+
 SUPPORT
   This library is an open source project supported by its community. Please
   email webm-discuss@webmproject.org for help.
diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk
index ba24f541b1..533f43c1c2 100644
--- a/media/libvpx/libvpx/build/make/Android.mk
+++ b/media/libvpx/libvpx/build/make/Android.mk
@@ -15,13 +15,9 @@ ifdef NDK_ROOT
 # In an Android project place a libvpx checkout in the jni directory.
 # Run the configure script from the jni directory.  Base libvpx
 # encoder/decoder configuration will look similar to:
-# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
+# ./libvpx/configure --target=arm64-android-gcc --disable-examples \
 #                    --enable-external-build
 #
-# When targeting Android, realtime-only is enabled by default.  This can
-# be overridden by adding the command line flag:
-#  --disable-realtime-only
-#
 # This will create .mk files that contain variables that contain the
 # source files to compile.
 #
@@ -38,11 +34,14 @@ ifdef NDK_ROOT
 # but the resulting library *must* be run on devices supporting all of the
 # enabled extensions. They can be disabled individually with
 #   --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512}
-#   --disable-neon[-asm]
+#   --disable-neon{, -asm, -neon-dotprod, -neon-i8mm}
+#   --disable-sve
 #   --disable-{dspr2, msa}
 
 #
-# Running ndk-build will build libvpx and include it in your project.
+# Running ndk-build will build libvpx and include it in your project. Set
+# APP_ABI to match the --target passed to configure:
+# https://developer.android.com/ndk/guides/application_mk#app_abi.
 #
 
 CONFIG_DIR := $(LOCAL_PATH)/
diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile
index 199ed78058..658b37617b 100644
--- a/media/libvpx/libvpx/build/make/Makefile
+++ b/media/libvpx/libvpx/build/make/Makefile
@@ -150,6 +150,8 @@ $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
 $(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
 $(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
 $(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+sve2
+$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+sve2
 
 # POWER
 $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh
index 869793a296..009bf7db5c 100644
--- a/media/libvpx/libvpx/build/make/configure.sh
+++ b/media/libvpx/libvpx/build/make/configure.sh
@@ -74,6 +74,8 @@ Build options:
   --cpu=CPU                   optimize for a specific cpu rather than a family
   --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
   --extra-cxxflags=ECXXFLAGS  add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
+  --use-profile=PROFILE_FILE
+                              Use PROFILE_FILE for PGO
   ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
   ${toggle_werror}            treat warnings as errors, if possible
                               (not available with all compilers)
@@ -81,6 +83,7 @@ Build options:
   ${toggle_pic}               turn on/off Position Independent Code
   ${toggle_ccache}            turn on/off compiler cache
   ${toggle_debug}             enable/disable debug mode
+  ${toggle_profile}           enable/disable profiling
   ${toggle_gprof}             enable/disable gprof profiling instrumentation
   ${toggle_gcov}              enable/disable gcov coverage instrumentation
   ${toggle_thumb}             enable/disable building arm assembly in thumb mode
@@ -429,6 +432,26 @@ check_gcc_machine_options() {
   fi
 }
 
+check_neon_sve_bridge_compiles() {
+  if enabled sve; then
+    check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+EOF
+    compile_result=$?
+    if [ ${compile_result} -ne 0 ]; then
+      log_echo "  disabling sve: arm_neon_sve_bridge.h not supported by compiler"
+      log_echo "  disabling sve2: arm_neon_sve_bridge.h not supported by compiler"
+      disable_feature sve
+      disable_feature sve2
+      RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 "
+    fi
+  fi
+}
+
 check_gcc_avx512_compiles() {
   if disabled gcc; then
     return
@@ -611,6 +634,9 @@ process_common_cmdline() {
       --extra-cxxflags=*)
         extra_cxxflags="${optval}"
         ;;
+      --use-profile=*)
+        pgo_file=${optval}
+        ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
         if is_in ${option} ${ARCH_EXT_LIST}; then
@@ -951,7 +977,7 @@ EOF
       add_cflags  "-mmacosx-version-min=10.15"
       add_ldflags "-mmacosx-version-min=10.15"
       ;;
-    *-darwin2[0-2]-*)
+    *-darwin2[0-3]-*)
       add_cflags  "-arch ${toolchain%%-*}"
       add_ldflags "-arch ${toolchain%%-*}"
       ;;
@@ -980,36 +1006,18 @@ EOF
   case ${toolchain} in
     arm*)
       soft_enable runtime_cpu_detect
-      # Arm ISA extensions are treated as supersets.
-      case ${tgt_isa} in
-        arm64|armv8)
-          for ext in ${ARCH_EXT_LIST_AARCH64}; do
-            # Disable higher order extensions to simplify dependencies.
-            if [ "$disable_exts" = "yes" ]; then
-              if ! disabled $ext; then
-                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
-                disable_feature $ext
-              fi
-            elif disabled $ext; then
-              disable_exts="yes"
-            else
-              soft_enable $ext
-            fi
-          done
-          ;;
-        armv7|armv7s)
-          soft_enable neon
-          # Only enable neon_asm when neon is also enabled.
-          enabled neon && soft_enable neon_asm
-          # If someone tries to force it through, die.
-          if disabled neon && enabled neon_asm; then
-            die "Disabling neon while keeping neon-asm is not supported"
-          fi
-          ;;
-      esac
 
-      asm_conversion_cmd="cat"
+      if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+        soft_enable neon
+        # Only enable neon_asm when neon is also enabled.
+        enabled neon && soft_enable neon_asm
+        # If someone tries to force it through, die.
+        if disabled neon && enabled neon_asm; then
+          die "Disabling neon while keeping neon-asm is not supported"
+        fi
+      fi
 
+      asm_conversion_cmd="cat"
       case ${tgt_cc} in
         gcc)
           link_with_cc=gcc
@@ -1228,6 +1236,38 @@ EOF
           fi
           ;;
       esac
+
+      # AArch64 ISA extensions are treated as supersets.
+      if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+        aarch64_arch_flag_neon="arch=armv8-a"
+        aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod"
+        aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm"
+        aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve"
+        aarch64_arch_flag_sve2="arch=armv9-a+sve2"
+        for ext in ${ARCH_EXT_LIST_AARCH64}; do
+          if [ "$disable_exts" = "yes" ]; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+            soft_disable $ext
+          else
+            # Check the compiler supports the -march flag for the extension.
+            # This needs to happen after toolchain/OS inspection so we handle
+            # $CROSS etc correctly when checking for flags, else these will
+            # always fail.
+            flag="$(eval echo \$"aarch64_arch_flag_${ext}")"
+            check_gcc_machine_option "${flag}" "${ext}"
+            if ! enabled $ext; then
+              # Disable higher order extensions to simplify dependencies.
+              disable_exts="yes"
+              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+              soft_disable $ext
+            fi
+          fi
+        done
+        if enabled sve; then
+          check_neon_sve_bridge_compiles
+        fi
+      fi
+
       ;;
     mips*)
       link_with_cc=gcc
@@ -1484,6 +1524,14 @@ EOF
       ;;
   esac
 
+  # Enable PGO
+  if [ -n "${pgo_file}" ]; then
+   check_add_cflags -fprofile-use=${pgo_file} || \
+     die "-fprofile-use is not supported by compiler"
+   check_add_ldflags -fprofile-use=${pgo_file} || \
+     die "-fprofile-use is not supported by linker"
+  fi
+
   # Try to enable CPU specific tuning
   if [ -n "${tune_cpu}" ]; then
     if [ -n "${tune_cflags}" ]; then
@@ -1504,6 +1552,9 @@ EOF
   else
     check_add_cflags -DNDEBUG
   fi
+  enabled profile &&
+    check_add_cflags -fprofile-generate &&
+    check_add_ldflags -fprofile-generate
 
   enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
   enabled gcov &&
diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl
index 0b9e16738e..025238d678 100755
--- a/media/libvpx/libvpx/build/make/rtcd.pl
+++ b/media/libvpx/libvpx/build/make/rtcd.pl
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
   @ALL_ARCHS = filter(qw/neon_asm neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
+  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/);
   @REQUIRES = filter(qw/neon/);
   &require(@REQUIRES);
   arm;
diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure
index b212e0709d..97e78996e8 100755
--- a/media/libvpx/libvpx/configure
+++ b/media/libvpx/libvpx/configure
@@ -260,6 +260,7 @@ ARCH_EXT_LIST_AARCH64="
     neon_dotprod
     neon_i8mm
     sve
+    sve2
 "
 
 ARCH_EXT_LIST_X86="
@@ -376,6 +377,7 @@ CMDLINE_SELECT="
     install_libs
     install_srcs
     debug
+    profile
     gprof
     gcov
     pic
@@ -659,6 +661,7 @@ process_toolchain() {
         check_add_cflags -Wmissing-declarations
         check_add_cflags -Wmissing-prototypes
         check_add_cflags -Wshadow
+        check_add_cflags -Wstrict-prototypes
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunreachable-code-aggressive
         check_add_cflags -Wunused
@@ -677,6 +680,10 @@ process_toolchain() {
         # would be needed to apply this only to test/*.cc.
         check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
 
+        # Do not allow implicit vector type conversions on Clang builds (this
+        # is already the default on GCC builds).
+        check_add_cflags -flax-vector-conversions=none
+
         # Quiet gcc 6 vs 7 abi warnings:
         # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
         if enabled arm; then
diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c
index 5fb63e1660..083bd2519d 100644
--- a/media/libvpx/libvpx/examples/resize_util.c
+++ b/media/libvpx/libvpx/examples/resize_util.c
@@ -20,7 +20,7 @@
 
 static const char *exec_name = NULL;
 
-static void usage() {
+static void usage(void) {
   printf("Usage:\n");
   printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
          exec_name);
diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
index 998e4fb20d..4050c093cd 100644
--- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -1156,12 +1156,13 @@ int main(int argc, const char **argv) {
 #if CONFIG_VP9_DECODER && !SIMULCAST_MODE
       vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id);
       // Don't look for mismatch on top spatial and top temporal layers as they
-      // are non reference frames.
+      // are non reference frames. Don't look at frames whose top spatial layer
+      // is dropped.
       if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) &&
+          cx_pkt->data.frame
+              .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] &&
           !(layer_id.temporal_layer_id > 0 &&
-            layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 &&
-            cx_pkt->data.frame
-                .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) {
+            layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1)) {
         test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen);
       }
 #endif
diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
index 1a0823153b..6e12d668b0 100644
--- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c
+++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
@@ -60,7 +60,7 @@
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr,
           "Usage: %s <width> <height> <infile> <outfile> "
           "<frame> <limit(optional)>\n",
diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template
index 1ee442af3e..6d05162d00 100644
--- a/media/libvpx/libvpx/libs.doxy_template
+++ b/media/libvpx/libvpx/libs.doxy_template
@@ -1223,14 +1223,6 @@ DOT_GRAPH_MAX_NODES    = 50
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-
-DOT_TRANSPARENT        = YES
-
 # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10)
diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk
index ff1c569c3b..5964386710 100644
--- a/media/libvpx/libvpx/libs.mk
+++ b/media/libvpx/libvpx/libs.mk
@@ -313,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
 # SO_VERSION_* then follow the rules in the link to detemine the new version
 # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 8
+SO_VERSION_MAJOR := 9
 SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 1
+SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
 SHARED_LIB_SUF          := .dylib
diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py
index 1c69740d2b..98ce7b1947 100644
--- a/media/libvpx/libvpx/test/android/get_files.py
+++ b/media/libvpx/libvpx/test/android/get_files.py
@@ -38,7 +38,7 @@ def get_file_sha(filename):
         buf = file.read(HASH_CHUNK)
       return sha_hash.hexdigest()
   except IOError:
-    print "Error reading " + filename
+    print("Error reading " + filename)
 
 # Downloads a file from a url, and then checks the sha against the passed
 # in sha
@@ -67,7 +67,7 @@ try:
       getopt.getopt(sys.argv[1:], \
                     "u:i:o:", ["url=", "input_csv=", "output_dir="])
 except:
-  print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+  print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
   sys.exit(2)
 
 for opt, arg in opts:
@@ -79,7 +79,7 @@ for opt, arg in opts:
     local_resource_path = os.path.join(arg)
 
 if len(sys.argv) != 7:
-  print "Expects two paths and a url!"
+  print("Expects two paths and a url!")
   exit(1)
 
 if not os.path.isdir(local_resource_path):
@@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb")
 
 # Our 'csv' file uses multiple spaces as a delimiter, python's
 # csv class only uses single character delimiters, so we convert them below
-file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \
     for line in file_list_csv), delimiter = ' ')
 
 file_shas = []
@@ -104,15 +104,16 @@ for row in file_list_reader:
 file_list_csv.close()
 
 # Download files, only if they don't already exist and have correct shas
-for filename, sha in itertools.izip(file_names, file_shas):
+for filename, sha in zip(file_names, file_shas):
+  filename = filename.lstrip('*')
   path = os.path.join(local_resource_path, filename)
   if os.path.isfile(path) \
       and get_file_sha(path) == sha:
-    print path + ' exists, skipping'
+    print(path + ' exists, skipping')
     continue
   for retry in range(0, ftp_retries):
-    print "Downloading " + path
+    print("Downloading " + path)
     if not download_and_check_sha(url, filename, sha):
-      print "Sha does not match, retrying..."
+      print("Sha does not match, retrying...")
     else:
       break
diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc
index ede9c0ba8c..7816912ff7 100644
--- a/media/libvpx/libvpx/test/avg_test.cc
+++ b/media/libvpx/libvpx/test/avg_test.cc
@@ -719,6 +719,15 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(1024, &vp9_block_error_fp_neon)));
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve),
+                      make_tuple(64, &vp9_block_error_fp_sve),
+                      make_tuple(256, &vp9_block_error_fp_sve),
+                      make_tuple(1024, &vp9_block_error_fp_sve)));
+#endif  // HAVE_SVE
+
 #if HAVE_MSA
 INSTANTIATE_TEST_SUITE_P(
     MSA, AverageTest,
diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h
index c7e8f54847..179ccdf011 100644
--- a/media/libvpx/libvpx/test/codec_factory.h
+++ b/media/libvpx/libvpx/test/codec_factory.h
@@ -164,7 +164,9 @@ const libvpx_test::VP8CodecFactory kVP8;
               &libvpx_test::kVP8)),                                         \
           __VA_ARGS__))
 #else
-#define VP8_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "")
 #endif  // CONFIG_VP8
 
 /*
@@ -259,7 +261,9 @@ const libvpx_test::VP9CodecFactory kVP9;
               &libvpx_test::kVP9)),                                         \
           __VA_ARGS__))
 #else
-#define VP9_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "")
 #endif  // CONFIG_VP9
 
 }  // namespace libvpx_test
diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc
index ffd5c41c63..11f7625137 100644
--- a/media/libvpx/libvpx/test/convolve_test.cc
+++ b/media/libvpx/libvpx/test/convolve_test.cc
@@ -1218,6 +1218,24 @@ WRAP(convolve8_neon, 12)
 WRAP(convolve8_avg_neon, 12)
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+WRAP(convolve8_horiz_sve, 8)
+WRAP(convolve8_avg_horiz_sve, 8)
+WRAP(convolve8_horiz_sve, 10)
+WRAP(convolve8_avg_horiz_sve, 10)
+WRAP(convolve8_horiz_sve, 12)
+WRAP(convolve8_avg_horiz_sve, 12)
+#endif  // HAVE_SVE
+
+#if HAVE_SVE2
+WRAP(convolve8_vert_sve2, 8)
+WRAP(convolve8_avg_vert_sve2, 8)
+WRAP(convolve8_vert_sve2, 10)
+WRAP(convolve8_avg_vert_sve2, 10)
+WRAP(convolve8_vert_sve2, 12)
+WRAP(convolve8_avg_vert_sve2, 12)
+#endif  // HAVE_SVE2
+
 WRAP(convolve_copy_c, 8)
 WRAP(convolve_avg_c, 8)
 WRAP(convolve8_horiz_c, 8)
@@ -1438,6 +1456,74 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
                          ::testing::ValuesIn(kArrayConvolve_neon_dotprod));
 #endif  // HAVE_NEON_DOTPROD
 
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve(
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8,
+    wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+    wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+    wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    10);
+const ConvolveFunctions convolve12_sve(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+    wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    12);
+
+const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve),
+                                             ALL_SIZES(convolve10_sve),
+                                             ALL_SIZES(convolve12_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sve));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SVE
+
+#if HAVE_SVE2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve2(
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8,
+    wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+    wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve2(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10,
+    wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    10);
+const ConvolveFunctions convolve12_sve2(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12,
+    wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    12);
+
+const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2),
+                                              ALL_SIZES(convolve10_sve2),
+                                              ALL_SIZES(convolve12_sve2) };
+INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sve2));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SVE2
+
 #if HAVE_NEON_I8MM
 const ConvolveFunctions convolve8_neon_i8mm(
     vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc
index 508083673a..ca3b17a5d5 100644
--- a/media/libvpx/libvpx/test/encode_api_test.cc
+++ b/media/libvpx/libvpx/test/encode_api_test.cc
@@ -8,7 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <cassert>
 #include <climits>
+#include <cstdint>
 #include <cstring>
 #include <initializer_list>
 #include <new>
@@ -44,6 +46,49 @@ bool IsVP9(vpx_codec_iface_t *iface) {
          0;
 }
 
+void *Memset16(void *dest, int val, size_t length) {
+  uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest);
+  for (size_t i = 0; i < length; i++) {
+    *dest16++ = val;
+  }
+  return dest;
+}
+
+vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt,
+                         unsigned int width, unsigned int height) {
+  assert(fmt != VPX_IMG_FMT_NV12);
+  if (bit_depth > VPX_BITS_8) {
+    fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+  }
+  vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1);
+  if (!image) return image;
+
+  const int val = 1 << (bit_depth - 1);
+  const unsigned int uv_h =
+      (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift;
+  const unsigned int uv_w =
+      (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift;
+  if (bit_depth > VPX_BITS_8) {
+    for (unsigned int i = 0; i < image->d_h; ++i) {
+      Memset16(image->planes[0] + i * image->stride[0], val, image->d_w);
+    }
+    for (unsigned int i = 0; i < uv_h; ++i) {
+      Memset16(image->planes[1] + i * image->stride[1], val, uv_w);
+      Memset16(image->planes[2] + i * image->stride[2], val, uv_w);
+    }
+  } else {
+    for (unsigned int i = 0; i < image->d_h; ++i) {
+      memset(image->planes[0] + i * image->stride[0], val, image->d_w);
+    }
+    for (unsigned int i = 0; i < uv_h; ++i) {
+      memset(image->planes[1] + i * image->stride[1], val, uv_w);
+      memset(image->planes[2] + i * image->stride[2], val, uv_w);
+    }
+  }
+
+  return image;
+}
+
 TEST(EncodeAPI, InvalidParams) {
   uint8_t buf[1] = { 0 };
   vpx_image_t img;
@@ -198,7 +243,51 @@ TEST(EncodeAPI, RandomPixelsVp8) {
   ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
 
   // Generate random frame data and encode
-  uint8_t img[1280 * 720 * 3 / 2];
+  libvpx_test::RandomVideoSource video;
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.SetImageFormat(VPX_IMG_FMT_I420);
+  video.Begin();
+  ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+                             /*flags=*/0, VPX_DL_BEST_QUALITY),
+            VPX_CODEC_OK);
+
+  // Destroy libvpx encoder
+  vpx_codec_destroy(&enc);
+}
+
+TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) {
+  // Initialize libvpx encoder
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_threads = 1;
+  cfg.g_profile = 0;
+  cfg.g_w = 1;
+  cfg.g_h = 64;
+  cfg.g_bit_depth = VPX_BITS_8;
+  cfg.g_input_bit_depth = 8;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = 1000000;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_dropframe_thresh = 0;  // Don't drop frames
+  cfg.rc_resize_allowed = 0;
+  cfg.rc_end_usage = VPX_VBR;
+  cfg.rc_target_bitrate = 10;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 58;
+  cfg.kf_mode = VPX_KF_AUTO;
+  cfg.kf_min_dist = 0;
+  cfg.kf_max_dist = 10000;
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK);
+
+  // Generate random frame data and encode
+  uint8_t img[1 * 64 * 3 / 2];
   libvpx_test::ACMRandom rng;
   for (size_t i = 0; i < sizeof(img); ++i) {
     img[i] = rng.Rand8();
@@ -207,13 +296,142 @@ TEST(EncodeAPI, RandomPixelsVp8) {
   ASSERT_EQ(
       vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img),
       &img_wrapper);
-  ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY),
+  vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF;
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK);
+
+  cfg.rc_target_bitrate = 4294967;
+  // Set the scalability mode to L1T3.
+  cfg.ts_number_layers = 3;
+  cfg.ts_periodicity = 4;
+  cfg.ts_layer_id[0] = 0;
+  cfg.ts_layer_id[1] = 2;
+  cfg.ts_layer_id[2] = 1;
+  cfg.ts_layer_id[3] = 2;
+  cfg.ts_rate_decimator[0] = 4;
+  cfg.ts_rate_decimator[1] = 2;
+  cfg.ts_rate_decimator[2] = 1;
+  // Bitrate allocation L0: 50% L1: 20% L2: 30%
+  cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] =
+      50 * cfg.rc_target_bitrate / 100;
+  cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] =
+      70 * cfg.rc_target_bitrate / 100;
+  cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] =
+      cfg.rc_target_bitrate;
+  cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+  cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2),
             VPX_CODEC_OK);
 
+  constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING =
+      VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+  // Layer 2: only reference last frame, no updates
+  // It only depends on layer 0
+  flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF;
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
   // Destroy libvpx encoder
   vpx_codec_destroy(&enc);
 }
-#endif
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP8Encoder {
+ public:
+  explicit VP8Encoder(int speed) : speed_(speed) {}
+  ~VP8Encoder();
+
+  void Configure(unsigned int threads, unsigned int width, unsigned int height,
+                 vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+  void Encode(bool key_frame);
+
+ private:
+  const int speed_;
+  bool initialized_ = false;
+  vpx_codec_enc_cfg_t cfg_;
+  vpx_codec_ctx_t enc_;
+  int frame_index_ = 0;
+  vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP8Encoder::~VP8Encoder() {
+  if (initialized_) {
+    EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+  }
+}
+
+void VP8Encoder::Configure(unsigned int threads, unsigned int width,
+                           unsigned int height, vpx_rc_mode end_usage,
+                           vpx_enc_deadline_t deadline) {
+  deadline_ = deadline;
+
+  if (!initialized_) {
+    vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+              VPX_CODEC_OK);
+    cfg_.g_threads = threads;
+    cfg_.g_w = width;
+    cfg_.g_h = height;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 1000 * 1000;  // microseconds
+    cfg_.g_pass = VPX_RC_ONE_PASS;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = end_usage;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 58;
+    ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+    initialized_ = true;
+    return;
+  }
+
+  cfg_.g_threads = threads;
+  cfg_.g_w = width;
+  cfg_.g_h = height;
+  cfg_.rc_end_usage = end_usage;
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+      << vpx_codec_error_detail(&enc_);
+}
+
+void VP8Encoder::Encode(bool key_frame) {
+  const vpx_codec_cx_pkt_t *pkt;
+  vpx_image_t *image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+  ASSERT_NE(image, nullptr);
+  const vpx_enc_frame_flags_t flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+  ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_),
+            VPX_CODEC_OK);
+  ++frame_index_;
+  vpx_codec_iter_t iter = nullptr;
+  while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+    ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+    if (key_frame) {
+      ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+    }
+  }
+  vpx_img_free(image);
+}
+
+// This is the reproducer testcase for crbug.com/324459561. However,
+// just running this test is not enough to reproduce the bug. We also
+// need to send signals to the test.
+TEST(EncodeAPI, Chromium324459561) {
+  VP8Encoder encoder(-12);
+
+  encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME);
+
+  encoder.Encode(true);
+  encoder.Encode(true);
+  encoder.Encode(true);
+
+  encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME);
+}
+#endif  // CONFIG_VP8_ENCODER
 
 // Set up 2 spatial streams with 2 temporal layers per stream, and generate
 // invalid configuration by setting the temporal layer rate allocation
@@ -499,6 +717,131 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
   }
 }
 
+TEST(EncodeAPI, ConfigResizeBiggerAfterInit) {
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    vpx_codec_enc_cfg_t cfg;
+    vpx_codec_ctx_t enc;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+    EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+
+    cfg.g_w = 1920;
+    cfg.g_h = 1;
+    EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+              IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) {
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    vpx_codec_enc_cfg_t cfg;
+    vpx_codec_ctx_t enc;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+    EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+    EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc));
+
+    cfg.g_w = 1920;
+    cfg.g_h = 1;
+    EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+              IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+    cfg.g_w = 1920;
+    cfg.g_h = 1080;
+    EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+              IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, PtsSmallerThanInitialPts) {
+  for (const auto *iface : kCodecIfaces) {
+    // Initialize libvpx encoder.
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+    ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+    // Create input image.
+    vpx_image_t *const image =
+        CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+    ASSERT_NE(image, nullptr);
+
+    // Encode frame.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_OK);
+    // pts (10) is smaller than the initial pts (12).
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+
+    // Free resources.
+    vpx_img_free(image);
+    ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, PtsOrDurationTooBig) {
+  for (const auto *iface : kCodecIfaces) {
+    // Initialize libvpx encoder.
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+    ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+    // Create input image.
+    vpx_image_t *const image =
+        CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+    ASSERT_NE(image, nullptr);
+
+    // Encode frame.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_OK);
+#if ULONG_MAX > INT64_MAX
+    // duration is too big.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2),
+              VPX_CODEC_INVALID_PARAM);
+#endif
+    // pts, when converted to ticks, is too big.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0,
+                               VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+#if ULONG_MAX > INT64_MAX
+    // duration is too big.
+    ASSERT_EQ(
+        vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY),
+        VPX_CODEC_INVALID_PARAM);
+    // pts + duration is too big.
+    ASSERT_EQ(
+        vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY),
+        VPX_CODEC_INVALID_PARAM);
+#endif
+    // pts + duration, when converted to ticks, is too big.
+#if ULONG_MAX > INT64_MAX
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0,
+                               VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+#endif
+    ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0,
+                               VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+
+    // Free resources.
+    vpx_img_free(image);
+    ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
 #if CONFIG_VP9_ENCODER
 // Frame size needed to trigger the overflow exceeds the max buffer allowed on
 // 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY
@@ -528,28 +871,16 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
 }
 #endif  // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
 
-vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) {
-  vpx_image_t *image =
-      vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1);
-  if (!image) return image;
-
-  for (unsigned int i = 0; i < image->d_h; ++i) {
-    memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
-  }
-  const unsigned int uv_h = (image->d_h + 1) / 2;
-  const unsigned int uv_w = (image->d_w + 1) / 2;
-  for (unsigned int i = 0; i < uv_h; ++i) {
-    memset(image->planes[1] + i * image->stride[1], 128, uv_w);
-    memset(image->planes[2] + i * image->stride[2], 128, uv_w);
-  }
-
-  return image;
-}
-
 // Emulates the WebCodecs VideoEncoder interface.
 class VP9Encoder {
  public:
-  explicit VP9Encoder(int speed) : speed_(speed) {}
+  explicit VP9Encoder(int speed)
+      : speed_(speed), bit_depth_(VPX_BITS_8), fmt_(VPX_IMG_FMT_I420) {}
+  // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set.
+  // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before
+  // passing the image format to vpx_img_alloc().
+  VP9Encoder(int speed, vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt)
+      : speed_(speed), bit_depth_(bit_depth), fmt_(fmt) {}
   ~VP9Encoder();
 
   void Configure(unsigned int threads, unsigned int width, unsigned int height,
@@ -558,6 +889,8 @@ class VP9Encoder {
 
  private:
   const int speed_;
+  const vpx_bit_depth_t bit_depth_;
+  const vpx_img_fmt_t fmt_;
   bool initialized_ = false;
   vpx_codec_enc_cfg_t cfg_;
   vpx_codec_ctx_t enc_;
@@ -577,12 +910,22 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
   deadline_ = deadline;
 
   if (!initialized_) {
+    ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0);
+    const bool high_bit_depth = bit_depth_ > VPX_BITS_8;
+    const bool is_420 = fmt_ == VPX_IMG_FMT_I420;
     vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
     ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
               VPX_CODEC_OK);
     cfg_.g_threads = threads;
+    // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3,
+    // all other subsampling formats are allowed. In profiles 0 and 1, only bit
+    // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are
+    // allowed.
+    cfg_.g_profile = 2 * high_bit_depth + !is_420;
     cfg_.g_w = width;
     cfg_.g_h = height;
+    cfg_.g_bit_depth = bit_depth_;
+    cfg_.g_input_bit_depth = bit_depth_;
     cfg_.g_timebase.num = 1;
     cfg_.g_timebase.den = 1000 * 1000;  // microseconds
     cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -590,7 +933,10 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
     cfg_.rc_end_usage = end_usage;
     cfg_.rc_min_quantizer = 2;
     cfg_.rc_max_quantizer = 58;
-    ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+    ASSERT_EQ(
+        vpx_codec_enc_init(&enc_, iface, &cfg_,
+                           high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0),
+        VPX_CODEC_OK);
     ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
     initialized_ = true;
     return;
@@ -606,13 +952,13 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
 
 void VP9Encoder::Encode(bool key_frame) {
   const vpx_codec_cx_pkt_t *pkt;
-  vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h);
+  vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h);
   ASSERT_NE(image, nullptr);
   const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
   ASSERT_EQ(
       vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_),
       VPX_CODEC_OK);
-  frame_index_++;
+  ++frame_index_;
   vpx_codec_iter_t iter = nullptr;
   while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
     ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
@@ -944,6 +1290,28 @@ TEST(EncodeAPI, Buganizer311294795) {
   encoder.Encode(false);
   encoder.Encode(false);
 }
+
+TEST(EncodeAPI, Buganizer317105128) {
+  VP9Encoder encoder(-9);
+  encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY);
+  encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer319964497) {
+  VP9Encoder encoder(7);
+  encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+}
+
 #endif  // CONFIG_VP9_ENCODER
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc
index eea5647a78..6306e4f2ca 100644
--- a/media/libvpx/libvpx/test/frame_size_tests.cc
+++ b/media/libvpx/libvpx/test/frame_size_tests.cc
@@ -193,7 +193,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
 // size or almost 1 gig of memory.
 // In total the allocations will exceed 2GiB which may cause a failure with
 // mingw + wine, use a smaller size in that case.
-#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
+#if defined(_WIN32) && !defined(_WIN64)
   video.SetSize(4096, 3072);
 #else
   video.SetSize(4096, 4096);
diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc
index f66f00b5c1..353c5043eb 100644
--- a/media/libvpx/libvpx/test/init_vpx_test.cc
+++ b/media/libvpx/libvpx/test/init_vpx_test.cc
@@ -57,6 +57,9 @@ void init_vpx_test() {
   if (!(caps & HAS_SVE)) {
     append_negative_gtest_filter(":SVE.*:SVE/*");
   }
+  if (!(caps & HAS_SVE2)) {
+    append_negative_gtest_filter(":SVE2.*:SVE2/*");
+  }
 #elif VPX_ARCH_ARM
   const int caps = arm_cpu_caps();
   if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc
index 20ad2229b4..f27bd7ebbc 100644
--- a/media/libvpx/libvpx/test/resize_test.cc
+++ b/media/libvpx/libvpx/test/resize_test.cc
@@ -7,8 +7,6 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdio.h>
-
 #include <climits>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -598,6 +596,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
   mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
+#if CONFIG_VP9_DECODER
   unsigned int last_w = cfg_.g_w;
   unsigned int last_h = cfg_.g_h;
   int resize_count = 0;
@@ -613,12 +612,12 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
     }
   }
 
-#if CONFIG_VP9_DECODER
   // Verify that we get 1 resize down event in this test.
   ASSERT_EQ(1, resize_count) << "Resizing should occur.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 #else
-  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+  GTEST_SKIP()
+      << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
 #endif
 }
 
@@ -669,7 +668,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   ASSERT_EQ(resize_count, 4) << "Resizing should occur twice.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 #else
-  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+  GTEST_SKIP()
+      << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
 #endif
 }
 
diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc
index d3c76a34d2..57037f1e30 100644
--- a/media/libvpx/libvpx/test/sum_squares_test.cc
+++ b/media/libvpx/libvpx/test/sum_squares_test.cc
@@ -119,6 +119,13 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_sum_squares_2d_i16_neon)));
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, SumSquaresTest,
+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+                                 &vpx_sum_squares_2d_i16_sve)));
+#endif  // HAVE_SVE
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
     SSE2, SumSquaresTest,
diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc
index b8320e9ceb..5cf6a5fb8e 100644
--- a/media/libvpx/libvpx/test/variance_test.cc
+++ b/media/libvpx/libvpx/test/variance_test.cc
@@ -29,6 +29,9 @@ namespace {
 
 typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride);
+typedef void (*GetVarianceFunc)(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
+                                uint32_t *sse, int *sum);
 typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
 
 using libvpx_test::ACMRandom;
@@ -63,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) {
  *  Our codebase calculates the "diff" value in the variance algorithm by
  *  (src - ref).
  */
-static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
-                             int l2h, int src_stride, int ref_stride,
-                             uint32_t *sse_ptr, bool use_high_bit_depth_,
-                             vpx_bit_depth_t bit_depth) {
-  int64_t se = 0;
-  uint64_t sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
+static void variance(const uint8_t *src, int src_stride, const uint8_t *ref,
+                     int ref_stride, int w, int h, bool use_high_bit_depth_,
+                     uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) {
+  int64_t se_long = 0;
+  uint64_t sse_long = 0;
+
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      int diff;
+      int diff = 0;
       if (!use_high_bit_depth_) {
         diff = src[y * src_stride + x] - ref[y * ref_stride + x];
-        se += diff;
-        sse += diff * diff;
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
                CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
-        se += diff;
-        sse += diff * diff;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
+      se_long += diff;
+      sse_long += diff * diff;
     }
   }
-  RoundHighBitDepth(bit_depth, &se, &sse);
-  *sse_ptr = static_cast<uint32_t>(sse);
+
+  RoundHighBitDepth(bit_depth, &se_long, &sse_long);
+
+  *sse = sse_long;
+  *se = se_long;
+}
+
+static void get_variance_ref(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride, int l2w,
+                             int l2h, bool use_high_bit_depth_, uint32_t *sse,
+                             int *se, vpx_bit_depth_t bit_depth) {
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  int64_t se_long = 0;
+  uint64_t sse_long = 0;
+
+  variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+           &sse_long, &se_long, bit_depth);
+
+  *sse = static_cast<uint32_t>(sse_long);
+  *se = static_cast<int>(se_long);
+}
+
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+                             int l2h, int src_stride, int ref_stride,
+                             uint32_t *sse_ptr, bool use_high_bit_depth_,
+                             vpx_bit_depth_t bit_depth) {
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  int64_t se_long = 0;
+  uint64_t sse_long = 0;
+
+  variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+           &sse_long, &se_long, bit_depth);
+
+  *sse_ptr = static_cast<uint32_t>(sse_long);
   return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+      sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h)));
 }
 
 /* The subpel reference functions differ from the codec version in one aspect:
@@ -337,6 +370,9 @@ class MainTestClass
   void OneQuarterTest();
   void SpeedTest();
 
+  // GetVariance tests
+  void RefTestGetVar();
+
   // MSE/SSE tests
   void RefTestMse();
   void RefTestSse();
@@ -493,6 +529,35 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Tests related to GetVariance.
+template <typename GetVarianceFunctionType>
+void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size(); j++) {
+      if (!use_high_bit_depth()) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse1, sse2;
+    int sum1, sum2;
+    const int stride = width();
+    ASM_REGISTER_STATE_CHECK(
+        params_.func(src_, stride, ref_, stride, &sse1, &sum1));
+    get_variance_ref(src_, stride, ref_, stride, params_.log2width,
+                     params_.log2height, use_high_bit_depth(), &sse2, &sum2,
+                     params_.bit_depth);
+    EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+    EXPECT_EQ(sum1, sum2) << "Error at test index: " << i;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
 // Tests related to MSE / SSE.
 
 template <typename FunctionType>
@@ -766,6 +831,7 @@ void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
 typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
 typedef MainTestClass<vpx_variance_fn_t> VpxMseTest;
 typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxGetVarianceTest;
 typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest;
 typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest;
 
@@ -779,6 +845,7 @@ TEST_P(VpxVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
 TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -818,6 +885,16 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_c),
                       VarianceParams(2, 2, &vpx_variance4x4_c)));
 
+typedef TestParams<GetVarianceFunc> GetVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_c),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_c),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_c),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_c),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_c)));
+
 typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams;
 INSTANTIATE_TEST_SUITE_P(
     C, VpxSubpelVarianceTest,
@@ -856,6 +933,7 @@ INSTANTIATE_TEST_SUITE_P(
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxHBDGetVarianceTest;
 typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
 typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
     VpxHBDSubpelAvgVarianceTest;
@@ -865,6 +943,7 @@ TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
 TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); }
 TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -933,6 +1012,15 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
 
 INSTANTIATE_TEST_SUITE_P(
+    C, VpxHBDGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12),
+                      GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12),
+                      GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10),
+                      GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10),
+                      GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8),
+                      GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
@@ -1119,6 +1207,15 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(2, 2, &vpx_variance4x4_sse2)));
 
 INSTANTIATE_TEST_SUITE_P(
+    SSE2, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
@@ -1198,6 +1295,16 @@ INSTANTIATE_TEST_SUITE_P(
         VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
 
 INSTANTIATE_TEST_SUITE_P(
+    SSE2, VpxHBDGetVarianceTest,
+    ::testing::Values(
+        GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12),
+        GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12),
+        GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10),
+        GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10),
+        GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8),
+        GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
@@ -1475,6 +1582,15 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_neon),
                       VarianceParams(2, 2, &vpx_variance4x4_neon)));
 
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon)));
+
 #if HAVE_NEON_DOTPROD
 INSTANTIATE_TEST_SUITE_P(
     NEON_DOTPROD, VpxSseTest,
@@ -1502,6 +1618,15 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
                       VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
                       VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod)));
 #endif  // HAVE_NEON_DOTPROD
 
 INSTANTIATE_TEST_SUITE_P(
@@ -1555,9 +1680,6 @@ INSTANTIATE_TEST_SUITE_P(
         MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
         MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
 
-// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can
-// be used again.
-#if 0
 #if HAVE_NEON_DOTPROD
 INSTANTIATE_TEST_SUITE_P(
     NEON_DOTPROD, VpxHBDMseTest,
@@ -1567,7 +1689,19 @@ INSTANTIATE_TEST_SUITE_P(
         MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
         MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
 #endif  // HAVE_NEON_DOTPROD
-#endif  // 0
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, VpxHBDMseTest,
+    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12),
+                      MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12),
+                      MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12),
+                      MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12),
+                      MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10),
+                      MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10),
+                      MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10),
+                      MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10)));
+#endif  // HAVE_SVE
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, VpxHBDVarianceTest,
@@ -1613,6 +1747,28 @@ INSTANTIATE_TEST_SUITE_P(
         VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
 
 INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDGetVarianceTest,
+    ::testing::Values(
+        GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12),
+        GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12),
+        GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10),
+        GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10),
+        GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8),
+        GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8)));
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, VpxHBDGetVarianceTest,
+    ::testing::Values(
+        GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12),
+        GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12),
+        GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10),
+        GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10),
+        GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8),
+        GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8)));
+#endif  // HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(
     NEON, VpxHBDSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
@@ -1815,6 +1971,53 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    SVE, VpxHBDVarianceTest,
+    ::testing::Values(
+        VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12),
+        VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12),
+        VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12),
+        VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12),
+        VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12),
+        VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12),
+        VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12),
+        VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12),
+        VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12),
+        VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12),
+        VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12),
+        VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12),
+        VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12),
+        VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10),
+        VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10),
+        VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10),
+        VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10),
+        VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10),
+        VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10),
+        VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10),
+        VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10),
+        VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10),
+        VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10),
+        VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10),
+        VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10),
+        VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10),
+        VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8),
+        VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8),
+        VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8),
+        VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8),
+        VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8),
+        VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8),
+        VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8),
+        VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8),
+        VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8),
+        VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8),
+        VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8),
+        VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8),
+        VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SVE
+
 #if HAVE_MSA
 INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
                          ::testing::Values(vpx_get_mb_ss_msa));
@@ -1846,6 +2049,15 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(2, 2, &vpx_variance4x4_msa)));
 
 INSTANTIATE_TEST_SUITE_P(
+    MSA, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_msa)));
+
+INSTANTIATE_TEST_SUITE_P(
     MSA, VpxSubpelVarianceTest,
     ::testing::Values(
         SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
@@ -1908,6 +2120,15 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(3, 2, &vpx_variance8x4_vsx),
                       VarianceParams(2, 3, &vpx_variance4x8_vsx),
                       VarianceParams(2, 2, &vpx_variance4x4_vsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    VSX, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_vsx)));
 #endif  // HAVE_VSX
 
 #if HAVE_MMI
diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h
index 2194126f1f..2c035910db 100644
--- a/media/libvpx/libvpx/test/video_source.h
+++ b/media/libvpx/libvpx/test/video_source.h
@@ -236,7 +236,6 @@ class RandomVideoSource : public DummyVideoSource {
   RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
       : rnd_(seed), seed_(seed) {}
 
- protected:
   // Reset the RNG to get a matching stream for the second pass
   void Begin() override {
     frame_ = 0;
@@ -244,6 +243,7 @@ class RandomVideoSource : public DummyVideoSource {
     FillFrame();
   }
 
+ protected:
   // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
   // than holding previous frames to encourage keyframes to be thrown.
   void FillFrame() override {
diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc
index aee27af66e..d47ed298fe 100644
--- a/media/libvpx/libvpx/test/vp8_datarate_test.cc
+++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc
@@ -14,7 +14,7 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
 
 namespace {
 
@@ -260,6 +260,27 @@ class DatarateTestLarge
         << " The datarate for the file missed the target!";
   }
 
+  virtual void MultiThreadsPSNRTest() {
+    denoiser_on_ = 0;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_threads = 4;
+    init_flags_ = VPX_CODEC_USE_PSNR;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, 30);
+    cfg_.rc_target_bitrate = 1000;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0)
+        << " The datarate for the file missed the target!";
+  }
+
   vpx_codec_pts_t last_pts_;
   int64_t bits_in_buffer_model_;
   double timebase_;
@@ -324,6 +345,8 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
   DropFramesMultiThreadsTest();
 }
 
+TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); }
+
 TEST_P(DatarateTestRealTime, RegionOfInterest) {
   denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
index 50478f7635..d87fef5a46 100644
--- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -149,9 +149,16 @@ class Vp8RcInterfaceTest
       return;
     }
     int qp;
+    libvpx::UVDeltaQP uv_delta_qp;
     encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
     if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
       ASSERT_EQ(rc_api_->GetQP(), qp);
+      uv_delta_qp = rc_api_->GetUVDeltaQP();
+      // delta_qp for UV channel is only set for screen.
+      if (!rc_cfg_.is_screen) {
+        ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0);
+        ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0);
+      }
     } else {
       num_drops_++;
     }
diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc
index 0645341ac1..c5ddcd58ab 100644
--- a/media/libvpx/libvpx/test/vp9_block_error_test.cc
+++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc
@@ -215,4 +215,13 @@ const BlockErrorParam neon_block_error_tests[] = {
 INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
                          ::testing::ValuesIn(neon_block_error_tests));
 #endif  // HAVE_NEON
+
+#if HAVE_SVE
+const BlockErrorParam sve_block_error_tests[] = { make_tuple(
+    &BlockError8BitWrapper<vp9_block_error_sve>,
+    &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) };
+
+INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest,
+                         ::testing::ValuesIn(sve_block_error_tests));
+#endif  // HAVE_SVE
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
index 33fa05c65c..5c23a5b0d5 100644
--- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
@@ -10,115 +10,78 @@
 
 #include <cstdint>
 #include <new>
+#include <memory>
+
+#include "./vpx_config.h"
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/util.h"
 #include "test/yuv_video_source.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#if CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
 #include "vp9/simple_encode.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
 #include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_image.h"
 #include "vpx/vpx_tpl.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 namespace {
 
-constexpr int kModelMagicNumber = 51396;
-constexpr uintptr_t PrivMagicNumber = 5566;
-constexpr int kFrameNum = 5;
-constexpr int kFrameNumGOP = 30;
-constexpr int kFrameNumGOPShort = 4;
-constexpr int kLosslessCodingIndex = 2;
-constexpr int kFixedGOPSize = 9;
-// The range check in vp9_cx_iface.c shows that the max
-// lag in buffer is MAX_LAG_BUFFERS (25):
-// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
-constexpr int kMaxLagInFrames = 25;
-constexpr int kDefaultMinGfInterval = 4;
-constexpr int kDefaultMaxGfInterval = 16;
-// The active gf interval might change for each GOP
-// See function "get_active_gf_inverval_range".
-// The numbers below are from manual inspection.
-constexpr int kReadMinGfInterval = 5;
-constexpr int kReadMaxGfInterval = 13;
-const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
-const double kPsnrThreshold = 30.4;
-
-struct ToyRateCtrl {
-  int magic_number;
-  int coding_index;
-
-  int gop_global_index;
-  int frames_since_key;
-  int show_index;
+constexpr int kFrameNum = 10;
+constexpr int kFixedGOPSize = 10;
+constexpr int kKeyframeQp = 10;
+constexpr int kLeafQp = 40;
+constexpr int kArfQp = 15;
+
+// Simple external rate controller for testing.
+class RateControllerForTest {
+ public:
+  RateControllerForTest() : current_gop_(-1) {}
+  ~RateControllerForTest() {}
+
+  void StartNextGop() { ++current_gop_; }
+
+  vpx_rc_gop_decision_t GetCurrentGop() const {
+    vpx_rc_gop_decision_t gop_decision;
+    gop_decision.use_key_frame = current_gop_ == 0 ? 1 : 0;
+    gop_decision.use_alt_ref = 1;
+    gop_decision.gop_coding_frames = kFixedGOPSize;
+    return gop_decision;
+  }
+
+  int CalculateFrameDecision(int frame_index) {
+    EXPECT_LE(frame_index, kFixedGOPSize);
+    if (current_gop_ == 0 && frame_index == 0) {
+      // Key frame, first frame in the first GOP.
+      return kKeyframeQp;
+    } else if (frame_index == 1) {
+      // ARF, we always use ARF for this test.
+      return kArfQp;
+    } else {
+      return kLeafQp;
+    }
+  }
+  int current_gop_;
 };
 
-vpx_rc_status_t rc_create_model(void *priv,
-                                const vpx_rc_config_t *ratectrl_config,
-                                vpx_rc_model_t *rate_ctrl_model_ptr) {
-  ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
-  if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
-  toy_rate_ctrl->magic_number = kModelMagicNumber;
-  toy_rate_ctrl->coding_index = -1;
-  *rate_ctrl_model_ptr = toy_rate_ctrl;
-  EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
-  EXPECT_EQ(ratectrl_config->frame_width, 352);
-  EXPECT_EQ(ratectrl_config->frame_height, 288);
-  EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum);
-  EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000);
-  EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
-  EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop(void *priv,
-                                    const vpx_rc_config_t *ratectrl_config,
-                                    vpx_rc_model_t *rate_ctrl_model_ptr) {
-  ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
-  if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
-  toy_rate_ctrl->magic_number = kModelMagicNumber;
-  toy_rate_ctrl->gop_global_index = 0;
-  toy_rate_ctrl->frames_since_key = 0;
-  toy_rate_ctrl->show_index = 0;
-  toy_rate_ctrl->coding_index = 0;
-  *rate_ctrl_model_ptr = toy_rate_ctrl;
-  EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
-  EXPECT_EQ(ratectrl_config->frame_width, 640);
-  EXPECT_EQ(ratectrl_config->frame_height, 360);
-  EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
-  EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
-  EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
-  EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop_short(
-    void *priv, const vpx_rc_config_t *ratectrl_config,
+// Callbacks used in this test.
+vpx_rc_status_t rc_test_create_model(
+    void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/,
     vpx_rc_model_t *rate_ctrl_model_ptr) {
-  ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
-  if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
-  toy_rate_ctrl->magic_number = kModelMagicNumber;
-  toy_rate_ctrl->gop_global_index = 0;
-  toy_rate_ctrl->frames_since_key = 0;
-  toy_rate_ctrl->show_index = 0;
-  toy_rate_ctrl->coding_index = 0;
-  *rate_ctrl_model_ptr = toy_rate_ctrl;
-  EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
-  EXPECT_EQ(ratectrl_config->frame_width, 352);
-  EXPECT_EQ(ratectrl_config->frame_height, 288);
-  EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
-  EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
-  EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
-  EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+  std::unique_ptr<RateControllerForTest> test_controller(
+      new RateControllerForTest());
+  *rate_ctrl_model_ptr = test_controller.release();
   return VPX_RC_OK;
 }
 
-vpx_rc_status_t rc_send_firstpass_stats(
-    vpx_rc_model_t rate_ctrl_model,
+vpx_rc_status_t rc_test_send_firstpass_stats(
+    vpx_rc_model_t /*rate_ctrl_model*/,
     const vpx_rc_firstpass_stats_t *first_pass_stats) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
   EXPECT_EQ(first_pass_stats->num_frames, kFrameNum);
   for (int i = 0; i < first_pass_stats->num_frames; ++i) {
     EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
@@ -126,37 +89,8 @@ vpx_rc_status_t rc_send_firstpass_stats(
   return VPX_RC_OK;
 }
 
-vpx_rc_status_t rc_send_firstpass_stats_gop(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_firstpass_stats_t *first_pass_stats) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
-  for (int i = 0; i < first_pass_stats->num_frames; ++i) {
-    EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
-  }
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_firstpass_stats_gop_short(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_firstpass_stats_t *first_pass_stats) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
-  for (int i = 0; i < first_pass_stats->num_frames; ++i) {
-    EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
-  }
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
-                                      const VpxTplGopStats *tpl_gop_stats) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+vpx_rc_status_t rc_test_send_tpl_gop_stats(
+    vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) {
   EXPECT_GT(tpl_gop_stats->size, 0);
 
   for (int i = 0; i < tpl_gop_stats->size; ++i) {
@@ -165,522 +99,38 @@ vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
   return VPX_RC_OK;
 }
 
-vpx_rc_status_t rc_get_encodeframe_decision(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info,
+vpx_rc_status_t rc_test_get_encodeframe_decision(
+    vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
     vpx_rc_encodeframe_decision_t *frame_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  toy_rate_ctrl->coding_index += 1;
-
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
-  EXPECT_LT(encode_frame_info->show_index, kFrameNum);
-  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
-  if (encode_frame_info->coding_index == 0) {
-    EXPECT_EQ(encode_frame_info->show_index, 0);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-  } else if (encode_frame_info->coding_index == 1) {
-    EXPECT_EQ(encode_frame_info->show_index, 4);
-    EXPECT_EQ(encode_frame_info->gop_index, 1);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              1);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
-              0);  // kRefFrameTypeLast
-  } else if (encode_frame_info->coding_index >= 2 &&
-             encode_frame_info->coding_index < 5) {
-    // In the first group of pictures, coding_index and gop_index are equal.
-    EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-  } else if (encode_frame_info->coding_index == 5) {
-    EXPECT_EQ(encode_frame_info->show_index, 4);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              1);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              1);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              1);  // kRefFrameTypeFuture
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
-              4);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2],
-              1);  // kRefFrameTypeFuture
-  }
-  if (encode_frame_info->coding_index == kLosslessCodingIndex) {
-    // We should get sse == 0 at rc_update_encodeframe_result()
-    frame_decision->q_index = 0;
-  } else {
-    frame_decision->q_index = 100;
-  }
-  frame_decision->max_frame_size = 0;
+  RateControllerForTest *test_controller =
+      static_cast<RateControllerForTest *>(rate_ctrl_model);
+  frame_decision->q_index =
+      test_controller->CalculateFrameDecision(frame_gop_index);
   return VPX_RC_OK;
 }
 
-vpx_rc_status_t rc_get_encodeframe_decision_gop(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info,
-    vpx_rc_encodeframe_decision_t *frame_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
-  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
-  if (encode_frame_info->coding_index == 0) {
-    EXPECT_EQ(encode_frame_info->show_index, 0);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-  } else if (encode_frame_info->coding_index == 1) {
-    EXPECT_EQ(encode_frame_info->show_index, 1);
-    EXPECT_EQ(encode_frame_info->gop_index, 1);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              1);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
-              0);  // kRefFrameTypeLast
-  } else if (encode_frame_info->coding_index == 2) {
-    EXPECT_EQ(encode_frame_info->show_index, 2);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-  } else if (encode_frame_info->coding_index == 3 ||
-             encode_frame_info->coding_index == 12 ||
-             encode_frame_info->coding_index == 21) {
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
-    EXPECT_EQ(encode_frame_info->gop_index, 1);
-  } else if (encode_frame_info->coding_index == 11 ||
-             encode_frame_info->coding_index == 20 ||
-             encode_frame_info->coding_index == 29) {
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-  } else if (encode_frame_info->coding_index >= 30) {
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-  }
-
-  // When the model recommends an invalid q, valid range [0, 255],
-  // the encoder will ignore it and use the default q selected
-  // by libvpx rate control strategy.
-  frame_decision->q_index = VPX_DEFAULT_Q;
-  frame_decision->max_frame_size = 0;
-
-  toy_rate_ctrl->coding_index += 1;
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info,
-    vpx_rc_encodeframe_decision_t *frame_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
-  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
-  if (encode_frame_info->coding_index == 0) {
-    EXPECT_EQ(encode_frame_info->show_index, 0);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 1) {
-    EXPECT_EQ(encode_frame_info->show_index, 1);
-    EXPECT_EQ(encode_frame_info->gop_index, 1);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              1);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 2) {
-    EXPECT_EQ(encode_frame_info->show_index, 2);
-    EXPECT_EQ(encode_frame_info->gop_index, 2);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 3) {
-    EXPECT_EQ(encode_frame_info->show_index, 3);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
-  }
-
-  // When the model recommends an invalid q, valid range [0, 255],
-  // the encoder will ignore it and use the default q selected
-  // by libvpx rate control strategy.
-  frame_decision->q_index = VPX_DEFAULT_Q;
-  frame_decision->max_frame_size = 0;
-
-  toy_rate_ctrl->coding_index += 1;
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info,
-    vpx_rc_encodeframe_decision_t *frame_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
-  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
-  if (encode_frame_info->coding_index == 0) {
-    EXPECT_EQ(encode_frame_info->show_index, 0);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 1) {
-    EXPECT_EQ(encode_frame_info->show_index, 3);
-    EXPECT_EQ(encode_frame_info->gop_index, 1);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              1);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 2) {
-    EXPECT_EQ(encode_frame_info->show_index, 1);
-    EXPECT_EQ(encode_frame_info->gop_index, 2);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 3) {
-    EXPECT_EQ(encode_frame_info->show_index, 2);
-    EXPECT_EQ(encode_frame_info->gop_index, 3);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 4) {
-    EXPECT_EQ(encode_frame_info->show_index, 3);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  }
-
-  // When the model recommends an invalid q, valid range [0, 255],
-  // the encoder will ignore it and use the default q selected
-  // by libvpx rate control strategy.
-  frame_decision->q_index = VPX_DEFAULT_Q;
-  frame_decision->max_frame_size = 0;
-
-  toy_rate_ctrl->coding_index += 1;
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info,
-    vpx_rc_encodeframe_decision_t *frame_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
-  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
-  if (encode_frame_info->coding_index == 0) {
-    EXPECT_EQ(encode_frame_info->show_index, 0);
-    EXPECT_EQ(encode_frame_info->gop_index, 0);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 1) {
-    EXPECT_EQ(encode_frame_info->show_index, 1);
-    EXPECT_EQ(encode_frame_info->gop_index, 1);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
-              1);  // kRefFrameTypeLast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
-              0);  // kRefFrameTypePast
-    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
-              0);  // kRefFrameTypeFuture
-    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
-              0);  // kRefFrameTypeLast
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 2) {
-    EXPECT_EQ(encode_frame_info->show_index, 2);
-    EXPECT_EQ(encode_frame_info->gop_index, 2);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  } else if (encode_frame_info->coding_index == 3) {
-    EXPECT_EQ(encode_frame_info->show_index, 3);
-    EXPECT_EQ(encode_frame_info->gop_index, 3);
-    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
-    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
-  }
-
-  // When the model recommends an invalid q, valid range [0, 255],
-  // the encoder will ignore it and use the default q selected
-  // by libvpx rate control strategy.
-  frame_decision->q_index = VPX_DEFAULT_Q;
-  frame_decision->max_frame_size = 0;
-
-  toy_rate_ctrl->coding_index += 1;
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
-                                    const vpx_rc_gop_info_t *gop_info,
-                                    vpx_rc_gop_decision_t *gop_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
-  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
-  EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
-  EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
-  EXPECT_EQ(gop_info->allow_alt_ref, 1);
-  if (gop_info->is_key_frame) {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
-    EXPECT_EQ(gop_info->frames_since_key, 0);
-    EXPECT_EQ(gop_info->gop_global_index, 0);
-    toy_rate_ctrl->gop_global_index = 0;
-    toy_rate_ctrl->frames_since_key = 0;
-  } else {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
-  }
-  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
-  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
-  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
-  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
-  gop_decision->gop_coding_frames =
-      VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
-  gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
-  toy_rate_ctrl->frames_since_key +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  toy_rate_ctrl->show_index +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  ++toy_rate_ctrl->gop_global_index;
-  return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 3 coding frames, no alt ref.
-// The second GOP has 1 coding frame, no alt ref.
-vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
-                                          const vpx_rc_gop_info_t *gop_info,
-                                          vpx_rc_gop_decision_t *gop_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
-  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
-  EXPECT_EQ(gop_info->allow_alt_ref, 1);
-  if (gop_info->is_key_frame) {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
-    EXPECT_EQ(gop_info->frames_since_key, 0);
-    EXPECT_EQ(gop_info->gop_global_index, 0);
-    toy_rate_ctrl->gop_global_index = 0;
-    toy_rate_ctrl->frames_since_key = 0;
-  } else {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
-  }
-  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
-  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
-  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
-  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
-  gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
-  gop_decision->use_alt_ref = 0;
-  toy_rate_ctrl->frames_since_key +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  toy_rate_ctrl->show_index +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  ++toy_rate_ctrl->gop_global_index;
-  return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 4 coding frames. Use alt ref.
-// The second GOP only contains the overlay frame of the first GOP's alt ref
-// frame.
-vpx_rc_status_t rc_get_gop_decision_short_overlay(
-    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
-    vpx_rc_gop_decision_t *gop_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
-  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
-  EXPECT_EQ(gop_info->allow_alt_ref, 1);
-  if (gop_info->is_key_frame) {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
-    EXPECT_EQ(gop_info->frames_since_key, 0);
-    EXPECT_EQ(gop_info->gop_global_index, 0);
-    toy_rate_ctrl->gop_global_index = 0;
-    toy_rate_ctrl->frames_since_key = 0;
-  } else {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
-  }
-  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
-  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
-  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
-  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
-  gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
-  gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
-  toy_rate_ctrl->frames_since_key +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  toy_rate_ctrl->show_index +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  ++toy_rate_ctrl->gop_global_index;
-  return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 1 GOP.
-// The GOP has 4 coding frames. Do not use alt ref.
-vpx_rc_status_t rc_get_gop_decision_short_no_arf(
-    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
-    vpx_rc_gop_decision_t *gop_decision) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
-  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
-  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
-  EXPECT_EQ(gop_info->allow_alt_ref, 1);
-  if (gop_info->is_key_frame) {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
-    EXPECT_EQ(gop_info->frames_since_key, 0);
-    EXPECT_EQ(gop_info->gop_global_index, 0);
-    toy_rate_ctrl->gop_global_index = 0;
-    toy_rate_ctrl->frames_since_key = 0;
-  } else {
-    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
-  }
-  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
-  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
-  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
-  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
-  gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
-  gop_decision->use_alt_ref = 0;
-  toy_rate_ctrl->frames_since_key +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  toy_rate_ctrl->show_index +=
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
-  ++toy_rate_ctrl->gop_global_index;
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_result_t *encode_frame_result) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
-  const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
-  EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
-  if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
-    EXPECT_EQ(encode_frame_result->sse, 0);
-  }
-  if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
-    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
-  } else {
-    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
-  }
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_result_t *encode_frame_result) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
-  const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
-  EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop_short(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_result_t *encode_frame_result) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
-  const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
-  EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
-  return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_default_frame_rdmult(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
-  const ToyRateCtrl *toy_rate_ctrl =
-      static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
-  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
-  *rdmult = VPX_DEFAULT_RDMULT;
+vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+                                         vpx_rc_gop_decision_t *gop_decision) {
+  RateControllerForTest *test_controller =
+      static_cast<RateControllerForTest *>(rate_ctrl_model);
+  test_controller->StartNextGop();
+  *gop_decision = test_controller->GetCurrentGop();
   return VPX_RC_OK;
 }
 
 vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
-  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
-  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-  delete toy_rate_ctrl;
+  RateControllerForTest *test_controller =
+      static_cast<RateControllerForTest *>(rate_ctrl_model);
+  delete test_controller;
   return VPX_RC_OK;
 }
 
 class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
                         public ::testing::Test {
  protected:
-  ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {}
+  ExtRateCtrlTest()
+      : EncoderTest(&::libvpx_test::kVP9), frame_number_(0),
+        current_frame_qp_(0) {}
 
   ~ExtRateCtrlTest() override = default;
 
@@ -693,287 +143,62 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
                           ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       vpx_rc_funcs_t rc_funcs = {};
-      rc_funcs.rc_type = VPX_RC_QP;
-      rc_funcs.create_model = rc_create_model;
-      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
-      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
-      rc_funcs.update_encodeframe_result = rc_update_encodeframe_result;
-      rc_funcs.delete_model = rc_delete_model;
-      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
-      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
-    }
-  }
-};
-
-TEST_F(ExtRateCtrlTest, EncodeTest) {
-  cfg_.rc_target_bitrate = 24000;
-
-  std::unique_ptr<libvpx_test::VideoSource> video;
-  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
-      "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
-      kFrameNum));
-
-  ASSERT_NE(video, nullptr);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
-                           public ::libvpx_test::CodecTestWithParam<int> {
- protected:
-  ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
-
-  ~ExtRateCtrlTestGOP() override = default;
-
-  void SetUp() override {
-    InitializeConfig();
-    SetMode(::libvpx_test::kTwoPassGood);
-  }
-
-  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                          ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
-      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
-      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-
-      vpx_rc_funcs_t rc_funcs = {};
-      rc_funcs.rc_type = VPX_RC_GOP_QP;
-      rc_funcs.create_model = rc_create_model_gop;
-      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
-      rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats;
-      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
-      rc_funcs.get_gop_decision = rc_get_gop_decision;
-      rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
-      rc_funcs.delete_model = rc_delete_model;
-      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
-      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
-    }
-  }
-};
-
-TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
-  cfg_.rc_target_bitrate = 4000;
-  cfg_.g_lag_in_frames = kMaxLagInFrames;
-  cfg_.rc_end_usage = VPX_VBR;
-
-  std::unique_ptr<libvpx_test::VideoSource> video;
-  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
-      "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
-      kFrameNumGOP));
-
-  ASSERT_NE(video, nullptr);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
-                                public ::libvpx_test::CodecTestWithParam<int> {
- protected:
-  ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
-
-  ~ExtRateCtrlTestGOPShort() override = default;
-
-  void SetUp() override {
-    InitializeConfig();
-    SetMode(::libvpx_test::kTwoPassGood);
-  }
-
-  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                          ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
-      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
-      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
-      vpx_rc_funcs_t rc_funcs = {};
-      rc_funcs.rc_type = VPX_RC_GOP_QP;
-      rc_funcs.create_model = rc_create_model_gop_short;
-      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
-      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
-      rc_funcs.get_gop_decision = rc_get_gop_decision_short;
-      rc_funcs.update_encodeframe_result =
-          rc_update_encodeframe_result_gop_short;
-      rc_funcs.delete_model = rc_delete_model;
-      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
-      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
-    }
-  }
-};
-
-TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
-  cfg_.rc_target_bitrate = 500;
-  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
-  cfg_.rc_end_usage = VPX_VBR;
-
-  std::unique_ptr<libvpx_test::VideoSource> video;
-  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
-      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
-  ASSERT_NE(video, nullptr);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortOverlay
-    : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWithParam<int> {
- protected:
-  ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
-
-  ~ExtRateCtrlTestGOPShortOverlay() override = default;
-
-  void SetUp() override {
-    InitializeConfig();
-    SetMode(::libvpx_test::kTwoPassGood);
-  }
-
-  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                          ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
-      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
-      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
-      vpx_rc_funcs_t rc_funcs = {};
       rc_funcs.rc_type = VPX_RC_GOP_QP;
-      rc_funcs.create_model = rc_create_model_gop_short;
-      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
-      rc_funcs.get_encodeframe_decision =
-          rc_get_encodeframe_decision_gop_short_overlay;
-      rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
-      rc_funcs.update_encodeframe_result =
-          rc_update_encodeframe_result_gop_short;
+      rc_funcs.create_model = rc_test_create_model;
+      rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats;
+      rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats;
+      rc_funcs.get_gop_decision = rc_test_get_gop_decision;
+      rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision;
       rc_funcs.delete_model = rc_delete_model;
-      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
       encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
     }
   }
-};
-
-TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
-  cfg_.rc_target_bitrate = 500;
-  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
-  cfg_.rc_end_usage = VPX_VBR;
-
-  std::unique_ptr<libvpx_test::VideoSource> video;
-  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
-      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
-  ASSERT_NE(video, nullptr);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortNoARF
-    : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWithParam<int> {
- protected:
-  ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
-
-  ~ExtRateCtrlTestGOPShortNoARF() override = default;
 
-  void SetUp() override {
-    InitializeConfig();
-    SetMode(::libvpx_test::kTwoPassGood);
+#if CONFIG_VP9_DECODER
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const ::libvpx_test::VideoSource & /*video*/,
+                          ::libvpx_test::Decoder *decoder) override {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_);
+    return VPX_CODEC_OK == res_dec;
   }
 
-  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                          ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
-      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
-      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
-      vpx_rc_funcs_t rc_funcs = {};
-      rc_funcs.rc_type = VPX_RC_GOP_QP;
-      rc_funcs.create_model = rc_create_model_gop_short;
-      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
-      rc_funcs.get_encodeframe_decision =
-          rc_get_encodeframe_decision_gop_short_no_arf;
-      rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
-      rc_funcs.update_encodeframe_result =
-          rc_update_encodeframe_result_gop_short;
-      rc_funcs.delete_model = rc_delete_model;
-      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
-      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    if (frame_number_ == 0) {
+      // This must be a key frame
+      EXPECT_TRUE((pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0);
+      EXPECT_EQ(current_frame_qp_, kKeyframeQp);
+      ++frame_number_;
+      return;
     }
-  }
-};
-
-TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
-  cfg_.rc_target_bitrate = 500;
-  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
-  cfg_.rc_end_usage = VPX_VBR;
-
-  std::unique_ptr<libvpx_test::VideoSource> video;
-  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
-      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
-  ASSERT_NE(video, nullptr);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
-                              public ::testing::Test {
- protected:
-  ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
-
-  ~ExtRateCtrlTestRdmult() override = default;
-
-  void SetUp() override {
-    InitializeConfig();
-    SetMode(::libvpx_test::kTwoPassGood);
-  }
-
-  void BeginPassHook(unsigned int) override {
-    psnr_ = 0.0;
-    nframes_ = 0;
-  }
-
-  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
-    psnr_ += pkt->data.psnr.psnr[0];
-    nframes_++;
-  }
 
-  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                          ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
-      vpx_rc_funcs_t rc_funcs = {};
-      rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
-      rc_funcs.create_model = rc_create_model_gop_short;
-      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
-      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
-      rc_funcs.get_gop_decision = rc_get_gop_decision_short;
-      rc_funcs.update_encodeframe_result =
-          rc_update_encodeframe_result_gop_short;
-      rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
-      rc_funcs.delete_model = rc_delete_model;
-      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
-      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    if ((pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) != 0) {
+      // This is ARF
+      EXPECT_EQ(current_frame_qp_, kArfQp);
+      ++frame_number_;
+      return;
     }
-  }
 
-  double GetAveragePsnr() const {
-    if (nframes_) return psnr_ / nframes_;
-    return 0.0;
+    EXPECT_EQ(current_frame_qp_, kLeafQp);
+    ++frame_number_;
   }
+#endif  // CONFIG_VP9_DECODER
 
- private:
-  double psnr_;
-  unsigned int nframes_;
+  int frame_number_;
+  int current_frame_qp_;
 };
 
-TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
-  cfg_.rc_target_bitrate = 500;
-  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
-  cfg_.rc_end_usage = VPX_VBR;
-  init_flags_ = VPX_CODEC_USE_PSNR;
+TEST_F(ExtRateCtrlTest, EncodeTest) {
+  cfg_.rc_target_bitrate = 4000;
+  cfg_.g_lag_in_frames = 25;
 
   std::unique_ptr<libvpx_test::VideoSource> video;
   video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
-      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+      "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
+      kFrameNum));
 
   ASSERT_NE(video, nullptr);
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-
-  const double psnr = GetAveragePsnr();
-  EXPECT_GT(psnr, kPsnrThreshold);
 }
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
index f7be47542c..a6c7563348 100644
--- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -9,6 +9,7 @@
  */
 #include "vp9/ratectrl_rtc.h"
 
+#include <climits>
 #include <fstream>  // NOLINT
 #include <string>
 
@@ -19,6 +20,8 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/video_source.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vpx/vpx_codec.h"
 #include "vpx_ports/bitops.h"
 
diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc
index 049a10a617..a5a18a7e9d 100644
--- a/media/libvpx/libvpx/test/vp9_scale_test.cc
+++ b/media/libvpx/libvpx/test/vp9_scale_test.cc
@@ -48,12 +48,11 @@ class ScaleTest : public VpxScaleBase,
   }
 
   void RunTest(INTERP_FILTER filter_type) {
-    static const int kNumSizesToTest = 20;
+    static const int kNumSizesToTest = 22;
     static const int kNumScaleFactorsToTest = 4;
-    static const int kSizesToTest[] = {
-      2,  4,  6,  8,  10, 12, 14, 16, 18,  20,
-      22, 24, 26, 28, 30, 32, 34, 68, 128, 134
-    };
+    static const int kSizesToTest[] = { 1,  2,  3,  4,  6,   8,  10, 12,
+                                        14, 16, 18, 20, 22,  24, 26, 28,
+                                        30, 32, 34, 68, 128, 134 };
     static const int kScaleFactors[] = { 1, 2, 3, 4 };
     for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
       for (int h = 0; h < kNumSizesToTest; ++h) {
diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c
index 5c13781513..5af971f720 100644
--- a/media/libvpx/libvpx/tools_common.c
+++ b/media/libvpx/libvpx/tools_common.c
@@ -26,15 +26,9 @@
 
 #include "vpx/vpx_codec.h"
 
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
 #include <io.h>
 #include <fcntl.h>
-
-#ifdef __OS2__
-#define _setmode setmode
-#define _fileno fileno
-#define _O_BINARY O_BINARY
-#endif
 #endif
 
 #define LOG_ERROR(label)               \
@@ -58,7 +52,7 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
 
 FILE *set_binary_mode(FILE *stream) {
   (void)stream;
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
   _setmode(_fileno(stream), _O_BINARY);
 #endif
   return stream;
@@ -96,9 +90,9 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
     int w = vpx_img_plane_width(yuv_frame, plane);
     const int h = vpx_img_plane_height(yuv_frame, plane);
     int r;
-    // Assuming that for nv12 we read all chroma data at one time
+    // Assuming that for nv12 we read all chroma data at once
     if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
-    // Fixing NV12 chroma width it is odd
+    // Fixing NV12 chroma width if it is odd
     if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
     /* Determine the correct plane based on the image format. The for-loop
      * always counts in Y,U,V order, but this may not match the order of
@@ -229,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) {
 
 void vpx_img_write(const vpx_image_t *img, FILE *file) {
   int plane;
+  const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane) *
-                  ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    int w = vpx_img_plane_width(img, plane);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
+    // Assuming that for nv12 we write all chroma data at once
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+    // Fixing NV12 chroma width if it is odd
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
     for (y = 0; y < h; ++y) {
-      fwrite(buf, 1, w, file);
+      fwrite(buf, bytespp, w, file);
       buf += stride;
     }
   }
@@ -247,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) {
 
 int vpx_img_read(vpx_image_t *img, FILE *file) {
   int plane;
+  const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane) *
-                  ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    int w = vpx_img_plane_width(img, plane);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
+    // Assuming that for nv12 we read all chroma data at once
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+    // Fixing NV12 chroma width if it is odd
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
     for (y = 0; y < h; ++y) {
-      if (fread(buf, 1, w, file) != (size_t)w) return 0;
+      if (fread(buf, bytespp, w, file) != (size_t)w) return 0;
       buf += stride;
     }
   }
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
index ee3c281f0f..a54e81084b 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -16,7 +16,7 @@
 #include "vpx_ports/mem.h"
 
 static const int8_t vp8_sub_pel_filters[8][8] = {
-  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
+  { 0, 0, -128, 0, 0, 0, 0, 0 },    /* note that 1/8 pel positions are */
   { 0, -6, 123, 12, -1, 0, 0, 0 },  /*    just as per alpha -0.5 bicubic */
   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
   { 0, -9, 93, 50, -6, 0, 0, 0 },
diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c
index fc4a3539fd..b9efc0cc1f 100644
--- a/media/libvpx/libvpx/vp8/common/entropy.c
+++ b/media/libvpx/libvpx/vp8/common/entropy.c
@@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177,
       p[0] = p[1] = 0;
     }
 
-    void init_bit_trees() {
+    void init_bit_trees(void) {
       init_bit_tree(cat1, 1);
       init_bit_tree(cat2, 2);
       init_bit_tree(cat3, 3);
diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
index 71529bdfd8..7c8e083f4f 100644
--- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
+++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
@@ -25,23 +25,19 @@
 #include "vp8/common/systemdependent.h"
 
 #if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
 #include <unistd.h>
 #elif defined(_WIN32)
 #include <windows.h>
 typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO);
-#elif defined(__OS2__)
-#define INCL_DOS
-#define INCL_DOSSPINLOCK
-#include <os2.h>
 #endif
 #endif
 
 #if CONFIG_MULTITHREAD
-static int get_cpu_count() {
+static int get_cpu_count(void) {
   int core_count = 16;
 
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
 #if defined(_SC_NPROCESSORS_ONLN)
   core_count = (int)sysconf(_SC_NPROCESSORS_ONLN);
 #elif defined(_SC_NPROC_ONLN)
@@ -49,38 +45,13 @@ static int get_cpu_count() {
 #endif
 #elif defined(_WIN32)
   {
-#if _WIN32_WINNT >= 0x0501
+#if _WIN32_WINNT < 0x0501
+#error _WIN32_WINNT must target Windows XP or newer.
+#endif
     SYSTEM_INFO sysinfo;
     GetNativeSystemInfo(&sysinfo);
-#else
-    PGNSI pGNSI;
-    SYSTEM_INFO sysinfo;
-
-    /* Call GetNativeSystemInfo if supported or
-     * GetSystemInfo otherwise. */
-
-    pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
-                                  "GetNativeSystemInfo");
-    if (pGNSI != NULL)
-      pGNSI(&sysinfo);
-    else
-      GetSystemInfo(&sysinfo);
-#endif
-
     core_count = (int)sysinfo.dwNumberOfProcessors;
   }
-#elif defined(__OS2__)
-  {
-    ULONG proc_id;
-    ULONG status;
-
-    core_count = 0;
-    for (proc_id = 1;; ++proc_id) {
-      if (DosGetProcessorStatus(proc_id, &status)) break;
-
-      if (status == PROC_ONLINE) core_count++;
-    }
-  }
 #else
 /* other platforms */
 #endif
diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h
index 1b70ea5dba..2038c000b0 100644
--- a/media/libvpx/libvpx/vp8/common/onyx.h
+++ b/media/libvpx/libvpx/vp8/common/onyx.h
@@ -242,7 +242,7 @@ typedef struct {
 #endif
 } VP8_CONFIG;
 
-void vp8_initialize();
+void vp8_initialize(void);
 
 struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf);
 void vp8_remove_compressor(struct VP8_COMP **comp);
diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c
index 09a0e2b4b3..102b7ccd54 100644
--- a/media/libvpx/libvpx/vp8/common/rtcd.c
+++ b/media/libvpx/libvpx/vp8/common/rtcd.c
@@ -12,4 +12,4 @@
 #include "./vp8_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vp8_rtcd() { once(setup_rtcd_internal); }
+void vp8_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h
index 1cfb9fec51..0de75cfde3 100644
--- a/media/libvpx/libvpx/vp8/common/threading.h
+++ b/media/libvpx/libvpx/vp8/common/threading.h
@@ -19,161 +19,57 @@ extern "C" {
 
 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
-/* Thread management macros */
 #if defined(_WIN32) && !HAVE_PTHREAD_H
 /* Win32 */
-#include <process.h>
 #include <windows.h>
-#if defined(__GNUC__) && \
-    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREAD_FUNCTION \
-  __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREAD_FUNCTION unsigned int __stdcall
-#endif
-#define THREAD_FUNCTION_RETURN DWORD
-#define THREAD_SPECIFIC_INDEX DWORD
-#define pthread_t HANDLE
-#define pthread_attr_t DWORD
-#define pthread_detach(thread) \
-  if (thread != NULL) CloseHandle(thread)
-#define thread_sleep(nms) Sleep(nms)
-#define pthread_cancel(thread) terminate_thread(thread, 0)
-#define ts_key_create(ts_key, destructor) \
-  { ts_key = TlsAlloc(); };
-#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
-#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
-#define pthread_self() GetCurrentThreadId()
-
-#elif defined(__OS2__)
-/* OS/2 */
-#define INCL_DOS
-#include <os2.h>
-
-#include <stdlib.h>
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX PULONG
-#define pthread_t TID
-#define pthread_attr_t ULONG
-#define pthread_detach(thread) 0
-#define thread_sleep(nms) DosSleep(nms)
-#define pthread_cancel(thread) DosKillThread(thread)
-#define ts_key_create(ts_key, destructor) \
-  DosAllocThreadLocalMemory(1, &(ts_key));
-#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
-#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value))
-#define pthread_self() _gettid()
 #else
+/* pthreads */
 #ifdef __APPLE__
 #include <mach/mach_init.h>
 #include <mach/semaphore.h>
 #include <mach/task.h>
 #include <time.h>
 #include <unistd.h>
-
 #else
 #include <semaphore.h>
 #endif
-
-#include <pthread.h>
-/* pthreads */
-/* Nearly everything is already defined */
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX pthread_key_t
-#define ts_key_create(ts_key, destructor) \
-  pthread_key_create(&(ts_key), destructor);
 #endif
 
 /* Synchronization macros: Win32 and Pthreads */
 #if defined(_WIN32) && !HAVE_PTHREAD_H
-#define sem_t HANDLE
-#define pause(voidpara) __asm PAUSE
-#define sem_init(sem, sem_attr1, sem_init_value) \
-  (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL)
-#define sem_wait(sem) \
+#define vp8_sem_t HANDLE
+#define vp8_sem_init(sem, pshared, value) \
+  (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL)
+#define vp8_sem_wait(sem) \
   (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE))
-#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
-#define sem_destroy(sem) \
+#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
+#define vp8_sem_destroy(sem) \
   if (*sem) ((int)(CloseHandle(*sem)) == TRUE)
 #define thread_sleep(nms) Sleep(nms)
 
-#elif defined(__OS2__)
-typedef struct {
-  HEV event;
-  HMTX wait_mutex;
-  HMTX count_mutex;
-  int count;
-} sem_t;
-
-static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
-  DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
-                    value > 0 ? TRUE : FALSE);
-  DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
-  DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
-
-  sem->count = value;
-
-  return 0;
-}
-
-static inline int sem_wait(sem_t *sem) {
-  DosRequestMutexSem(sem->wait_mutex, -1);
-
-  DosWaitEventSem(sem->event, -1);
-
-  DosRequestMutexSem(sem->count_mutex, -1);
-
-  sem->count--;
-  if (sem->count == 0) {
-    ULONG post_count;
-
-    DosResetEventSem(sem->event, &post_count);
-  }
-
-  DosReleaseMutexSem(sem->count_mutex);
-
-  DosReleaseMutexSem(sem->wait_mutex);
-
-  return 0;
-}
-
-static inline int sem_post(sem_t *sem) {
-  DosRequestMutexSem(sem->count_mutex, -1);
-
-  if (sem->count < 32768) {
-    sem->count++;
-    DosPostEventSem(sem->event);
-  }
-
-  DosReleaseMutexSem(sem->count_mutex);
-
-  return 0;
-}
-
-static inline int sem_destroy(sem_t *sem) {
-  DosCloseEventSem(sem->event);
-  DosCloseMutexSem(sem->wait_mutex);
-  DosCloseMutexSem(sem->count_mutex);
-
-  return 0;
-}
-
-#define thread_sleep(nms) DosSleep(nms)
-
 #else
 
 #ifdef __APPLE__
-#define sem_t semaphore_t
-#define sem_init(X, Y, Z) \
-  semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
-#define sem_wait(sem) (semaphore_wait(*sem))
-#define sem_post(sem) semaphore_signal(*sem)
-#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
+#define vp8_sem_t semaphore_t
+#define vp8_sem_init(sem, pshared, value) \
+  semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value)
+#define vp8_sem_wait(sem) semaphore_wait(*sem)
+#define vp8_sem_post(sem) semaphore_signal(*sem)
+#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
 #else
+#include <errno.h>
 #include <unistd.h>
 #include <sched.h>
+#define vp8_sem_t sem_t
+#define vp8_sem_init sem_init
+static INLINE int vp8_sem_wait(vp8_sem_t *sem) {
+  int ret;
+  while ((ret = sem_wait(sem)) == -1 && errno == EINTR) {
+  }
+  return ret;
+}
+#define vp8_sem_post sem_post
+#define vp8_sem_destroy sem_destroy
 #endif /* __APPLE__ */
 /* Not Windows. Assume pthreads */
 
@@ -194,7 +90,6 @@ static inline int sem_destroy(sem_t *sem) {
 #define x86_pause_hint()
 #endif
 
-#include "vpx_util/vpx_thread.h"
 #include "vpx_util/vpx_atomics.h"
 
 static INLINE void vp8_atomic_spin_wait(
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
index 2248345ba2..88f2de024b 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
@@ -428,6 +428,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) {
 
 #if CONFIG_MULTITHREAD
   if (setjmp(fb->pbi[0]->common.error.jmp)) {
+    fb->pbi[0]->common.error.setjmp = 0;
     vp8_remove_decoder_instances(fb);
     vp8_zero(fb->pbi);
     vpx_clear_system_state();
@@ -452,6 +453,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) {
 
   /* decoder instance for single thread mode */
   remove_decompressor(pbi);
+  fb->pbi[0] = NULL;
   return VPX_CODEC_OK;
 }
 
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
index 1070849620..08a60b31b9 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
@@ -14,6 +14,7 @@
 #include <assert.h>
 
 #include "vpx_config.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp8/common/onyxd.h"
 #include "treereader.h"
 #include "vp8/common/onyxc_int.h"
@@ -94,8 +95,8 @@ typedef struct VP8D_COMP {
   DECODETHREAD_DATA *de_thread_data;
 
   pthread_t *h_decoding_thread;
-  sem_t *h_event_start_decoding;
-  sem_t h_event_end_decoding;
+  vp8_sem_t *h_event_start_decoding;
+  vp8_sem_t h_event_end_decoding;
 /* end of threading data */
 #endif
 
diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c
index 6ccb080cf9..d16284d134 100644
--- a/media/libvpx/libvpx/vp8/decoder/threading.c
+++ b/media/libvpx/libvpx/vp8/decoder/threading.c
@@ -15,6 +15,7 @@
 #endif
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp8/common/common.h"
 #include "vp8/common/threading.h"
 #include "vp8/common/loopfilter.h"
@@ -577,10 +578,10 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
   /* signal end of decoding of current thread for current frame */
   if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows)
-    sem_post(&pbi->h_event_end_decoding);
+    vp8_sem_post(&pbi->h_event_end_decoding);
 }
 
-static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
+static THREADFN thread_decoding_proc(void *p_data) {
   int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
   VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
   MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -589,7 +590,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
   while (1) {
     if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
 
-    if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
+    if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
       if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
         break;
       } else {
@@ -598,16 +599,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
         if (setjmp(xd->error_info.jmp)) {
           xd->error_info.setjmp = 0;
           // Signal the end of decoding for current thread.
-          sem_post(&pbi->h_event_end_decoding);
+          vp8_sem_post(&pbi->h_event_end_decoding);
           continue;
         }
         xd->error_info.setjmp = 1;
         mt_decode_mb_rows(pbi, xd, ithread + 1);
+        xd->error_info.setjmp = 0;
       }
     }
   }
 
-  return 0;
+  return THREAD_EXIT_SUCCESS;
 }
 
 void vp8_decoder_create_threads(VP8D_COMP *pbi) {
@@ -634,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
     CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
     CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
 
-    if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
+    if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) {
       vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                          "Failed to initialize semaphore");
     }
 
     for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
-      if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
+      if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
 
       vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
 
@@ -650,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
 
       if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
                          thread_decoding_proc, &pbi->de_thread_data[ithread])) {
-        sem_destroy(&pbi->h_event_start_decoding[ithread]);
+        vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]);
         break;
       }
     }
@@ -661,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
       /* the remainder of cleanup cases will be handled in
        * vp8_decoder_remove_threads(). */
       if (pbi->allocated_decoding_thread_count == 0) {
-        sem_destroy(&pbi->h_event_end_decoding);
+        vp8_sem_destroy(&pbi->h_event_end_decoding);
       }
       vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                          "Failed to create threads");
@@ -812,16 +814,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
 
     /* allow all threads to exit */
     for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
-      sem_post(&pbi->h_event_start_decoding[i]);
+      vp8_sem_post(&pbi->h_event_start_decoding[i]);
       pthread_join(pbi->h_decoding_thread[i], NULL);
     }
 
     for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
-      sem_destroy(&pbi->h_event_start_decoding[i]);
+      vp8_sem_destroy(&pbi->h_event_start_decoding[i]);
     }
 
     if (pbi->allocated_decoding_thread_count) {
-      sem_destroy(&pbi->h_event_end_decoding);
+      vp8_sem_destroy(&pbi->h_event_end_decoding);
     }
 
     vpx_free(pbi->h_decoding_thread);
@@ -883,7 +885,7 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
                              pbi->decoding_thread_count);
 
   for (i = 0; i < pbi->decoding_thread_count; ++i) {
-    sem_post(&pbi->h_event_start_decoding[i]);
+    vp8_sem_post(&pbi->h_event_start_decoding[i]);
   }
 
   if (setjmp(xd->error_info.jmp)) {
@@ -893,15 +895,16 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
     // the current frame while the main thread starts decoding the next frame,
     // which causes a data race.
     for (i = 0; i < pbi->decoding_thread_count; ++i)
-      sem_wait(&pbi->h_event_end_decoding);
+      vp8_sem_wait(&pbi->h_event_end_decoding);
     return -1;
   }
 
   xd->error_info.setjmp = 1;
   mt_decode_mb_rows(pbi, xd, 0);
+  xd->error_info.setjmp = 0;
 
   for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
-    sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+    vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
 
   return 0;
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
index 82c48b13a7..d0117897db 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
@@ -7,38 +7,38 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdio.h>
 #include <limits.h>
+#include <stdio.h>
 
 #include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "bitstream.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#if CONFIG_MULTITHREAD
-#include "ethreading.h"
-#endif
+
 #include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
 #include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "rdopt.h"
-#include "pickinter.h"
+#include "vp8/common/extend.h"
 #include "vp8/common/findnearmv.h"
 #include "vp8/common/invtrans.h"
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/threading.h"
+#include "vp8/encoder/bitstream.h"
+#include "vp8/encoder/encodeframe.h"
+#include "vp8/encoder/encodeintra.h"
+#include "vp8/encoder/encodemb.h"
+#include "vp8/encoder/encodemv.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/pickinter.h"
+#include "vp8/encoder/rdopt.h"
+#include "vp8/encoder/segmentation.h"
+#include "vp8_rtcd.h"
 #include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_timer.h"
-#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-#include "bitstream.h"
+
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
 #endif
-#include "encodeframe.h"
 
 extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
 static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
@@ -773,7 +773,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
         vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
 
       for (i = 0; i < cpi->encoding_thread_count; ++i) {
-        sem_post(&cpi->h_event_start_encoding[i]);
+        vp8_sem_post(&cpi->h_event_start_encoding[i]);
       }
 
       for (mb_row = 0; mb_row < cm->mb_rows;
@@ -806,7 +806,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
       }
       /* Wait for all the threads to finish. */
       for (i = 0; i < cpi->encoding_thread_count; ++i) {
-        sem_wait(&cpi->h_event_end_encoding[i]);
+        vp8_sem_wait(&cpi->h_event_end_encoding[i]);
       }
 
       for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c
index e2f8b89d46..98c87d3cbc 100644
--- a/media/libvpx/libvpx/vp8/encoder/ethreading.c
+++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c
@@ -10,6 +10,7 @@
 #include <stddef.h>
 
 #include "onyx_int.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp8/common/threading.h"
 #include "vp8/common/common.h"
 #include "vp8/common/extend.h"
@@ -22,27 +23,27 @@
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x,
                                     int ok_to_skip);
 
-static THREAD_FUNCTION thread_loopfilter(void *p_data) {
+static THREADFN thread_loopfilter(void *p_data) {
   VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
   VP8_COMMON *cm = &cpi->common;
 
   while (1) {
     if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
-    if (sem_wait(&cpi->h_event_start_lpf) == 0) {
+    if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) {
       /* we're shutting down */
       if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
       vp8_loopfilter_frame(cpi, cm);
 
-      sem_post(&cpi->h_event_end_lpf);
+      vp8_sem_post(&cpi->h_event_end_lpf);
     }
   }
 
-  return 0;
+  return THREAD_EXIT_SUCCESS;
 }
 
-static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
+static THREADFN thread_encoding_proc(void *p_data) {
   int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
   VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
   MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
@@ -51,7 +52,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
   while (1) {
     if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
-    if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
+    if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
       const int nsync = cpi->mt_sync_range;
       VP8_COMMON *cm = &cpi->common;
       int mb_row;
@@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
       }
       /* Signal that this thread has completed processing its rows. */
-      sem_post(&cpi->h_event_end_encoding[ithread]);
+      vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
     }
   }
 
   /* printf("exit thread %d\n", ithread); */
-  return 0;
+  return THREAD_EXIT_SUCCESS;
 }
 
 static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
@@ -514,9 +515,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
     CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
                     vpx_malloc(sizeof(pthread_t) * th_count));
     CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
-                    vpx_malloc(sizeof(sem_t) * th_count));
+                    vpx_malloc(sizeof(vp8_sem_t) * th_count));
     CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
-                    vpx_malloc(sizeof(sem_t) * th_count));
+                    vpx_malloc(sizeof(vp8_sem_t) * th_count));
     CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
                     vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
     memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
@@ -538,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
       vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
       vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
 
-      sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
-      sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
+      vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+      vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
 
       ethd->ithread = ithread;
       ethd->ptr1 = (void *)cpi;
@@ -554,11 +555,11 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
       /* shutdown other threads */
       vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
       for (--ithread; ithread >= 0; ithread--) {
-        sem_post(&cpi->h_event_start_encoding[ithread]);
-        sem_post(&cpi->h_event_end_encoding[ithread]);
+        vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+        vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
         pthread_join(cpi->h_encoding_thread[ithread], 0);
-        sem_destroy(&cpi->h_event_start_encoding[ithread]);
-        sem_destroy(&cpi->h_event_end_encoding[ithread]);
+        vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+        vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
       }
 
       /* free thread related resources */
@@ -580,8 +581,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
     {
       LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data;
 
-      sem_init(&cpi->h_event_start_lpf, 0, 0);
-      sem_init(&cpi->h_event_end_lpf, 0, 0);
+      vp8_sem_init(&cpi->h_event_start_lpf, 0, 0);
+      vp8_sem_init(&cpi->h_event_end_lpf, 0, 0);
 
       lpfthd->ptr1 = (void *)cpi;
       rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd);
@@ -590,14 +591,14 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
         /* shutdown other threads */
         vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
         for (--ithread; ithread >= 0; ithread--) {
-          sem_post(&cpi->h_event_start_encoding[ithread]);
-          sem_post(&cpi->h_event_end_encoding[ithread]);
+          vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+          vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
           pthread_join(cpi->h_encoding_thread[ithread], 0);
-          sem_destroy(&cpi->h_event_start_encoding[ithread]);
-          sem_destroy(&cpi->h_event_end_encoding[ithread]);
+          vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+          vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
         }
-        sem_destroy(&cpi->h_event_end_lpf);
-        sem_destroy(&cpi->h_event_start_lpf);
+        vp8_sem_destroy(&cpi->h_event_end_lpf);
+        vp8_sem_destroy(&cpi->h_event_start_lpf);
 
         /* free thread related resources */
         vpx_free(cpi->h_event_start_encoding);
@@ -627,21 +628,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
       int i;
 
       for (i = 0; i < cpi->encoding_thread_count; ++i) {
-        sem_post(&cpi->h_event_start_encoding[i]);
-        sem_post(&cpi->h_event_end_encoding[i]);
+        vp8_sem_post(&cpi->h_event_start_encoding[i]);
+        vp8_sem_post(&cpi->h_event_end_encoding[i]);
 
         pthread_join(cpi->h_encoding_thread[i], 0);
 
-        sem_destroy(&cpi->h_event_start_encoding[i]);
-        sem_destroy(&cpi->h_event_end_encoding[i]);
+        vp8_sem_destroy(&cpi->h_event_start_encoding[i]);
+        vp8_sem_destroy(&cpi->h_event_end_encoding[i]);
       }
 
-      sem_post(&cpi->h_event_start_lpf);
+      vp8_sem_post(&cpi->h_event_start_lpf);
       pthread_join(cpi->h_filter_thread, 0);
     }
 
-    sem_destroy(&cpi->h_event_end_lpf);
-    sem_destroy(&cpi->h_event_start_lpf);
+    vp8_sem_destroy(&cpi->h_event_end_lpf);
+    vp8_sem_destroy(&cpi->h_event_start_lpf);
     cpi->b_lpf_running = 0;
 
     /* free thread related resources */
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
index 4e128e3c49..ad01c6fc86 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
@@ -63,7 +63,7 @@
 extern int vp8_update_coef_context(VP8_COMP *cpi);
 #endif
 
-extern unsigned int vp8_get_processor_freq();
+extern unsigned int vp8_get_processor_freq(void);
 
 int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
@@ -267,7 +267,11 @@ static int rescale(int val, int num, int denom) {
   int64_t llden = denom;
   int64_t llval = val;
 
-  return (int)(llval * llnum / llden);
+  int64_t result = (llval * llnum / llden);
+  if (result <= INT_MAX)
+    return (int)result;
+  else
+    return INT_MAX;
 }
 
 void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
@@ -276,7 +280,10 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
-  lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
+  if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000)
+    lc->target_bandwidth = INT_MAX;
+  else
+    lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
 
   lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
   lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
@@ -1381,7 +1388,10 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) {
       LAYER_CONTEXT *lc = &cpi->layer_context[i];
 
       lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i];
-      lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+      if (oxcf->target_bitrate[i] > INT_MAX / 1000)
+        lc->target_bandwidth = INT_MAX;
+      else
+        lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
 
       lc->starting_buffer_level = rescale(
           (int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000);
@@ -1995,6 +2005,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
 
 #if CONFIG_MULTITHREAD
   if (vp8cx_create_encoder_threads(cpi)) {
+    cpi->common.error.setjmp = 0;
     vp8_remove_compressor(&cpi);
     return 0;
   }
@@ -2048,8 +2059,6 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
 
   vp8_loop_filter_init(cm);
 
-  cpi->common.error.setjmp = 0;
-
 #if CONFIG_MULTI_RES_ENCODING
 
   /* Calculate # of MBs in a row in lower-resolution level image. */
@@ -2076,6 +2085,8 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
   vp8_setup_block_ptrs(&cpi->mb);
   vp8_setup_block_dptrs(&cpi->mb.e_mbd);
 
+  cpi->common.error.setjmp = 0;
+
   return cpi;
 }
 
@@ -3172,7 +3183,8 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
 
 #if CONFIG_MULTITHREAD
   if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
-    sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+    /* signal that we have set filter_level */
+    vp8_sem_post(&cpi->h_event_end_lpf);
   }
 #endif
 
@@ -4387,11 +4399,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #if CONFIG_MULTITHREAD
   if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
     /* start loopfilter in separate thread */
-    sem_post(&cpi->h_event_start_lpf);
+    vp8_sem_post(&cpi->h_event_start_lpf);
     cpi->b_lpf_running = 1;
     /* wait for the filter_level to be picked so that we can continue with
      * stream packing */
-    sem_wait(&cpi->h_event_end_lpf);
+    vp8_sem_wait(&cpi->h_event_end_lpf);
   } else
 #endif
   {
@@ -5120,6 +5132,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
+#if CONFIG_MULTITHREAD
+  /* wait for the lpf thread done */
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
+    vp8_sem_wait(&cpi->h_event_end_lpf);
+    cpi->b_lpf_running = 0;
+  }
+#endif
+
   if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
     generate_psnr_packet(cpi);
   }
@@ -5247,16 +5267,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 #endif
 #endif
 
-  cpi->common.error.setjmp = 0;
-
-#if CONFIG_MULTITHREAD
-  /* wait for the lpf thread done */
-  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
-    sem_wait(&cpi->h_event_end_lpf);
-    cpi->b_lpf_running = 0;
-  }
-#endif
-
   return 0;
 }
 
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
index 1451a27812..bb1518ed7f 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
@@ -20,6 +20,7 @@
 #include "tokenize.h"
 #include "vp8/common/onyxc_int.h"
 #include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_pthread.h"
 #include "encodemb.h"
 #include "vp8/encoder/quantize.h"
 #include "vp8/common/entropy.h"
@@ -540,10 +541,10 @@ typedef struct VP8_COMP {
   LPFTHREAD_DATA lpf_thread_data;
 
   /* events */
-  sem_t *h_event_start_encoding;
-  sem_t *h_event_end_encoding;
-  sem_t h_event_start_lpf;
-  sem_t h_event_end_lpf;
+  vp8_sem_t *h_event_start_encoding;
+  vp8_sem_t *h_event_end_encoding;
+  vp8_sem_t h_event_start_lpf;
+  vp8_sem_t h_event_end_lpf;
 #endif
 
   TOKENLIST *tplist;
diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
index fcd4eb04eb..7ba7a308ab 100644
--- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c
+++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
@@ -791,8 +791,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
               (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
                     one_percent_bits);
         } else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) {
-          percent_high =
-              (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8));
+          if (cpi->total_byte_count > 0) {
+            percent_high = (int)((100 * cpi->bits_off_target) /
+                                 (cpi->total_byte_count * 8));
+          } else {
+            percent_high = cpi->oxcf.over_shoot_pct;
+          }
         }
 
         if (percent_high > cpi->oxcf.over_shoot_pct) {
@@ -1190,10 +1194,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
     /* Calculate required scaling factor based on target frame size and
      * size of frame produced using previous Q
      */
-    if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) {
-      /* Case where we would overflow int */
-      target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs)
-                           << BPER_MB_NORMBITS;
+    if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) {
+      int temp = target_bits_per_frame / cpi->common.MBs;
+      if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+        target_bits_per_mb = INT_MAX;
+      } else {
+        target_bits_per_mb = temp << BPER_MB_NORMBITS;
+      }
     } else {
       target_bits_per_mb =
           (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
@@ -1534,9 +1541,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
       // undershoots significantly, and then we end up dropping every other
       // frame because the QP/rate_correction_factor may have been too low
       // before the drop and then takes too long to come up.
-      if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) {
-        target_bits_per_mb = (target_size / cpi->common.MBs)
-                             << BPER_MB_NORMBITS;
+      if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) {
+        int temp = target_size / cpi->common.MBs;
+        if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+          target_bits_per_mb = INT_MAX;
+        } else {
+          target_bits_per_mb = temp << BPER_MB_NORMBITS;
+        }
       } else {
         target_bits_per_mb =
             (target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h
index 47b5be17f1..5223aa2d86 100644
--- a/media/libvpx/libvpx/vp8/encoder/tokenize.h
+++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h
@@ -18,8 +18,6 @@
 extern "C" {
 #endif
 
-void vp8_tokenize_initialize();
-
 typedef struct {
   short Token;
   short Extra;
diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
index 1f16cc53d3..2b238c1a97 100644
--- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
@@ -8,6 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
@@ -18,6 +23,7 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
 #include "vpx_util/vpx_timestamp.h"
 #if CONFIG_MULTITHREAD
 #include "vp8/encoder/ethreading.h"
@@ -27,8 +33,6 @@
 #include "vp8/encoder/firstpass.h"
 #include "vp8/common/onyx.h"
 #include "vp8/common/common.h"
-#include <stdlib.h>
-#include <string.h>
 
 struct vp8_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
@@ -148,7 +152,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(cfg, g_profile, 3);
   RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
   RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
-  RANGE_CHECK_HI(cfg, g_threads, 64);
+  RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
 #if CONFIG_REALTIME_ONLY
   RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
 #elif CONFIG_MULTI_RES_ENCODING
@@ -495,7 +499,10 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
   set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
   vp8_change_config(ctx->cpi, &ctx->oxcf);
 #if CONFIG_MULTITHREAD
-  if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR;
+  if (vp8cx_create_encoder_threads(ctx->cpi)) {
+    ctx->cpi->common.error.setjmp = 0;
+    return VPX_CODEC_ERROR;
+  }
 #endif
   ctx->cpi->common.error.setjmp = 0;
   return VPX_CODEC_OK;
@@ -777,9 +784,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   return res;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
-                                    unsigned long duration,
-                                    vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+                                               unsigned long duration,
+                                               vpx_enc_deadline_t deadline) {
   int new_qc;
 
 #if !(CONFIG_REALTIME_ONLY)
@@ -788,13 +795,15 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
 
   if (deadline) {
     /* Convert duration parameter from stream timebase to microseconds */
-    uint64_t duration_us;
-
     VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
                       (TICKS_PER_SEC % 1000000) == 0);
 
-    duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
-                  (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+    if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) {
+      ERROR("duration is too big");
+    }
+    uint64_t duration_us =
+        duration * (uint64_t)ctx->timestamp_ratio.num /
+        ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
 
     /* If the deadline is more that the duration this frame is to be shown,
      * use good quality mode. Otherwise use realtime mode.
@@ -820,6 +829,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
     ctx->oxcf.Mode = new_qc;
     vp8_change_config(ctx->cpi, &ctx->oxcf);
   }
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
@@ -894,13 +904,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
 
   if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
 
-  if (!ctx->pts_offset_initialized) {
-    ctx->pts_offset = pts_val;
-    ctx->pts_offset_initialized = 1;
-  }
-  pts_val -= ctx->pts_offset;
-
-  pick_quickcompress_mode(ctx, duration, deadline);
+  if (!res) res = pick_quickcompress_mode(ctx, duration, deadline);
   vpx_codec_pkt_list_init(&ctx->pkt_list);
 
   // If no flags are set in the encode call, then use the frame flags as
@@ -924,7 +928,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
   /* Initialize the encoder instance on the first frame*/
   if (!res && ctx->cpi) {
     unsigned int lib_flags;
-    YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp, dst_end_time_stamp;
     size_t size, cx_data_sz;
     unsigned char *cx_data;
@@ -951,12 +954,44 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     /* Convert API flags to internal codec lib flags */
     lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
-    dst_time_stamp =
-        pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
-    dst_end_time_stamp = (pts_val + (int64_t)duration) *
-                         ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
-
     if (img != NULL) {
+      YV12_BUFFER_CONFIG sd;
+
+      if (!ctx->pts_offset_initialized) {
+        ctx->pts_offset = pts_val;
+        ctx->pts_offset_initialized = 1;
+      }
+      if (pts_val < ctx->pts_offset) {
+        vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "pts is smaller than initial pts");
+      }
+      pts_val -= ctx->pts_offset;
+      if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) {
+        vpx_internal_error(
+            &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts to ticks would overflow");
+      }
+      dst_time_stamp =
+          pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+#if ULONG_MAX > INT64_MAX
+      if (duration > INT64_MAX) {
+        vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "duration is too big");
+      }
+#endif
+      if (pts_val > INT64_MAX - (int64_t)duration) {
+        vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "relative pts + duration is too big");
+      }
+      vpx_codec_pts_t pts_end = pts_val + (int64_t)duration;
+      if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) {
+        vpx_internal_error(
+            &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts + duration to ticks would overflow");
+      }
+      dst_end_time_stamp =
+          pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+
       res = image2yuvconfig(img, &sd);
 
       if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -989,6 +1024,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
           &dst_end_time_stamp, !img);
 
       if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) {
+        ctx->cpi->common.error.setjmp = 0;
         return VPX_CODEC_CORRUPT_FRAME;
       } else if (comp_data_state == -1) {
         break;
diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
index e81deaf4ea..fa7d7be403 100644
--- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
@@ -488,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
       if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
         pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
       }
-      pc->error.setjmp = 0;
+      pbi->common.error.setjmp = 0;
 #if CONFIG_MULTITHREAD
       if (pbi->restart_threads) {
         ctx->si.w = 0;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
index 261c316fd1..312092f190 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -8,10 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vp8/vp8_ratectrl_rtc.h"
+
 #include <math.h>
+
 #include <new>
+
 #include "vp8/common/common.h"
-#include "vp8/vp8_ratectrl_rtc.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vp8/encoder/ratectrl.h"
 #include "vpx_ports/system_state.h"
@@ -311,6 +314,14 @@ FrameDropDecision VP8RateControlRTC::ComputeQP(
 
 int VP8RateControlRTC::GetQP() const { return q_; }
 
+UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const {
+  VP8_COMMON *cm = &cpi_->common;
+  UVDeltaQP uv_delta_q;
+  uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q;
+  uv_delta_q.uvac_delta_q = cm->uvac_delta_q;
+  return uv_delta_q;
+}
+
 int VP8RateControlRTC::GetLoopfilterLevel() const {
   VP8_COMMON *cm = &cpi_->common;
   const double qp = q_;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
index 59fb607526..b458b5ce65 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
@@ -21,7 +21,6 @@ struct VP8_COMP;
 
 namespace libvpx {
 struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
   VP8RateControlRtcConfig() {
     memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
     memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
@@ -42,6 +41,9 @@ class VP8RateControlRTC {
   bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
   // GetQP() needs to be called after ComputeQP() to get the latest QP
   int GetQP() const;
+  // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest
+  // delta QP for UV.
+  UVDeltaQP GetUVDeltaQP() const;
   // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
   // level is calculated from frame qp.
   int GetLoopfilterLevel() const;
@@ -53,10 +55,10 @@ class VP8RateControlRTC {
   void PostEncodeUpdate(uint64_t encoded_frame_size);
 
  private:
-  VP8RateControlRTC() {}
+  VP8RateControlRTC() = default;
   bool InitRateControl(const VP8RateControlRtcConfig &cfg);
-  struct VP8_COMP *cpi_;
-  int q_;
+  struct VP8_COMP *cpi_ = nullptr;
+  int q_ = -1;
 };
 
 }  // namespace libvpx
diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
index 1cfc12f6fa..4c8fcf6989 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
@@ -13,7 +13,6 @@
 
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_util/vpx_thread.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
index 37762ca15a..1a93b97e56 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
@@ -12,4 +12,4 @@
 #include "./vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vp9_rtcd() { once(setup_rtcd_internal); }
+void vp9_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
index 3ecbd5417f..af3ff0e980 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -129,7 +129,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
 add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
 
 add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
-specialize qw/vp9_block_error_fp neon avx2 sse2/;
+specialize qw/vp9_block_error_fp neon sve avx2 sse2/;
 
 add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
 specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
@@ -138,12 +138,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t
 specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  specialize qw/vp9_block_error neon avx2 sse2/;
+  specialize qw/vp9_block_error neon sve avx2 sse2/;
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error neon sse2/;
 } else {
-  specialize qw/vp9_block_error neon avx2 msa sse2/;
+  specialize qw/vp9_block_error neon sve avx2 msa sse2/;
 }
 
 # fdct functions
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
index 8df18af3b8..24adbcbff0 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_reconinter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
index 5df0117f12..96c705d0d5 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
@@ -12,6 +12,7 @@
 #define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
 #include "./vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 
 #ifdef __cplusplus
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
index c5892156f4..4fe680cefc 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -22,6 +22,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 #include "vpx_util/vpx_debug_util.h"
@@ -2292,6 +2293,7 @@ static INLINE void init_mt(VP9Decoder *pbi) {
       ++pbi->num_tile_workers;
 
       winterface->init(worker);
+      worker->thread_name = "vpx tile worker";
       if (n < num_threads - 1 && !winterface->reset(worker)) {
         do {
           winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
index 5a7e9f9ab3..5c77df5002 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
@@ -21,6 +21,7 @@
 #include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -210,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
   cm->error.setjmp = 0;
 
   vpx_get_worker_interface()->init(&pbi->lf_worker);
+  pbi->lf_worker.thread_name = "vpx lf worker";
 
   return pbi;
 }
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
index 2e198d552e..b3ee4eab5f 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
@@ -16,6 +16,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx_dsp/bitreader.h"
 #include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_thread_common.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
index 9a31f5a6d0..926ae87739 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
@@ -12,6 +12,7 @@
 #include <string.h>
 
 #include "vpx/vpx_integer.h"
+#include "vpx_util/vpx_pthread.h"
 
 #include "vp9/decoder/vp9_job_queue.h"
 
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
index bc23bf9c2c..59f71fb9ba 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
@@ -11,7 +11,7 @@
 #ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
 #define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
 
-#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_pthread.h"
 
 typedef struct {
   // Pointer to buffer base which contains the jobs
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
new file mode 100644
index 0000000000..78e7361d85
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
@@ -0,0 +1,78 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz) {
+  int64x2_t err_v = vdupq_n_s64(0);
+  int64x2_t ssz_v = vdupq_n_s64(0);
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const int16x8_t diff0 = vabdq_s16(c0, d0);
+    const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+    err_v = vpx_dotq_s16(err_v, diff0, diff0);
+    err_v = vpx_dotq_s16(err_v, diff1, diff1);
+
+    ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
+    ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  *ssz = horizontal_add_int64x2(ssz_v);
+  return horizontal_add_int64x2(err_v);
+}
+
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
+                               const tran_low_t *dqcoeff, int block_size) {
+  int64x2_t err = vdupq_n_s64(0);
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const int16x8_t diff0 = vabdq_s16(c0, d0);
+    const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+    err = vpx_dotq_s16(err, diff0, diff0);
+    err = vpx_dotq_s16(err, diff1, diff1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  return horizontal_add_int64x2(err);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
index 7fa00cd194..6542794667 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
@@ -11,8 +11,6 @@
 #ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
 #define VPX_VP9_ENCODER_VP9_BLOCK_H_
 
-#include "vpx_util/vpx_thread.h"
-
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
index 42073f756c..ee0fcd8729 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
@@ -119,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
     PC_TREE *const tree = &td->pc_tree[pc_tree_index];
     tree->block_size = square[0];
     alloc_tree_contexts(cm, tree, 4);
-    tree->leaf_split[0] = this_leaf++;
-    for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+    tree->u.leaf_split[0] = this_leaf++;
+    for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0];
   }
 
   // Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -130,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
       alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
       tree->block_size = square[square_index];
-      for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+      for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++;
       ++pc_tree_index;
     }
     ++square_index;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
index 4e301cc17d..51e13ba654 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
@@ -90,7 +90,7 @@ typedef struct PC_TREE {
   union {
     struct PC_TREE *split[4];
     PICK_MODE_CONTEXT *leaf_split[4];
-  };
+  } u;
   // Obtained from a simple motion search. Used by the ML based partition search
   // speed feature.
   MV mv;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
index 46291f4868..b24c85f406 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -21,7 +21,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_ports/system_state.h"
-
+#include "vpx_util/vpx_pthread.h"
 #if CONFIG_MISMATCH_DEBUG
 #include "vpx_util/vpx_debug_util.h"
 #endif  // CONFIG_MISMATCH_DEBUG
@@ -2303,16 +2303,16 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
       assert(partition == PARTITION_SPLIT);
       if (bsize == BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
-                 pc_tree->leaf_split[0]);
+                 pc_tree->u.leaf_split[0]);
       } else {
         encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
-                  pc_tree->split[0]);
+                  pc_tree->u.split[0]);
         encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[1]);
+                  subsize, pc_tree->u.split[1]);
         encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
-                  subsize, pc_tree->split[2]);
+                  subsize, pc_tree->u.split[2]);
         encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[3]);
+                  subsize, pc_tree->u.split[3]);
       }
       break;
   }
@@ -2645,13 +2645,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
       assert(partition == PARTITION_SPLIT);
       subsize = get_subsize(bsize, PARTITION_SPLIT);
       encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
-                   pc_tree->split[0]);
+                   pc_tree->u.split[0]);
       encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
-                   subsize, pc_tree->split[1]);
+                   subsize, pc_tree->u.split[1]);
       encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
-                   subsize, pc_tree->split[2]);
+                   subsize, pc_tree->u.split[2]);
       encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
-                   output_enabled, subsize, pc_tree->split[3]);
+                   output_enabled, subsize, pc_tree->u.split[3]);
       break;
   }
 
@@ -2801,7 +2801,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
       assert(partition == PARTITION_SPLIT);
       if (bsize == BLOCK_8X8) {
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                         subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX);
+                         subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX);
         break;
       }
       last_part_rdc.rate = 0;
@@ -2819,7 +2819,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
         rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
                          tp, mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist, i != 3,
-                         pc_tree->split[i]);
+                         pc_tree->u.split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp9_rd_cost_reset(&last_part_rdc);
           break;
@@ -2860,9 +2860,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
         continue;
 
       save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      pc_tree->u.split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                       &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
+                       &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none,
                        INT_MAX, INT64_MAX);
 
       restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2877,7 +2877,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
 
       if (i != 3)
         encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                  split_subsize, pc_tree->split[i]);
+                  split_subsize, pc_tree->u.split[i]);
 
       pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
@@ -3391,7 +3391,7 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
       features[feature_index++] = VPXMIN(rd_ratio, 2.0f);
 
       for (i = 0; i < 4; ++i) {
-        const int64_t this_rd = pc_tree->split[i]->none.rdcost;
+        const int64_t this_rd = pc_tree->u.split[i]->none.rdcost;
         const int rd_valid = this_rd > 0 && this_rd < 1000000000;
         // Ratio between sub-block RD and whole block RD.
         features[feature_index++] =
@@ -3958,19 +3958,19 @@ static void store_superblock_info(
   }
   // recursively traverse partition tree when partition is split.
   assert(pc_tree->partitioning == PARTITION_SPLIT);
-  store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride,
+  store_superblock_info(pc_tree->u.split[0], mi_grid_visible, mi_stride,
                         subblock_square_size_4x4, num_unit_rows, num_unit_cols,
                         row_start_4x4, col_start_4x4, partition_info,
                         motion_vector_info);
-  store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride,
+  store_superblock_info(pc_tree->u.split[1], mi_grid_visible, mi_stride,
                         subblock_square_size_4x4, num_unit_rows, num_unit_cols,
                         row_start_4x4, col_start_4x4 + subblock_square_size_4x4,
                         partition_info, motion_vector_info);
-  store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride,
+  store_superblock_info(pc_tree->u.split[2], mi_grid_visible, mi_stride,
                         subblock_square_size_4x4, num_unit_rows, num_unit_cols,
                         row_start_4x4 + subblock_square_size_4x4, col_start_4x4,
                         partition_info, motion_vector_info);
-  store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride,
+  store_superblock_info(pc_tree->u.split[3], mi_grid_visible, mi_stride,
                         subblock_square_size_4x4, num_unit_rows, num_unit_cols,
                         row_start_4x4 + subblock_square_size_4x4,
                         col_start_4x4 + subblock_square_size_4x4,
@@ -4114,7 +4114,7 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       vp9_zero(pc_tree->mv);
     }
     if (bsize > BLOCK_8X8) {  // Store MV result as reference for subblocks.
-      for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv;
+      for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv;
     }
   }
 
@@ -4199,25 +4199,25 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_SPLIT
   // TODO(jingning): use the motion vectors given by the above search as
   // the starting point of motion search in the following partition type check.
-  pc_tree->split[0]->none.rdcost = 0;
-  pc_tree->split[1]->none.rdcost = 0;
-  pc_tree->split[2]->none.rdcost = 0;
-  pc_tree->split[3]->none.rdcost = 0;
+  pc_tree->u.split[0]->none.rdcost = 0;
+  pc_tree->u.split[1]->none.rdcost = 0;
+  pc_tree->u.split[2]->none.rdcost = 0;
+  pc_tree->u.split[3]->none.rdcost = 0;
   if (do_split || must_split) {
     subsize = get_subsize(bsize, PARTITION_SPLIT);
     load_pred_mv(x, ctx);
     if (bsize == BLOCK_8X8) {
       i = 4;
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
-        pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
+        pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                       pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist);
+                       pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist);
       if (sum_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
         if (cpi->sf.prune_ref_frame_for_rect_partitions) {
-          const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0];
-          const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1];
+          const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0];
+          const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1];
           for (i = 0; i < 4; ++i) {
             ref_frames_used[i] |= (1 << ref1);
             if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
@@ -4250,21 +4250,21 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
         if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
           continue;
 
-        pc_tree->split[i]->index = i;
+        pc_tree->u.split[i]->index = i;
         if (cpi->sf.prune_ref_frame_for_rect_partitions)
-          pc_tree->split[i]->none.rate = INT_MAX;
+          pc_tree->u.split[i]->none.rate = INT_MAX;
         found_best_rd = rd_pick_partition(
             cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
-            &this_rdc, best_rdc_split, pc_tree->split[i]);
+            &this_rdc, best_rdc_split, pc_tree->u.split[i]);
 
         if (found_best_rd == 0) {
           sum_rdc.rdcost = INT64_MAX;
           break;
         } else {
           if (cpi->sf.prune_ref_frame_for_rect_partitions &&
-              pc_tree->split[i]->none.rate != INT_MAX) {
-            const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0];
-            const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1];
+              pc_tree->u.split[i]->none.rate != INT_MAX) {
+            const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0];
+            const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1];
             ref_frames_used[i] |= (1 << ref1);
             if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
           }
@@ -4821,13 +4821,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
       }
       break;
     case PARTITION_SPLIT: {
-      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]);
       fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
-                        pc_tree->split[1]);
+                        pc_tree->u.split[1]);
       fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
-                        pc_tree->split[2]);
+                        pc_tree->u.split[2]);
       fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
-                        pc_tree->split[3]);
+                        pc_tree->u.split[3]);
       break;
     }
     default: break;
@@ -4845,7 +4845,8 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   if (bsize > BLOCK_8X8) {
     BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
     int i;
-    for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize);
+    for (i = 0; i < 4; ++i)
+      pred_pixel_ready_reset(pc_tree->u.split[i], subsize);
   }
 }
 
@@ -5046,9 +5047,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
       load_pred_mv(x, ctx);
-      nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                           mi_col + x_idx, subsize, &this_rdc, 0,
-                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+      nonrd_pick_partition(
+          cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+          &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]);
 
       if (this_rdc.rate == INT_MAX) {
         vp9_rd_cost_reset(&sum_rdc);
@@ -5281,10 +5282,10 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
         subsize = get_subsize(bsize, PARTITION_SPLIT);
         nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                                subsize, output_enabled, rd_cost,
-                               pc_tree->split[0]);
+                               pc_tree->u.split[0]);
         nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
                                mi_col + hbs, subsize, output_enabled, &this_rdc,
-                               pc_tree->split[1]);
+                               pc_tree->u.split[1]);
         if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
             rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
           rd_cost->rate += this_rdc.rate;
@@ -5292,7 +5293,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
         }
         nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
                                mi_row + hbs, mi_col, subsize, output_enabled,
-                               &this_rdc, pc_tree->split[2]);
+                               &this_rdc, pc_tree->u.split[2]);
         if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
             rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
           rd_cost->rate += this_rdc.rate;
@@ -5300,7 +5301,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
         }
         nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
                                mi_row + hbs, mi_col + hbs, subsize,
-                               output_enabled, &this_rdc, pc_tree->split[3]);
+                               output_enabled, &this_rdc, pc_tree->u.split[3]);
         if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
             rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
           rd_cost->rate += this_rdc.rate;
@@ -5400,21 +5401,21 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
       subsize = get_subsize(bsize, PARTITION_SPLIT);
       if (bsize == BLOCK_8X8) {
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
-                            subsize, pc_tree->leaf_split[0]);
+                            subsize, pc_tree->u.leaf_split[0]);
         encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
-                    subsize, pc_tree->leaf_split[0]);
+                    subsize, pc_tree->u.leaf_split[0]);
       } else {
         nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize,
-                            output_enabled, dummy_cost, pc_tree->split[0]);
+                            output_enabled, dummy_cost, pc_tree->u.split[0]);
         nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
                             mi_col + hbs, subsize, output_enabled, dummy_cost,
-                            pc_tree->split[1]);
+                            pc_tree->u.split[1]);
         nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
                             mi_row + hbs, mi_col, subsize, output_enabled,
-                            dummy_cost, pc_tree->split[2]);
+                            dummy_cost, pc_tree->u.split[2]);
         nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
                             mi_row + hbs, mi_col + hbs, subsize, output_enabled,
-                            dummy_cost, pc_tree->split[3]);
+                            dummy_cost, pc_tree->u.split[3]);
       }
       break;
   }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
index fd213f1e6b..3b8b5345f1 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -31,12 +31,14 @@
 #include "vpx_ports/system_state.h"
 #include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_util/vpx_pthread.h"
 #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 #include "vpx_util/vpx_debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_idct.h"
 #if CONFIG_VP9_POSTPROC
@@ -2135,24 +2137,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     cpi->external_resize = 1;
   }
 
-  if (cpi->initial_width) {
-    int new_mi_size = 0;
-    vp9_set_mb_mi(cm, cm->width, cm->height);
-    new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
-    if (cm->mi_alloc_size < new_mi_size) {
-      vp9_free_context_buffers(cm);
-      vp9_free_pc_tree(&cpi->td);
-      vpx_free(cpi->mbmi_ext_base);
-      alloc_compressor_data(cpi);
-      realloc_segmentation_maps(cpi);
-      cpi->initial_width = cpi->initial_height = 0;
-      cpi->external_resize = 0;
-    } else if (cm->mi_alloc_size == new_mi_size &&
-               (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
-      if (vp9_alloc_loop_filter(cm)) {
-        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to allocate loop filter data");
-      }
+  int new_mi_size = 0;
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    vp9_free_context_buffers(cm);
+    vp9_free_pc_tree(&cpi->td);
+    vpx_free(cpi->mbmi_ext_base);
+    alloc_compressor_data(cpi);
+    realloc_segmentation_maps(cpi);
+    cpi->initial_width = cpi->initial_height = 0;
+    cpi->external_resize = 0;
+  } else if (cm->mi_alloc_size == new_mi_size &&
+             (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+    if (vp9_alloc_loop_filter(cm)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate loop filter data");
     }
   }
 
@@ -3472,7 +3472,6 @@ void vp9_scale_references(VP9_COMP *cpi) {
         continue;
       }
 
-#if CONFIG_VP9_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         RefCntBuffer *new_fb_ptr = NULL;
         int force_scaling = 0;
@@ -3485,6 +3484,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
         new_fb_ptr = &pool->frame_bufs[new_fb];
         if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
             new_fb_ptr->buf.y_crop_height != cm->height) {
+#if CONFIG_VP9_HIGHBITDEPTH
           if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
                                        cm->subsampling_x, cm->subsampling_y,
                                        cm->use_highbitdepth,
@@ -3494,22 +3494,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                "Failed to allocate frame buffer");
           scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
                                  EIGHTTAP, 0);
-          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
-          alloc_frame_mvs(cm, new_fb);
-        }
 #else
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
-        RefCntBuffer *new_fb_ptr = NULL;
-        int force_scaling = 0;
-        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
-        if (new_fb == INVALID_IDX) {
-          new_fb = get_free_fb(cm);
-          force_scaling = 1;
-        }
-        if (new_fb == INVALID_IDX) return;
-        new_fb_ptr = &pool->frame_bufs[new_fb];
-        if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
-            new_fb_ptr->buf.y_crop_height != cm->height) {
           if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
                                        cm->subsampling_x, cm->subsampling_y,
                                        VP9_ENC_BORDER_IN_PIXELS,
@@ -3517,10 +3502,10 @@ void vp9_scale_references(VP9_COMP *cpi) {
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
       } else {
         int buf_idx;
         RefCntBuffer *buf = NULL;
@@ -3958,6 +3943,35 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
 #endif
 }
 
+static YV12_BUFFER_CONFIG *svc_twostage_scale(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+    int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->bit_depth == VPX_BITS_8) {
+      vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+                                 phase_scaler2);
+      vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+                                 phase_scaler);
+    } else {
+      scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+                             filter_type2, phase_scaler2);
+      scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+                             filter_type, phase_scaler);
+    }
+#else
+    vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+                               phase_scaler2);
+    vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
 static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
                                       uint8_t *dest) {
   VP9_COMMON *const cm = &cpi->common;
@@ -4000,7 +4014,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     // result will be saved in scaled_temp and might be used later.
     const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
     const int phase_scaler2 = svc->downsample_filter_phase[1];
-    cpi->Source = vp9_svc_twostage_scale(
+    cpi->Source = svc_twostage_scale(
         cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
         filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
     svc->scaled_one_half = 1;
@@ -4486,21 +4500,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
   // external rate control model.
   // This flag doesn't have any impact when external rate control is not used.
   int ext_rc_recode = 0;
-  // Maximal frame size allowed by the external rate control.
-  // case: 0, we ignore the max frame size limit, and encode with the qindex
-  // passed in by the external rate control model.
-  // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
-  // and may recode if undershoot/overshoot is seen.
-  // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
-  // case: -1, we take libvpx's decision for the max frame size, as well as
-  // the recode decision.
-  // Otherwise: if a specific size is given, libvpx's recode decision
-  // will respect the given size.
-  int ext_rc_max_frame_size = 0;
-  // Use VP9's decision of qindex. This flag is in use only in external rate
-  // control model to help determine whether to recode when
-  // |ext_rc_max_frame_size| is 0.
-  int ext_rc_use_default_q = 1;
   const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
 
 #if CONFIG_RATE_CTRL
@@ -4616,27 +4615,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
     }
 #endif  // CONFIG_RATE_CTRL
     if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+        !cpi->tpl_with_external_rc &&
         (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
         cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
       vpx_codec_err_t codec_status;
       const GF_GROUP *gf_group = &cpi->twopass.gf_group;
       vpx_rc_encodeframe_decision_t encode_frame_decision;
-      FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
-      const int ref_frame_flags = get_ref_frame_flags(cpi);
-      RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
-      const RefCntBuffer *curr_frame_buf =
-          get_ref_cnt_buffer(cm, cm->new_fb_idx);
-      // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
-      // index 1 refers to the first encoding frame in a gf group.
-      // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
-      // See function define_gf_group_structure().
-      const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
-      get_ref_frame_bufs(cpi, ref_frame_bufs);
       codec_status = vp9_extrc_get_encodeframe_decision(
-          &cpi->ext_ratectrl, curr_frame_buf->frame_index,
-          cm->current_frame_coding_index, gf_group->index, update_type,
-          gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
-          &encode_frame_decision);
+          &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision);
       if (codec_status != VPX_CODEC_OK) {
         vpx_internal_error(&cm->error, codec_status,
                            "vp9_extrc_get_encodeframe_decision() failed");
@@ -4645,9 +4631,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
       // libvpx's default q.
       if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
         q = encode_frame_decision.q_index;
-        ext_rc_use_default_q = 0;
       }
-      ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
     }
 
     vp9_set_quantizer(cpi, q);
@@ -4690,21 +4674,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
 
     if (cpi->ext_ratectrl.ready &&
         (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
-      // In general, for the external rate control, we take the qindex provided
-      // as input and encode the frame with this qindex faithfully. However,
-      // in some extreme scenarios, the provided qindex leads to a massive
-      // overshoot of frame size. In this case, we fall back to VP9's decision
-      // to pick a new qindex and recode the frame. We return the new qindex
-      // through the API to the external model.
-      if (ext_rc_max_frame_size == 0) {
-        if (!ext_rc_use_default_q) break;
-      } else if (ext_rc_max_frame_size == -1) {
-        // Do nothing, fall back to libvpx's recode decision.
-      } else {
-        // Change the max frame size, used in libvpx's recode decision.
-        rc->max_frame_bandwidth = ext_rc_max_frame_size;
-      }
-      ext_rc_recode = 1;
+      break;
     }
 #if CONFIG_RATE_CTRL
     if (cpi->oxcf.use_simple_encode_api) {
@@ -4974,35 +4944,6 @@ static void set_ext_overrides(VP9_COMP *cpi) {
   }
 }
 
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
-    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
-    YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
-    int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
-  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
-      cm->mi_rows * MI_SIZE != unscaled->y_height) {
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (cm->bit_depth == VPX_BITS_8) {
-      vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
-                                 phase_scaler2);
-      vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
-                                 phase_scaler);
-    } else {
-      scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
-                             filter_type2, phase_scaler2);
-      scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
-                             filter_type, phase_scaler);
-    }
-#else
-    vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
-                               phase_scaler2);
-    vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    return scaled;
-  } else {
-    return unscaled;
-  }
-}
-
 YV12_BUFFER_CONFIG *vp9_scale_if_required(
     VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
     int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
@@ -6429,7 +6370,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   if (arf_src_index) {
-    assert(arf_src_index <= rc->frames_to_key);
+    if (!(cpi->ext_ratectrl.ready &&
+          (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+          cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) {
+      // This assert only makes sense when not using external RC.
+      assert(arf_src_index <= rc->frames_to_key);
+    }
     if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
       cpi->alt_ref_source = source;
 
@@ -6617,7 +6563,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
       cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
       cpi->sf.enable_tpl_model) {
     vp9_init_tpl_buffer(cpi);
-    vp9_estimate_qp_gop(cpi);
+    vp9_estimate_tpl_qp_gop(cpi);
     vp9_setup_tpl_stats(cpi);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
index 91df538821..898855d10d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -25,6 +25,7 @@
 #include "vpx_dsp/variance.h"
 #include "vpx_dsp/psnr.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 #include "vpx_util/vpx_timestamp.h"
 
@@ -1062,7 +1063,7 @@ typedef struct VP9_COMP {
    */
   uint64_t frame_component_time[kTimingComponents];
 #endif
-  // Flag to indicate if QP and GOP for TPL is controlled by external RC.
+  // Flag to indicate if QP and GOP for TPL are controlled by external RC.
   int tpl_with_external_rc;
 } VP9_COMP;
 
@@ -1395,11 +1396,6 @@ void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                              YV12_BUFFER_CONFIG *dst);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
-    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
-    YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
-    int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
-
 YV12_BUFFER_CONFIG *vp9_scale_if_required(
     VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
     int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
index a8d1cb7a7a..c3b79507e6 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
@@ -17,6 +17,7 @@
 #include "vp9/encoder/vp9_multi_thread.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_pthread.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   int i, j, k, l, m, n;
@@ -55,7 +56,7 @@ static int enc_worker_hook(void *arg1, void *unused) {
     vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
   }
 
-  return 0;
+  return 1;
 }
 
 static int get_max_tile_cols(VP9_COMP *cpi) {
@@ -106,6 +107,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
 
     ++cpi->num_workers;
     winterface->init(worker);
+    worker->thread_name = "vpx enc worker";
 
     if (i < num_workers - 1) {
       thread_data->cpi = cpi;
@@ -204,8 +206,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
   create_enc_workers(cpi, num_workers);
 
   for (i = 0; i < num_workers; i++) {
-    EncWorkerData *thread_data;
-    thread_data = &cpi->tile_thr_data[i];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -456,7 +457,7 @@ static int first_pass_worker_hook(void *arg1, void *arg2) {
                                         this_tile, &best_ref_mv, mb_row);
     }
   }
-  return 0;
+  return 1;
 }
 
 void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
@@ -543,7 +544,7 @@ static int temporal_filter_worker_hook(void *arg1, void *arg2) {
                                         mb_col_start, mb_col_end);
     }
   }
-  return 0;
+  return 1;
 }
 
 void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
@@ -616,7 +617,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
       vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
     }
   }
-  return 0;
+  return 1;
 }
 
 void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
index 4c192da515..359cdd1290 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
@@ -11,13 +11,14 @@
 #ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_
 #define VPX_VP9_ENCODER_VP9_ETHREAD_H_
 
+#include "vpx_util/vpx_pthread.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define MAX_NUM_TILE_COLS (1 << 6)
 #define MAX_NUM_TILE_ROWS 4
-#define MAX_NUM_THREADS 80
 
 struct VP9_COMP;
 struct ThreadData;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
index 4664e8c5e2..7b0d89acd2 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -156,32 +156,15 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
 }
 
 vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
-    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
-    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
-    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    EXT_RATECTRL *ext_ratectrl, int gop_index,
     vpx_rc_encodeframe_decision_t *encode_frame_decision) {
-  if (ext_ratectrl == NULL) {
-    return VPX_CODEC_INVALID_PARAM;
-  }
-  if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
-    vpx_rc_status_t rc_status;
-    vpx_rc_encodeframe_info_t encode_frame_info;
-    encode_frame_info.show_index = show_index;
-    encode_frame_info.coding_index = coding_index;
-    encode_frame_info.gop_index = gop_index;
-    encode_frame_info.frame_type = extrc_get_frame_type(update_type);
-    encode_frame_info.gop_size = gop_size;
-    encode_frame_info.use_alt_ref = use_alt_ref;
-
-    vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
-                           encode_frame_info.ref_frame_coding_indexes,
-                           encode_frame_info.ref_frame_valid_list);
+  assert(ext_ratectrl != NULL);
+  assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0);
 
-    rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
-        ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
-    if (rc_status == VPX_RC_ERROR) {
-      return VPX_CODEC_ERROR;
-    }
+  vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+      ext_ratectrl->model, gop_index, encode_frame_decision);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
   }
   return VPX_CODEC_OK;
 }
@@ -222,29 +205,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
 }
 
 vpx_codec_err_t vp9_extrc_get_gop_decision(
-    EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
-    vpx_rc_gop_decision_t *gop_decision) {
+    EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) {
   vpx_rc_status_t rc_status;
   if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
       (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
-                                                   gop_info, gop_decision);
-  if (gop_decision->use_alt_ref) {
-    const int arf_constraint =
-        gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
-        gop_decision->gop_coding_frames < gop_info->lag_in_frames;
-    if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
-  }
-  // TODO(chengchen): Take min and max gf interval from the model
-  // and overwrite libvpx's decision so that we can get rid
-  // of one of the checks here.
-  if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
-      gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
-          gop_info->max_gf_interval) {
-    return VPX_CODEC_ERROR;
-  }
+  rc_status =
+      ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision);
   if (rc_status == VPX_RC_ERROR) {
     return VPX_CODEC_ERROR;
   }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
index b04580c1d4..d1be5f2aef 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -39,9 +39,7 @@ vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
                                          const VpxTplGopStats *tpl_gop_stats);
 
 vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
-    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
-    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
-    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    EXT_RATECTRL *ext_ratectrl, int gop_index,
     vpx_rc_encodeframe_decision_t *encode_frame_decision);
 
 vpx_codec_err_t vp9_extrc_update_encodeframe_result(
@@ -50,9 +48,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
     const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
     uint32_t input_bit_depth, const int actual_encoding_qindex);
 
-vpx_codec_err_t vp9_extrc_get_gop_decision(
-    EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
-    vpx_rc_gop_decision_t *gop_decision);
+vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl,
+                                           vpx_rc_gop_decision_t *gop_decision);
 
 vpx_codec_err_t vp9_extrc_get_frame_rdmult(
     EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
index dcb62e8768..69261ac65f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
@@ -162,42 +162,3 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                         dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
                         et_uv, el_uv, eb_uv, er_uv, chroma_step);
 }
-
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst, int srcy,
-                                         int srcx, int srch, int srcw) {
-  // If the side is not touching the bounder then don't extend.
-  const int et_y = srcy ? 0 : dst->border;
-  const int el_y = srcx ? 0 : dst->border;
-  const int eb_y = srcy + srch != src->y_height
-                       ? 0
-                       : dst->border + dst->y_height - src->y_height;
-  const int er_y = srcx + srcw != src->y_width
-                       ? 0
-                       : dst->border + dst->y_width - src->y_width;
-  const int src_y_offset = srcy * src->y_stride + srcx;
-  const int dst_y_offset = srcy * dst->y_stride + srcx;
-
-  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
-  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
-  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
-  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
-  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
-  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
-  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
-  // detect nv12 colorspace
-  const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
-
-  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
-                        dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
-                        et_y, el_y, eb_y, er_y, 1);
-
-  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
-                        dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
-                        srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-
-  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
-                        dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
-                        srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
index 4ba7fc95e3..21d7e68b9f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
@@ -21,9 +21,6 @@ extern "C" {
 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst);
 
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst, int srcy,
-                                         int srcx, int srch, int srcw);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
index a9cdf5353f..58b9b7ba61 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -37,6 +37,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx_dsp/variance.h"
 
 #define OUTPUT_FPF 0
@@ -1164,7 +1165,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
         v_fn_ptr.vf = get_block_variance_fn(bsize);
 #if CONFIG_VP9_HIGHBITDEPTH
         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8);
+          v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         this_motion_error =
@@ -2769,38 +2770,6 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
     }
   }
 #endif
-  // If the external rate control model for GOP is used, the gop decisions
-  // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
-  // will be overwritten.
-  if (cpi->ext_ratectrl.ready &&
-      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
-      cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) {
-    vpx_codec_err_t codec_status;
-    vpx_rc_gop_decision_t gop_decision;
-    vpx_rc_gop_info_t gop_info;
-    gop_info.min_gf_interval = rc->min_gf_interval;
-    gop_info.max_gf_interval = rc->max_gf_interval;
-    gop_info.active_min_gf_interval = active_gf_interval.min;
-    gop_info.active_max_gf_interval = active_gf_interval.max;
-    gop_info.allow_alt_ref = allow_alt_ref;
-    gop_info.is_key_frame = is_key_frame;
-    gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
-    gop_info.frames_since_key = rc->frames_since_key;
-    gop_info.frames_to_key = rc->frames_to_key;
-    gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
-    gop_info.show_index = cm->current_video_frame;
-    gop_info.coding_index = cm->current_frame_coding_index;
-    gop_info.gop_global_index = rc->gop_global_index;
-
-    codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
-                                              &gop_decision);
-    if (codec_status != VPX_CODEC_OK) {
-      vpx_internal_error(&cm->error, codec_status,
-                         "vp9_extrc_get_gop_decision() failed");
-    }
-    gop_coding_frames = gop_decision.gop_coding_frames;
-    use_alt_ref = gop_decision.use_alt_ref;
-  }
 
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -3600,32 +3569,71 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   else
     twopass->fr_content_type = FC_NORMAL;
 
-  // Keyframe and section processing.
-  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
-    // Define next KF group and assign bits to it.
-    find_next_key_frame(cpi, show_idx);
+  // If the external rate control model for GOP is used, the gop decisions
+  // are overwritten, including whether to use key frame in this GF group,
+  // GF group length, and whether to use arf.
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_gop_decision != NULL &&
+      rc->frames_till_gf_update_due == 0) {
+    vpx_codec_err_t codec_status;
+    vpx_rc_gop_decision_t gop_decision;
+    codec_status =
+        vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_get_gop_decision() failed");
+    }
+    if (gop_decision.use_key_frame) {
+      cpi->common.frame_type = KEY_FRAME;
+      rc->frames_since_key = 0;
+      // Clear the alt ref active flag and last group multi arf flags as they
+      // can never be set for a key frame.
+      rc->source_alt_ref_active = 0;
+      // KF is always a GF so clear frames till next gf counter.
+      rc->frames_till_gf_update_due = 0;
+    }
+
+    // A new GF group
+    if (rc->frames_till_gf_update_due == 0) {
+      vp9_zero(twopass->gf_group);
+      ++rc->gop_global_index;
+      if (gop_decision.use_alt_ref) {
+        rc->source_alt_ref_pending = 1;
+      }
+      rc->baseline_gf_interval =
+          gop_decision.gop_coding_frames - rc->source_alt_ref_pending;
+      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+      define_gf_group_structure(cpi);
+    }
   } else {
-    cm->frame_type = INTER_FRAME;
-  }
+    // Keyframe and section processing.
+    if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+      // Define next KF group and assign bits to it.
+      find_next_key_frame(cpi, show_idx);
+    } else {
+      cm->frame_type = INTER_FRAME;
+    }
 
-  // Define a new GF/ARF group. (Should always enter here for key frames).
-  if (rc->frames_till_gf_update_due == 0) {
-    define_gf_group(cpi, show_idx);
+    // Define a new GF/ARF group. (Should always enter here for key frames).
+    if (rc->frames_till_gf_update_due == 0) {
+      define_gf_group(cpi, show_idx);
 
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
 #if ARF_STATS_OUTPUT
-    {
-      FILE *fpfile;
-      fpfile = fopen("arf.stt", "a");
-      ++arf_count;
-      fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
-              cm->current_video_frame, rc->frames_till_gf_update_due,
-              rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
-
-      fclose(fpfile);
-    }
+      {
+        FILE *fpfile;
+        fpfile = fopen("arf.stt", "a");
+        ++arf_count;
+        fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
+                cm->current_video_frame, rc->frames_till_gf_update_due,
+                rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
+
+        fclose(fpfile);
+      }
 #endif
+    }
   }
 
   vp9_configure_buffer_updates(cpi, gf_group->index);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
index 97838c38e6..b6be4f88ac 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
@@ -9,6 +9,7 @@
  */
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "./vpx_config.h"
 
@@ -81,7 +82,6 @@ bail:
   return NULL;
 }
 
-#define USE_PARTIAL_COPY 0
 int vp9_lookahead_full(const struct lookahead_ctx *ctx) {
   return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz;
 }
@@ -94,11 +94,6 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        vpx_enc_frame_flags_t flags) {
   struct lookahead_entry *buf;
-#if USE_PARTIAL_COPY
-  int row, col, active_end;
-  int mb_rows = (src->y_height + 15) >> 4;
-  int mb_cols = (src->y_width + 15) >> 4;
-#endif
   int width = src->y_crop_width;
   int height = src->y_crop_height;
   int uv_width = src->uv_crop_width;
@@ -119,76 +114,36 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                    height != buf->img.y_crop_height ||
                    uv_width != buf->img.uv_crop_width ||
                    uv_height != buf->img.uv_crop_height;
-  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
-                      uv_width > buf->img.uv_width ||
-                      uv_height > buf->img.uv_height;
+  larger_dimensions =
+      width > buf->img.y_crop_width || height > buf->img.y_crop_height ||
+      uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height;
   assert(!larger_dimensions || new_dimensions);
 
-#if USE_PARTIAL_COPY
-  // TODO(jkoleszar): This is disabled for now, as
-  // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
-
-  // Only do this partial copy if the following conditions are all met:
-  // 1. Lookahead queue has has size of 1.
-  // 2. Active map is provided.
-  // 3. This is not a key frame, golden nor altref frame.
-  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
-    for (row = 0; row < mb_rows; ++row) {
-      col = 0;
-
-      while (1) {
-        // Find the first active macroblock in this row.
-        for (; col < mb_cols; ++col) {
-          if (active_map[col]) break;
-        }
-
-        // No more active macroblock in this row.
-        if (col == mb_cols) break;
-
-        // Find the end of active region in this row.
-        active_end = col;
-
-        for (; active_end < mb_cols; ++active_end) {
-          if (!active_map[active_end]) break;
-        }
-
-        // Only copy this active region.
-        vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
-                                            16, (active_end - col) << 4);
-
-        // Start again from the end of this active region.
-        col = active_end;
-      }
-
-      active_map += mb_cols;
-    }
-  } else {
-#endif
-    if (larger_dimensions) {
-      YV12_BUFFER_CONFIG new_img;
-      memset(&new_img, 0, sizeof(new_img));
-      if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
-                                 subsampling_y,
+  if (larger_dimensions) {
+    YV12_BUFFER_CONFIG new_img;
+    memset(&new_img, 0, sizeof(new_img));
+    if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                               subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                 use_highbitdepth,
+                               use_highbitdepth,
 #endif
-                                 VP9_ENC_BORDER_IN_PIXELS, 0))
-        return 1;
-      vpx_free_frame_buffer(&buf->img);
-      buf->img = new_img;
-    } else if (new_dimensions) {
-      buf->img.y_crop_width = src->y_crop_width;
-      buf->img.y_crop_height = src->y_crop_height;
-      buf->img.uv_crop_width = src->uv_crop_width;
-      buf->img.uv_crop_height = src->uv_crop_height;
-      buf->img.subsampling_x = src->subsampling_x;
-      buf->img.subsampling_y = src->subsampling_y;
-    }
-    // Partial copy not implemented yet
-    vp9_copy_and_extend_frame(src, &buf->img);
-#if USE_PARTIAL_COPY
+                               VP9_ENC_BORDER_IN_PIXELS, 0))
+      return 1;
+    vpx_free_frame_buffer(&buf->img);
+    buf->img = new_img;
+  } else if (new_dimensions) {
+    buf->img.y_width = src->y_width;
+    buf->img.y_height = src->y_height;
+    buf->img.uv_width = src->uv_width;
+    buf->img.uv_height = src->uv_height;
+    buf->img.y_crop_width = src->y_crop_width;
+    buf->img.y_crop_height = src->y_crop_height;
+    buf->img.uv_crop_width = src->uv_crop_width;
+    buf->img.uv_crop_height = src->uv_crop_height;
+    buf->img.subsampling_x = src->subsampling_x;
+    buf->img.subsampling_y = src->subsampling_y;
   }
-#endif
+  vp9_copy_and_extend_frame(src, &buf->img);
 
   buf->ts_start = ts_start;
   buf->ts_end = ts_end;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
index 0843cd97e4..6e124f9944 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "vpx_util/vpx_pthread.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_multi_thread.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
index 3f4fe6957b..d37e020b0a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
@@ -12,6 +12,7 @@
 #include <math.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
index 62d6b93028..76d5435e60 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -35,6 +35,7 @@
 #include "vp9/encoder/vp9_ext_ratectrl.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
 
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_ext_ratectrl.h"
@@ -1433,8 +1434,8 @@ static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
   return q;
 }
 
-static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
-                                         int *top_index, int gf_group_index) {
+int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+                                      int *top_index, int gf_group_index) {
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1581,7 +1582,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
         q = active_worst_quality;
     }
   }
-  clamp(q, active_best_quality, active_worst_quality);
 
   *top_index = active_worst_quality;
   *bottom_index = active_best_quality;
@@ -1603,8 +1603,8 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
     else
       q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
   } else {
-    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
-                                      gf_group_index);
+    q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+                                          gf_group_index);
   }
   if (cpi->sf.use_nonrd_pick_mode) {
     if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
@@ -1675,63 +1675,6 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
   }
 }
 
-void vp9_estimate_qp_gop(VP9_COMP *cpi) {
-  int gop_length = cpi->twopass.gf_group.gf_group_size;
-  int bottom_index, top_index;
-  int idx;
-  const int gf_index = cpi->twopass.gf_group.index;
-  const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
-  const int refresh_frame_context = cpi->common.refresh_frame_context;
-
-  for (idx = 1; idx <= gop_length; ++idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
-    int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
-    cpi->twopass.gf_group.index = idx;
-    vp9_rc_set_frame_target(cpi, target_rate);
-    vp9_configure_buffer_updates(cpi, idx);
-    if (cpi->tpl_with_external_rc) {
-      if (cpi->ext_ratectrl.ready &&
-          (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
-          cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
-        VP9_COMMON *cm = &cpi->common;
-        vpx_codec_err_t codec_status;
-        const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-        vpx_rc_encodeframe_decision_t encode_frame_decision;
-        FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
-        RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
-        const RefCntBuffer *curr_frame_buf =
-            get_ref_cnt_buffer(cm, cm->new_fb_idx);
-        // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
-        // index 1 refers to the first encoding frame in a gf group.
-        // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
-        // See function define_gf_group_structure().
-        const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
-        const int frame_coding_index = cm->current_frame_coding_index + idx - 1;
-        get_ref_frame_bufs(cpi, ref_frame_bufs);
-        codec_status = vp9_extrc_get_encodeframe_decision(
-            &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index,
-            gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref,
-            ref_frame_bufs, 0 /*ref_frame_flags is not used*/,
-            &encode_frame_decision);
-        if (codec_status != VPX_CODEC_OK) {
-          vpx_internal_error(&cm->error, codec_status,
-                             "vp9_extrc_get_encodeframe_decision() failed");
-        }
-        tpl_frame->base_qindex = encode_frame_decision.q_index;
-      }
-    } else {
-      tpl_frame->base_qindex =
-          rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx);
-      tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
-    }
-  }
-  // Reset the actual index and frame update
-  cpi->twopass.gf_group.index = gf_index;
-  cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
-  cpi->common.refresh_frame_context = refresh_frame_context;
-  vp9_configure_buffer_updates(cpi, gf_index);
-}
-
 void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit) {
@@ -3361,14 +3304,20 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
       cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
     }
     // For temporal layers, reset the rate control parametes across all
-    // temporal layers. If the first_spatial_layer_to_encode > 0, then this
-    // superframe has skipped lower base layers. So in this case we should also
-    // reset and force max-q for spatial layers < first_spatial_layer_to_encode.
+    // temporal layers.
+    // If the first_spatial_layer_to_encode > 0, then this superframe has
+    // skipped lower base layers. So in this case we should also reset and
+    // force max-q for spatial layers < first_spatial_layer_to_encode.
+    // For the case of no inter-layer prediction on delta frames: reset and
+    // force max-q for all spatial layers, to avoid excessive frame drops.
     if (cpi->use_svc) {
       int tl = 0;
       int sl = 0;
       SVC *svc = &cpi->svc;
-      for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) {
+      int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode);
+      if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON)
+        num_spatial_layers = svc->number_spatial_layers;
+      for (sl = 0; sl < num_spatial_layers; ++sl) {
         for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
           const int layer =
               LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
index 48c49e937e..0c61ad3461 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -346,12 +346,14 @@ int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
 
 void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index);
 
-void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
-
 void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
 
 void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
 
+int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi,
+                                      int *bottom_index, int *top_index,
+                                      int gf_group_index);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
index 974e43c90f..447136ed84 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -1834,7 +1834,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
   return 1;
 }
 
-static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) {
   if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
     int_mv cur_fullpel_mv, prev_fullpel_mv;
     cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
index b8910370e0..048ab8732d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
@@ -18,9 +18,12 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_scan.h"
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_tpl_model.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
 
 static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
                            const GF_GROUP *gf_group, int *tpl_group_frames) {
@@ -407,8 +410,12 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
       tpl_block_stats_ptr->col = mi_col * 8;
       tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
       tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
-      tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
-      tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+      // inter/intra_cost here is calculated with SATD which should be close
+      // enough to be used as inter/intra_pred_error
+      tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost;
+      tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost;
+      tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+      tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
       tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
       tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
       tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
@@ -721,7 +728,9 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
       1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
   tpl_stats->intra_cost = VPXMAX(
       1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
-  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  if (best_rf_idx >= 0) {
+    tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  }
   tpl_stats->mv.as_int = best_mv.as_int;
   *ref_frame_idx = best_rf_idx;
 }
@@ -1489,6 +1498,53 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
 }
 #endif  // CONFIG_RATE_CTRL
 
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) {
+  int gop_length = cpi->twopass.gf_group.gf_group_size;
+  int bottom_index, top_index;
+  int idx;
+  const int gf_index = cpi->twopass.gf_group.index;
+  const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+  const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+  for (idx = 1; idx <= gop_length; ++idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+    int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+    cpi->twopass.gf_group.index = idx;
+    vp9_rc_set_frame_target(cpi, target_rate);
+    vp9_configure_buffer_updates(cpi, idx);
+    if (cpi->tpl_with_external_rc) {
+      VP9_COMMON *cm = &cpi->common;
+      if (cpi->ext_ratectrl.ready &&
+          (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+          cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+        vpx_codec_err_t codec_status;
+        const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+        vpx_rc_encodeframe_decision_t encode_frame_decision;
+        codec_status = vp9_extrc_get_encodeframe_decision(
+            &cpi->ext_ratectrl, gf_group->index - 1, &encode_frame_decision);
+        if (codec_status != VPX_CODEC_OK) {
+          vpx_internal_error(&cm->error, codec_status,
+                             "vp9_extrc_get_encodeframe_decision() failed");
+        }
+        tpl_frame->base_qindex = encode_frame_decision.q_index;
+      } else {
+        vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                           "The external rate control library is not set "
+                           "properly for TPL pass.");
+      }
+    } else {
+      tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass(
+          cpi, &bottom_index, &top_index, idx);
+      tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+    }
+  }
+  // Reset the actual index and frame update
+  cpi->twopass.gf_group.index = gf_index;
+  cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+  cpi->common.refresh_frame_context = refresh_frame_context;
+  vp9_configure_buffer_updates(cpi, gf_index);
+}
+
 void vp9_setup_tpl_stats(VP9_COMP *cpi) {
   GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -1512,12 +1568,16 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) {
     mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
   }
 
-  // TPL stats has extra frames from next GOP. Trim those extra frames for
-  // Qmode.
-  trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count);
-
   if (cpi->ext_ratectrl.ready &&
       cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+    // Intra search on key frame
+    if (gf_picture[0].update_type == KF_UPDATE) {
+      mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize);
+    }
+    // TPL stats has extra frames from next GOP. Trim those extra frames for
+    // Qmode.
+    trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats,
+                   extended_frame_count);
     const vpx_codec_err_t codec_status =
         vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
     if (codec_status != VPX_CODEC_OK) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
index 04beb22610..de0ac39a1f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
@@ -31,6 +31,7 @@ typedef struct GF_PICTURE {
 void vp9_init_tpl_buffer(VP9_COMP *cpi);
 void vp9_setup_tpl_stats(VP9_COMP *cpi);
 void vp9_free_tpl_buffer(VP9_COMP *cpi);
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi);
 
 void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
                       TX_SIZE tx_size);
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 94506aad0f..628dc4fead 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -886,14 +886,14 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
       scale_plane_1_to_2_phase_0(
           src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
           src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
-      scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
-                                 dst->uv_stride, src_w / 2, src_h / 2,
-                                 vp9_filter_kernels[filter_type][8],
-                                 temp_buffer);
-      scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
-                                 dst->uv_stride, src_w / 2, src_h / 2,
-                                 vp9_filter_kernels[filter_type][8],
-                                 temp_buffer);
+      const int src_uv_w = src->uv_crop_width;
+      const int src_uv_h = src->uv_crop_height;
+      scale_plane_1_to_2_phase_0(
+          src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+          src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+      scale_plane_1_to_2_phase_0(
+          src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+          src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
       free(temp_buffer);
     }
   }
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
index fd81bce7b5..942c15ce49 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
@@ -12,10 +12,12 @@
 #include <new>
 
 #include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
 
 namespace libvpx {
 
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
index 85005c5474..4c39255886 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
@@ -12,43 +12,34 @@
 #define VPX_VP9_RATECTRL_RTC_H_
 
 #include <cstdint>
+#include <cstring>
+#include <limits>
 #include <memory>
 
-#include "vp9/common/vp9_enums.h"
-#include "vp9/vp9_iface_common.h"
-#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
-#include "vp9/vp9_cx_iface.h"
+#include "vpx/vpx_encoder.h"
 #include "vpx/internal/vpx_ratectrl_rtc.h"
-#include "vpx_mem/vpx_mem.h"
 
 struct VP9_COMP;
 
 namespace libvpx {
 struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
   VP9RateControlRtcConfig() {
-    ss_number_layers = 1;
-    vp9_zero(max_quantizers);
-    vp9_zero(min_quantizers);
-    vp9_zero(scaling_factor_den);
-    vp9_zero(scaling_factor_num);
-    vp9_zero(layer_target_bitrate);
-    vp9_zero(ts_rate_decimator);
+    memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+    memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator));
     scaling_factor_num[0] = 1;
     scaling_factor_den[0] = 1;
     max_quantizers[0] = max_quantizer;
     min_quantizers[0] = min_quantizer;
-    max_consec_drop = INT_MAX;
   }
 
   // Number of spatial layers
-  int ss_number_layers;
-  int max_quantizers[VPX_MAX_LAYERS];
-  int min_quantizers[VPX_MAX_LAYERS];
-  int scaling_factor_num[VPX_SS_MAX_LAYERS];
-  int scaling_factor_den[VPX_SS_MAX_LAYERS];
+  int ss_number_layers = 1;
+  int max_quantizers[VPX_MAX_LAYERS] = {};
+  int min_quantizers[VPX_MAX_LAYERS] = {};
+  int scaling_factor_num[VPX_SS_MAX_LAYERS] = {};
+  int scaling_factor_den[VPX_SS_MAX_LAYERS] = {};
   // This is only for SVC for now.
-  int max_consec_drop;
+  int max_consec_drop = std::numeric_limits<int>::max();
 };
 
 struct VP9FrameParamsQpRTC {
@@ -105,9 +96,9 @@ class VP9RateControlRTC {
                         const VP9FrameParamsQpRTC &frame_params);
 
  private:
-  VP9RateControlRTC() {}
+  VP9RateControlRTC() = default;
   bool InitRateControl(const VP9RateControlRtcConfig &cfg);
-  struct VP9_COMP *cpi_;
+  struct VP9_COMP *cpi_ = nullptr;
 };
 
 }  // namespace libvpx
diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc
index 2e6f9a4513..5e565d1b1a 100644
--- a/media/libvpx/libvpx/vp9/simple_encode.cc
+++ b/media/libvpx/libvpx/vp9/simple_encode.cc
@@ -8,8 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdio.h>
+#include <stdlib.h>
+
 #include <memory>
 #include <vector>
+
 #include "./ivfenc.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_enums.h"
@@ -888,6 +892,10 @@ void SimpleEncode::ComputeFirstPassStats() {
   use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth;
 #endif
   vpx_image_t img;
+  if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+    fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+    abort();
+  }
   vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1);
   rewind(in_file_);
   impl_ptr_->first_pass_stats.clear();
@@ -1053,6 +1061,10 @@ void SimpleEncode::StartEncode() {
   vp9_set_first_pass_stats(&oxcf, &stats);
   assert(impl_ptr_->cpi == nullptr);
   impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
+  if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+    fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+    abort();
+  }
   vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_,
                 frame_height_, 1);
 
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
index 8df04f29f0..fe62bac5f2 100644
--- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -17,6 +19,7 @@
 #include "vpx_dsp/psnr.h"
 #include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
 #include "vpx_util/vpx_timestamp.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vpx_version.h"
@@ -110,7 +113,6 @@ struct vpx_codec_alg_priv {
   vpx_codec_priv_t base;
   vpx_codec_enc_cfg_t cfg;
   struct vp9_extracfg extra_cfg;
-  vpx_rational64_t timestamp_ratio;
   vpx_codec_pts_t pts_offset;
   unsigned char pts_offset_initialized;
   VP9EncoderConfig oxcf;
@@ -190,7 +192,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
   RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
   RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
-  RANGE_CHECK_HI(cfg, g_threads, 64);
+  RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
   RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
   RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
@@ -1140,10 +1142,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
 
     if (res == VPX_CODEC_OK) {
       priv->pts_offset_initialized = 0;
-      // TODO(angiebird): Replace priv->timestamp_ratio by
-      // oxcf->g_timebase_in_ts
-      priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase);
-
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
 #if CONFIG_VP9_HIGHBITDEPTH
       priv->oxcf.use_highbitdepth =
@@ -1166,9 +1164,9 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
   return VPX_CODEC_OK;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
-                                    unsigned long duration,
-                                    vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+                                               unsigned long duration,
+                                               vpx_enc_deadline_t deadline) {
   MODE new_mode = BEST;
 
 #if CONFIG_REALTIME_ONLY
@@ -1179,13 +1177,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
     case VPX_RC_ONE_PASS:
       if (deadline > 0) {
         // Convert duration parameter from stream timebase to microseconds.
-        uint64_t duration_us;
-
         VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
                           (TICKS_PER_SEC % 1000000) == 0);
 
-        duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
-                      (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+        if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) {
+          ERROR("duration is too big");
+        }
+        uint64_t duration_us = duration *
+                               (uint64_t)ctx->oxcf.g_timebase_in_ts.num /
+                               ((uint64_t)ctx->oxcf.g_timebase_in_ts.den *
+                                (TICKS_PER_SEC / 1000000));
 
         // If the deadline is more that the duration this frame is to be shown,
         // use good quality mode. Otherwise use realtime mode.
@@ -1208,6 +1209,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
     ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
+  return VPX_CODEC_OK;
 }
 
 // Turn on to test if supplemental superframe data breaks decoding
@@ -1281,6 +1283,10 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
                            .is_key_frame))
     flags |= VPX_FRAME_IS_KEY;
 
+  if (!cpi->common.show_frame) {
+    flags |= VPX_FRAME_IS_INVISIBLE;
+  }
+
   if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE;
 
   return flags;
@@ -1318,7 +1324,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
   volatile vpx_enc_frame_flags_t flags = enc_flags;
   volatile vpx_codec_pts_t pts = pts_val;
   VP9_COMP *const cpi = ctx->cpi;
-  const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+  const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts;
   size_t data_sz;
   vpx_codec_cx_pkt_t pkt;
   memset(&pkt, 0, sizeof(pkt));
@@ -1347,13 +1353,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
     }
   }
 
-  if (!ctx->pts_offset_initialized) {
-    ctx->pts_offset = pts;
-    ctx->pts_offset_initialized = 1;
+  res = pick_quickcompress_mode(ctx, duration, deadline);
+  if (res != VPX_CODEC_OK) {
+    return res;
   }
-  pts -= ctx->pts_offset;
-
-  pick_quickcompress_mode(ctx, duration, deadline);
   vpx_codec_pkt_list_init(&ctx->pkt_list);
 
   // Handle Flags
@@ -1384,20 +1387,53 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 
   if (res == VPX_CODEC_OK) {
     unsigned int lib_flags = 0;
-    YV12_BUFFER_CONFIG sd;
-    int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
     size_t size, cx_data_sz;
     unsigned char *cx_data;
 
-    cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1);
-    cpi->svc.time_stamp_superframe = dst_time_stamp;
-
     // Set up internal flags
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
+      YV12_BUFFER_CONFIG sd;
+
+      if (!ctx->pts_offset_initialized) {
+        ctx->pts_offset = pts;
+        ctx->pts_offset_initialized = 1;
+      }
+      if (pts < ctx->pts_offset) {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "pts is smaller than initial pts");
+      }
+      pts -= ctx->pts_offset;
+      if (pts > INT64_MAX / timebase_in_ts->num) {
+        vpx_internal_error(
+            &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts to ticks would overflow");
+      }
+      const int64_t dst_time_stamp =
+          timebase_units_to_ticks(timebase_in_ts, pts);
+
+      cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1);
+      cpi->svc.time_stamp_superframe = dst_time_stamp;
+
+#if ULONG_MAX > INT64_MAX
+      if (duration > INT64_MAX) {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "duration is too big");
+      }
+#endif
+      if (pts > INT64_MAX - (int64_t)duration) {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "relative pts + duration is too big");
+      }
+      vpx_codec_pts_t pts_end = pts + (int64_t)duration;
+      if (pts_end > INT64_MAX / timebase_in_ts->num) {
+        vpx_internal_error(
+            &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts + duration to ticks would overflow");
+      }
       const int64_t dst_end_time_stamp =
-          timebase_units_to_ticks(timestamp_ratio, pts + duration);
+          timebase_units_to_ticks(timebase_in_ts, pts_end);
       res = image2yuvconfig(img, &sd);
 
       if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -1434,7 +1470,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
       if (cx_data_sz < ctx->cx_data_sz / 2) {
         vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR,
                            "Compressed data buffer too small");
-        return VPX_CODEC_ERROR;
       }
     }
 
@@ -1443,6 +1478,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
       // compute first pass stats
       if (img) {
         int ret;
+        int64_t dst_time_stamp;
         int64_t dst_end_time_stamp;
         vpx_codec_cx_pkt_t fps_pkt;
         ENCODE_FRAME_RESULT encode_frame_result;
@@ -1469,6 +1505,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 #endif  // !CONFIG_REALTIME_ONLY
     } else {
       ENCODE_FRAME_RESULT encode_frame_result;
+      int64_t dst_time_stamp;
       int64_t dst_end_time_stamp;
       vp9_init_encode_frame_result(&encode_frame_result);
       while (cx_data_sz >= ctx->cx_data_sz / 2 &&
@@ -1507,10 +1544,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
             if (ctx->output_cx_pkt_cb.output_cx_pkt) {
               pkt.kind = VPX_CODEC_CX_FRAME_PKT;
               pkt.data.frame.pts =
-                  ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+                  ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
                   ctx->pts_offset;
               pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
-                  timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+                  timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
               pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
               pkt.data.frame.buf = ctx->pending_cx_data;
               pkt.data.frame.sz = size;
@@ -1527,10 +1564,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
           // Add the frame packet to the list of returned packets.
           pkt.kind = VPX_CODEC_CX_FRAME_PKT;
           pkt.data.frame.pts =
-              ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+              ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
               ctx->pts_offset;
           pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
-              timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+              timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
           pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
           pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
           pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
@@ -1979,6 +2016,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
     ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
     ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
     ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+    ratectrl_config.base_qp = oxcf->cq_level;
 
     if (oxcf->rc_mode == VPX_VBR) {
       ratectrl_config.rc_mode = VPX_RC_VBR;
@@ -2223,7 +2261,7 @@ static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height,
   return enc_cfg;
 }
 
-static vp9_extracfg get_extra_cfg() {
+static vp9_extracfg get_extra_cfg(void) {
   vp9_extracfg extra_cfg = default_extra_cfg;
   return extra_cfg;
 }
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
index 860f721dc5..7567910b9b 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
@@ -19,7 +19,6 @@
 #include "vpx/vpx_decoder.h"
 #include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_frame_buffers.h"
diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk
index 44790ef6a4..7a0e2d8d1f 100644
--- a/media/libvpx/libvpx/vp9/vp9cx.mk
+++ b/media/libvpx/libvpx/vp9/vp9cx.mk
@@ -140,6 +140,7 @@ endif
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
 
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
+VP9_CX_SRCS-$(HAVE_SVE)  += encoder/arm/neon/vp9_error_sve.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
index 01d64b14b7..2643b5578a 100644
--- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
@@ -22,8 +22,14 @@ enum class FrameDropDecision {
   kDrop,  // Frame is dropped.
 };
 
+struct UVDeltaQP {
+  // For the UV channel: the QP for the dc/ac value is given as
+  // GetQP() + uvdc/ac_delta_q, where the uvdc/ac_delta_q are negative numbers.
+  int uvdc_delta_q;
+  int uvac_delta_q;
+};
+
 struct VpxRateControlRtcConfig {
- public:
   VpxRateControlRtcConfig() {
     width = 1280;
     height = 720;
diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
index 017525aeee..001d854abe 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
@@ -14,6 +14,7 @@
  */
 #include <assert.h>
 #include <limits.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include "vp8/common/blockd.h"
@@ -184,8 +185,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
   while (0)
 
 #else
-static void FLOATING_POINT_INIT() {}
-static void FLOATING_POINT_RESTORE() {}
+static void FLOATING_POINT_INIT(void) {}
+static void FLOATING_POINT_RESTORE(void) {}
 #endif
 
 vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
@@ -200,6 +201,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
     res = VPX_CODEC_ERROR;
   else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
     res = VPX_CODEC_INCAPABLE;
+#if ULONG_MAX > UINT32_MAX
+  else if (duration > UINT32_MAX || deadline > UINT32_MAX)
+    res = VPX_CODEC_INVALID_PARAM;
+#endif
   else {
     unsigned int num_enc = ctx->priv->enc.total_encoders;
 
diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c
index f9f0dd6025..3f7ff74244 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_image.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_image.c
@@ -27,6 +27,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
 
   if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
 
+  if (fmt == VPX_IMG_FMT_NONE) goto fail;
+
   /* Treat align==0 like align==1 */
   if (!buf_align) buf_align = 1;
 
@@ -56,7 +58,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
 
   /* Get chroma shift values for this format */
   // For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at
-  // one time.
+  // once.
   switch (fmt) {
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_YV12:
diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
index 62c2a9c857..b0687a8135 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_tpl.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
@@ -47,8 +47,8 @@ vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
                   "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
                   " %" PRId64 " %d\n",
                   block_stats.inter_cost, block_stats.intra_cost,
-                  block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
-                  block_stats.recrf_rate, block_stats.ref_frame_index));
+                  block_stats.mv_c, block_stats.mv_r, block_stats.srcrf_dist,
+                  block_stats.srcrf_rate, block_stats.ref_frame_index));
     }
   }
 
@@ -88,7 +88,7 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
                  " %" SCNd64 " %d\n",
                  &block_stats->inter_cost, &block_stats->intra_cost,
                  &block_stats->mv_c, &block_stats->mv_r,
-                 &block_stats->recrf_dist, &block_stats->recrf_rate,
+                 &block_stats->srcrf_dist, &block_stats->srcrf_rate,
                  &block_stats->ref_frame_index),
           7);
     }
diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h
index b12938d3d8..dfdbb3c770 100644
--- a/media/libvpx/libvpx/vpx/vp8cx.h
+++ b/media/libvpx/libvpx/vpx/vp8cx.h
@@ -772,6 +772,8 @@ enum vp8e_enc_control_id {
   /*!\brief Codec control to use external RC to control TPL.
    *
    * This will use external RC to control the QP and GOP structure for TPL.
+   * (rc_type & VPX_RC_QP) in vpx_rc_funcs_t must be non zero.
+   * get_encodeframe_decision callback in vpx_rc_funcs_t also needs to be set.
    *
    * Supported in codecs: VP9
    */
diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h
index 18e3862bd7..809a097d94 100644
--- a/media/libvpx/libvpx/vpx/vpx_encoder.h
+++ b/media/libvpx/libvpx/vpx/vpx_encoder.h
@@ -31,7 +31,6 @@ extern "C" {
 
 #include "./vpx_codec.h"  // IWYU pragma: export
 #include "./vpx_ext_ratectrl.h"
-#include "./vpx_tpl.h"
 
 /*! Temporal Scalability: Maximum length of the sequence defining frame
  * layer membership
@@ -57,10 +56,15 @@ extern "C" {
  * must be bumped.  Examples include, but are not limited to, changing
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
+ *
+ * \note
+ * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component
+ * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses
+ * vpx_rc_funcs_t.
  */
-#define VPX_ENCODER_ABI_VERSION                                \
-  (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
-   VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION \
+  (18 + VPX_CODEC_ABI_VERSION + \
+   VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -1074,6 +1078,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
  *     The buffer was set successfully.
  * \retval #VPX_CODEC_INVALID_PARAM
  *     A parameter was NULL, the image format is unsupported, etc.
+ *
+ * \note
+ * `duration` and `deadline` are of the unsigned long type, which can be 32
+ * or 64 bits. `duration` and `deadline` must be less than or equal to
+ * UINT32_MAX so that their ranges are independent of the size of unsigned
+ * long.
  */
 vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
                                           const vpx_fixed_buf_t *buf,
diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
index 46d290dff4..ba12e4f83b 100644
--- a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
@@ -26,7 +26,7 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures.
  */
-#define VPX_EXT_RATECTRL_ABI_VERSION (7)
+#define VPX_EXT_RATECTRL_ABI_VERSION (5 + VPX_TPL_ABI_VERSION)
 
 /*!\brief The control type of the inference API.
  * In VPX_RC_QP mode, the external rate control model determines the
@@ -81,17 +81,10 @@ typedef void *vpx_rc_model_t;
  *
  * The encoder will receive the decision from the external rate control model
  * through get_encodeframe_decision() defined in vpx_rc_funcs_t.
- *
- * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
- *
- * If max_frame_size = 0, the encoding ignores max frame size limit.
- * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
- * If the encoded frame size is larger than max_frame_size, the frame is
- * recoded to meet the size limit, following VP9's recoding principles.
  */
 typedef struct vpx_rc_encodeframe_decision {
-  int q_index;        /**< Quantizer step index [0..255]*/
-  int max_frame_size; /**< Maximal frame size allowed to encode a frame*/
+  int q_index; /**< Quantizer step index [0..255]*/
+  int rdmult;  /**< Frame level Lagrangian multiplier*/
 } vpx_rc_encodeframe_decision_t;
 
 /*!\brief Information for the frame to be encoded.
@@ -322,6 +315,7 @@ typedef struct vpx_rc_config {
   vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */
   int overshoot_percent;     /**< for VBR mode only */
   int undershoot_percent;    /**< for VBR mode only */
+  int base_qp;               /**< base QP for leaf frames, 0-255 */
 } vpx_rc_config_t;
 
 /*!\brief Information passed to the external rate control model to
@@ -400,6 +394,7 @@ typedef struct vpx_rc_gop_info {
 typedef struct vpx_rc_gop_decision {
   int gop_coding_frames; /**< The number of frames of this GOP */
   int use_alt_ref;       /**< Whether to use alt ref for this GOP */
+  int use_key_frame;     /**< Whether to set key frame for this GOP */
 } vpx_rc_gop_decision_t;
 
 /*!\brief Create an external rate control model callback prototype
@@ -446,12 +441,11 @@ typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)(
  * the external rate control model.
  *
  * \param[in]  rate_ctrl_model    rate control model
- * \param[in]  encode_frame_info  information of the coding frame
+ * \param[in]  frame_gop_index    index of the frame in current gop
  * \param[out] frame_decision     encode decision of the coding frame
  */
 typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
-    vpx_rc_model_t rate_ctrl_model,
-    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
     vpx_rc_encodeframe_decision_t *frame_decision);
 
 /*!\brief Update encode frame result callback prototype
@@ -472,12 +466,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
  * the external rate control model.
  *
  * \param[in]  rate_ctrl_model  rate control model
- * \param[in]  gop_info         information collected from the encoder
  * \param[out] gop_decision     GOP decision from the model
  */
 typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
-    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
-    vpx_rc_gop_decision_t *gop_decision);
+    vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision);
 
 /*!\brief Get the frame rdmult from the external rate control model.
  *
diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h
index a250aada60..7e4c9ab7e1 100644
--- a/media/libvpx/libvpx/vpx/vpx_tpl.h
+++ b/media/libvpx/libvpx/vpx/vpx_tpl.h
@@ -32,19 +32,21 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_TPL_ABI_VERSION (3) /**<\hideinitializer*/
 
 /*!\brief Temporal dependency model stats for each block before propagation */
 typedef struct VpxTplBlockStats {
-  int16_t row;         /**< Pixel row of the top left corner */
-  int16_t col;         /**< Pixel col of the top left corner */
-  int64_t intra_cost;  /**< Intra cost */
-  int64_t inter_cost;  /**< Inter cost */
-  int16_t mv_r;        /**< Motion vector row */
-  int16_t mv_c;        /**< Motion vector col */
-  int64_t recrf_rate;  /**< Rate from reconstructed ref frame */
-  int64_t recrf_dist;  /**< Distortion from reconstructed ref frame */
-  int ref_frame_index; /**< Ref frame index in the ref frame buffer */
+  int16_t row;            /**< Pixel row of the top left corner */
+  int16_t col;            /**< Pixel col of the top left corner */
+  int64_t intra_cost;     /**< Intra cost */
+  int64_t inter_cost;     /**< Inter cost */
+  int16_t mv_r;           /**< Motion vector row */
+  int16_t mv_c;           /**< Motion vector col */
+  int64_t srcrf_rate;     /**< Rate from source ref frame */
+  int64_t srcrf_dist;     /**< Distortion from source ref frame */
+  int64_t inter_pred_err; /**< Inter prediction error */
+  int64_t intra_pred_err; /**< Intra prediction error */
+  int ref_frame_index;    /**< Ref frame index in the ref frame buffer */
 } VpxTplBlockStats;
 
 /*!\brief Temporal dependency model stats for each frame before propagation */
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
index 683df5797a..f8b94620d4 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -168,40 +168,40 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
                                                                                \
     if (xoffset == 0) {                                                        \
       if (yoffset == 0) {                                                      \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
       } else if (yoffset == 4) {                                               \
         uint16_t tmp[w * h];                                                   \
         highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
                                       h);                                      \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
       } else {                                                                 \
         uint16_t tmp[w * h];                                                   \
         highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
                                            src_stride, h, yoffset);            \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
       }                                                                        \
     } else if (xoffset == 4) {                                                 \
       uint16_t tmp0[w * (h + 1)];                                              \
       if (yoffset == 0) {                                                      \
         highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
       } else if (yoffset == 4) {                                               \
         uint16_t tmp1[w * (h + 1)];                                            \
         highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                       (h + 1));                                \
         highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       } else {                                                                 \
         uint16_t tmp1[w * (h + 1)];                                            \
         highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                       (h + 1));                                \
         highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       }                                                                        \
     } else {                                                                   \
@@ -209,21 +209,21 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
       if (yoffset == 0) {                                                      \
         highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
                                            xoffset);                           \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
       } else if (yoffset == 4) {                                               \
         uint16_t tmp1[w * h];                                                  \
         highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                            (h + 1), xoffset);                  \
         highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       } else {                                                                 \
         uint16_t tmp1[w * h];                                                  \
         highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                            (h + 1), xoffset);                  \
         highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       }                                                                        \
     }                                                                          \
@@ -430,22 +430,22 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
   } while (--i != 0);
 }
 
-#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                      \
-  uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, uint32_t *sse,                      \
-      const uint8_t *second_pred) {                                           \
-    uint16_t tmp0[w * (h + 1)];                                               \
-    uint16_t tmp1[w * h];                                                     \
-    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
-                                                                              \
-    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
-                                       xoffset);                              \
-    highbd_avg_pred_var_filter_block2d_bil_w##w(                              \
-        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));      \
-                                                                              \
-    return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(                  \
-        CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                   \
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                       \
+  uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
+      const uint8_t *second_pred) {                                            \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
+                                       xoffset);                               \
+    highbd_avg_pred_var_filter_block2d_bil_w##w(                               \
+        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));       \
+                                                                               \
+    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
   }
 
 #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
@@ -460,19 +460,19 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
       if (yoffset == 0) {                                                      \
         highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
                         CONVERT_TO_SHORTPTR(second_pred));                     \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
       } else if (yoffset == 4) {                                               \
         highbd_avg_pred_var_filter_block2d_avg(                                \
             src_ptr, tmp, source_stride, source_stride, w, h,                  \
             CONVERT_TO_SHORTPTR(second_pred));                                 \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
       } else {                                                                 \
         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
             src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
             CONVERT_TO_SHORTPTR(second_pred));                                 \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
       }                                                                        \
     } else if (xoffset == 4) {                                                 \
@@ -481,7 +481,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
         highbd_avg_pred_var_filter_block2d_avg(                                \
             src_ptr, tmp0, source_stride, 1, w, h,                             \
             CONVERT_TO_SHORTPTR(second_pred));                                 \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
       } else if (yoffset == 4) {                                               \
         uint16_t tmp1[w * (h + 1)];                                            \
@@ -489,7 +489,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                                       (h + 1));                                \
         highbd_avg_pred_var_filter_block2d_avg(                                \
             tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       } else {                                                                 \
         uint16_t tmp1[w * (h + 1)];                                            \
@@ -497,7 +497,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                                       (h + 1));                                \
         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
             tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       }                                                                        \
     } else {                                                                   \
@@ -506,7 +506,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
             src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
             CONVERT_TO_SHORTPTR(second_pred));                                 \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
       } else if (yoffset == 4) {                                               \
         uint16_t tmp1[w * h];                                                  \
@@ -514,7 +514,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                                            (h + 1), xoffset);                  \
         highbd_avg_pred_var_filter_block2d_avg(                                \
             tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       } else {                                                                 \
         uint16_t tmp1[w * h];                                                  \
@@ -522,7 +522,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                                            (h + 1), xoffset);                  \
         highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
             tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
-        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
             CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
       }                                                                        \
     }                                                                          \
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..cebe06b099
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+                                          int src_stride,
+                                          const uint16_t *ref_ptr,
+                                          int ref_stride, int w, int h) {
+  uint64x2_t sse = vdupq_n_u64(0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      uint16x8_t diff = vabdq_u16(s, r);
+
+      sse = vpx_dotq_u16(sse, diff, diff);
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  return (uint32_t)horizontal_add_uint64x2(sse);
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h)                                      \
+  uint32_t vpx_highbd_10_mse##w##x##h##_sve(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    uint32_t sse_tmp =                                                \
+        highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h);   \
+    sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4);                         \
+    *sse = sse_tmp;                                                   \
+    return sse_tmp;                                                   \
+  }                                                                   \
+                                                                      \
+  uint32_t vpx_highbd_12_mse##w##x##h##_sve(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    uint32_t sse_tmp =                                                \
+        highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h);   \
+    sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8);                         \
+    *sse = sse_tmp;                                                   \
+    return sse_tmp;                                                   \
+  }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h, uint64_t *sse,
+                                           int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  do {
+    const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+    sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    h -= 2;
+  } while (h != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h, uint64_t *sse,
+                                           int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  do {
+    const uint16x8_t s = vld1q_u16(src_ptr);
+    const uint16x8_t r = vld1q_u16(ref_ptr);
+
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s32 = vpadalq_s16(sum_s32, diff);
+    sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    const uint16x8_t s0 = vld1q_u16(src_ptr);
+    const uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+    const uint16x8_t r0 = vld1q_u16(ref_ptr);
+    const uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+    const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+    const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+
+    sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+    sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+
+    sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+    sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+  sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+
+  *sum = horizontal_add_int32x4(sum_s32[0]);
+  *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                           vdupq_n_s32(0) };
+  int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                           vdupq_n_s64(0) };
+
+  do {
+    int i = 0;
+    do {
+      const uint16x8_t s0 = vld1q_u16(src_ptr + i);
+      const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8);
+      const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16);
+      const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24);
+
+      const uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+      const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+      const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16);
+      const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24);
+
+      const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+      const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+      const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+      const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3));
+
+      sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+      sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+      sum_s32[2] = vpadalq_s16(sum_s32[2], diff2);
+      sum_s32[3] = vpadalq_s16(sum_s32[3], diff3);
+
+      sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+      sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+      sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2);
+      sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3);
+
+      i += 32;
+    } while (i < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+  sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+  sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]);
+
+  sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+  sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+  sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]);
+
+  *sum = horizontal_add_int32x4(sum_s32[0]);
+  *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_SVE(w, h)                                    \
+  uint32_t vpx_highbd_8_variance##w##x##h##_sve(                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)sse_long;                                        \
+    sum = (int)sum_long;                                              \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));         \
+  }                                                                   \
+                                                                      \
+  uint32_t vpx_highbd_10_variance##w##x##h##_sve(                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }                                                                   \
+                                                                      \
+  uint32_t vpx_highbd_12_variance##w##x##h##_sve(                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }
+
+HBD_VARIANCE_WXH_SVE(4, 4)
+HBD_VARIANCE_WXH_SVE(4, 8)
+
+HBD_VARIANCE_WXH_SVE(8, 4)
+HBD_VARIANCE_WXH_SVE(8, 8)
+HBD_VARIANCE_WXH_SVE(8, 16)
+
+HBD_VARIANCE_WXH_SVE(16, 8)
+HBD_VARIANCE_WXH_SVE(16, 16)
+HBD_VARIANCE_WXH_SVE(16, 32)
+
+HBD_VARIANCE_WXH_SVE(32, 16)
+HBD_VARIANCE_WXH_SVE(32, 32)
+HBD_VARIANCE_WXH_SVE(32, 64)
+
+HBD_VARIANCE_WXH_SVE(64, 32)
+HBD_VARIANCE_WXH_SVE(64, 64)
+
+#define HIGHBD_GET_VAR_SVE(s)                                         \
+  void vpx_highbd_8_get##s##x##s##var_sve(                            \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)sse_long;                                        \
+    *sum = (int)sum_long;                                             \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_10_get##s##x##s##var_sve(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                      \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_12_get##s##x##s##var_sve(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                      \
+  }
+
+HIGHBD_GET_VAR_SVE(8)
+HIGHBD_GET_VAR_SVE(16)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 47684473ca..b5a944d299 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -14,86 +14,51 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
-static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
-                            int16x4_t *const s0, int16x4_t *const s1,
-                            int16x4_t *const s2, int16x4_t *const s3) {
-  *s0 = vld1_s16(s);
-  s += p;
-  *s1 = vld1_s16(s);
-  s += p;
-  *s2 = vld1_s16(s);
-  s += p;
-  *s3 = vld1_s16(s);
-}
-
-static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
-                            uint16x8_t *const s0, uint16x8_t *const s1,
-                            uint16x8_t *const s2, uint16x8_t *const s3) {
-  *s0 = vld1q_u16(s);
-  s += p;
-  *s1 = vld1q_u16(s);
-  s += p;
-  *s2 = vld1q_u16(s);
-  s += p;
-  *s3 = vld1q_u16(s);
-}
-
-static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
-                            int16x8_t *const s0, int16x8_t *const s1,
-                            int16x8_t *const s2, int16x8_t *const s3,
-                            int16x8_t *const s4, int16x8_t *const s5,
-                            int16x8_t *const s6, int16x8_t *const s7) {
-  *s0 = vld1q_s16(s);
-  s += p;
-  *s1 = vld1q_s16(s);
-  s += p;
-  *s2 = vld1q_s16(s);
-  s += p;
-  *s3 = vld1q_s16(s);
-  s += p;
-  *s4 = vld1q_s16(s);
-  s += p;
-  *s5 = vld1q_s16(s);
-  s += p;
-  *s6 = vld1q_s16(s);
-  s += p;
-  *s7 = vld1q_s16(s);
+static INLINE uint16x4_t highbd_convolve4_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
+  int32x4_t sum = vmull_lane_s16(s0, filters, 0);
+  sum = vmlal_lane_s16(sum, s1, filters, 1);
+  sum = vmlal_lane_s16(sum, s2, filters, 2);
+  sum = vmlal_lane_s16(sum, s3, filters, 3);
+
+  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+  return vmin_u16(res, max);
 }
 
-static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
-                             const uint16x8_t s0, const uint16x8_t s1,
-                             const uint16x8_t s2, const uint16x8_t s3,
-                             const uint16x8_t s4, const uint16x8_t s5,
-                             const uint16x8_t s6, const uint16x8_t s7) {
-  vst1q_u16(s, s0);
-  s += p;
-  vst1q_u16(s, s1);
-  s += p;
-  vst1q_u16(s, s2);
-  s += p;
-  vst1q_u16(s, s3);
-  s += p;
-  vst1q_u16(s, s4);
-  s += p;
-  vst1q_u16(s, s5);
-  s += p;
-  vst1q_u16(s, s6);
-  s += p;
-  vst1q_u16(s, s7);
+static INLINE uint16x8_t highbd_convolve4_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                                vqrshrun_n_s32(sum1, FILTER_BITS));
+  return vminq_u16(res, max);
 }
 
-static INLINE int32x4_t highbd_convolve8_4(
-    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
-    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
-    const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                   const int16x4_t s6, const int16x4_t s7,
+                   const int16x8_t filters, const uint16x4_t max) {
   const int16x4_t filters_lo = vget_low_s16(filters);
   const int16x4_t filters_hi = vget_high_s16(filters);
-  int32x4_t sum;
 
-  sum = vmull_lane_s16(s0, filters_lo, 0);
+  int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
   sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
   sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
   sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
@@ -101,7 +66,9 @@ static INLINE int32x4_t highbd_convolve8_4(
   sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
   sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
   sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
-  return sum;
+
+  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+  return vmin_u16(res, max);
 }
 
 static INLINE uint16x8_t
@@ -111,10 +78,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                    const int16x8_t filters, const uint16x8_t max) {
   const int16x4_t filters_lo = vget_low_s16(filters);
   const int16x4_t filters_hi = vget_high_s16(filters);
-  int32x4_t sum0, sum1;
-  uint16x8_t d;
 
-  sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
@@ -122,7 +87,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
-  sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
@@ -130,9 +96,152 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
-  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
-  d = vminq_u16(d, max);
-  return d;
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                                vqrshrun_n_s32(sum1, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve_4tap_horiz_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 =
+          highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max);
+      uint16x4_t d1 =
+          highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max);
+      uint16x4_t d2 =
+          highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max);
+      uint16x4_t d3 =
+          highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[4], s1[4], s2[4], s3[4];
+        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+        uint16x8_t d0 =
+            highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max);
+        uint16x8_t d1 =
+            highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max);
+        uint16x8_t d2 =
+            highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max);
+        uint16x8_t d3 =
+            highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[8], s1[8], s2[8], s3[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5], &s0[6], &s0[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5], &s1[6], &s1[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5], &s2[6], &s2[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5], &s3[6], &s3[7]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], s0[6], s0[7], filter, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                         s1[5], s1[6], s1[7], filter, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                         s2[5], s2[6], s2[7], filter, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                         s3[5], s3[6], s3[7], filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                           s0[5], s0[6], s0[7], filter, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                           s1[5], s1[6], s1[7], filter, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                           s2[5], s2[6], s2[7], filter, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                           s3[5], s3[6], s3[7], filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
 }
 
 void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
@@ -143,202 +252,25 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
   if (x_step_q4 != 16) {
     vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
-  } else {
-    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
-    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-    uint16x8_t t0, t1, t2, t3;
-
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3;
-
-    if (h == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23;
-
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u16_8x4(&t0, &t1, &t2, &t3);
-      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
-      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
-      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
-      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
-      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
-      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
-      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      src += 7;
-
-      do {
-        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
-        transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
-        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+    return;
+  }
 
-        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        d01 = vminq_u16(d01, max);
-        d23 = vminq_u16(d23, max);
-        transpose_u16_4x4q(&d01, &d23);
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
 
-        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
-        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
-        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        src += 4;
-        dst += 4;
-        w -= 4;
-      } while (w > 0);
-    } else {
-      int16x8_t t4, t5, t6, t7;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3;
-
-      if (w == 4) {
-        do {
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
-                   &t4, &t5, &t6, &t7);
-          src += 8 * src_stride;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          d3 =
-              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
-          transpose_u16_8x4(&d0, &d1, &d2, &d3);
-          vst1_u16(dst, vget_low_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d3));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d3));
-          dst += dst_stride;
-          h -= 8;
-        } while (h > 0);
-      } else {
-        int width;
-        const uint16_t *s;
-        uint16_t *d;
-        int16x8_t s11, s12, s13, s14;
-        uint16x8_t d4, d5, d6, d7;
-
-        do {
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          width = w;
-          s = src + 7;
-          d = dst;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-
-          do {
-            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
-                     &s12, &s13, &s14);
-            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
-            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
-                                    max);
-            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
-                                    max);
-            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
-                                    max);
-            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
-                                    max);
-            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
-                                    max);
-            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
-                                    max);
-            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
-                                    max);
-            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
-                                    filters, max);
-
-            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-            s0 = s8;
-            s1 = s9;
-            s2 = s10;
-            s3 = s11;
-            s4 = s12;
-            s5 = s13;
-            s6 = s14;
-            s += 8;
-            d += 8;
-            width -= 8;
-          } while (width > 0);
-          src += 8 * src_stride;
-          dst += 8 * dst_stride;
-          h -= 8;
-        } while (h > 0);
-      }
-    }
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+    highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+                                    x_filter_4tap, bd);
+  } else {
+    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+    highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+                                    x_filter_8tap, bd);
   }
 }
 
@@ -352,66 +284,233 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
     vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                      bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[8], s1[8], s2[8], s3[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5], &s0[6], &s0[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5], &s1[6], &s1[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5], &s2[6], &s2[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5], &s3[6], &s3[7]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], s0[6], s0[7], filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                         s1[5], s1[6], s1[7], filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                         s2[5], s2[6], s2[7], filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                         s3[5], s3[6], s3[7], filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                           s0[5], s0[6], s0[7], filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                           s1[5], s1[6], s1[7], filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                           s2[5], s2[6], s2[7], filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                           s3[5], s3[6], s3[7], filters, max);
+
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_4tap_vert_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2;
+    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
+
+    s += 3 * src_stride;
+
+    do {
+      int16x4_t s3, s4, s5, s6;
+      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+      uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max);
+      uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max);
+      uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max);
+      uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   } else {
-    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
 
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3;
-
-    if (h == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t t0, t1, t2, t3;
-      uint16x8_t d01, d23, t01, t23;
-
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u16_8x4(&t0, &t1, &t2, &t3);
-      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
-      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
-      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
-      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
-      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
-      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
-      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      src += 7;
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2;
+      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
+
+      s += 3 * src_stride;
 
       do {
-        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
-        transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
-        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
-        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        t01 = vminq_u16(t01, max);
-        t23 = vminq_u16(t23, max);
-        transpose_u16_4x4q(&t01, &t23);
-
-        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
-                           vld1_u16(dst + 2 * dst_stride));
-        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
-                           vld1_u16(dst + 3 * dst_stride));
-        d01 = vrhaddq_u16(d01, t01);
-        d23 = vrhaddq_u16(d23, t23);
-
-        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
-        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
-        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+        int16x8_t s3, s4, s5, s6;
+        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+        uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max);
+        uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max);
+        uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max);
+        uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_convolve_8tap_vert_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 =
+          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+      uint16x4_t d1 =
+          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+      uint16x4_t d2 =
+          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+      uint16x4_t d3 =
+          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+        uint16x8_t d1 =
+            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+        uint16x8_t d2 =
+            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+        uint16x8_t d3 =
+            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -420,164 +519,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
         s4 = s8;
         s5 = s9;
         s6 = s10;
-        src += 4;
-        dst += 4;
-        w -= 4;
-      } while (w > 0);
-    } else {
-      int16x8_t t4, t5, t6, t7;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
-
-      if (w == 4) {
-        do {
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
-                   &t4, &t5, &t6, &t7);
-          src += 8 * src_stride;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          t3 =
-              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-          transpose_u16_8x4(&t0, &t1, &t2, &t3);
-
-          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
-                            vld1_u16(dst + 4 * dst_stride));
-          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
-                            vld1_u16(dst + 5 * dst_stride));
-          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
-                            vld1_u16(dst + 6 * dst_stride));
-          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
-                            vld1_u16(dst + 7 * dst_stride));
-          d0 = vrhaddq_u16(d0, t0);
-          d1 = vrhaddq_u16(d1, t1);
-          d2 = vrhaddq_u16(d2, t2);
-          d3 = vrhaddq_u16(d3, t3);
-
-          vst1_u16(dst, vget_low_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d3));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d3));
-          dst += dst_stride;
-          h -= 8;
-        } while (h > 0);
-      } else {
-        int width;
-        const uint16_t *s;
-        uint16_t *d;
-        int16x8_t s11, s12, s13, s14;
-        uint16x8_t d4, d5, d6, d7;
-
-        do {
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          width = w;
-          s = src + 7;
-          d = dst;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-
-          do {
-            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
-                     &s12, &s13, &s14);
-            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
-            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
-                                    max);
-            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
-                                    max);
-            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
-                                    max);
-            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
-                                    max);
-            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
-                                    max);
-            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
-                                    max);
-            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
-                                    max);
-            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
-                                    filters, max);
-
-            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
-            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
-            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
-            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
-            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
-            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
-            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
-            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
-            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
-
-            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-            s0 = s8;
-            s1 = s9;
-            s2 = s10;
-            s3 = s11;
-            s4 = s12;
-            s5 = s13;
-            s6 = s14;
-            s += 8;
-            d += 8;
-            width -= 8;
-          } while (width > 0);
-          src += 8 * src_stride;
-          dst += 8 * dst_stride;
-          h -= 8;
-        } while (h > 0);
-      }
-    }
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
 
@@ -589,160 +538,25 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
   if (y_step_q4 != 16) {
     vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                 x_step_q4, y0_q4, y_step_q4, w, h, bd);
-  } else {
-    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
-    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3 * src_stride;
-
-    if (w == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23;
-
-      s0 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s1 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s2 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s3 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s4 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s5 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s6 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
+    return;
+  }
 
-      do {
-        s7 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s8 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s9 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s10 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
-        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        d01 = vminq_u16(d01, max);
-        d23 = vminq_u16(d23, max);
-        vst1_u16(dst, vget_low_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_low_u16(d23));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d23));
-        dst += dst_stride;
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
 
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        h -= 4;
-      } while (h > 0);
-    } else {
-      int height;
-      const uint16_t *s;
-      uint16_t *d;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
-      do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        s = src;
-        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        d = dst;
-        height = h;
-
-        do {
-          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-
-          __builtin_prefetch(d + 0 * dst_stride);
-          __builtin_prefetch(d + 1 * dst_stride);
-          __builtin_prefetch(d + 2 * dst_stride);
-          __builtin_prefetch(d + 3 * dst_stride);
-          __builtin_prefetch(s + 0 * src_stride);
-          __builtin_prefetch(s + 1 * src_stride);
-          __builtin_prefetch(s + 2 * src_stride);
-          __builtin_prefetch(s + 3 * src_stride);
-          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          d3 =
-              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
-          vst1q_u16(d, d0);
-          d += dst_stride;
-          vst1q_u16(d, d1);
-          d += dst_stride;
-          vst1q_u16(d, d2);
-          d += dst_stride;
-          vst1q_u16(d, d3);
-          d += dst_stride;
-
-          s0 = s4;
-          s1 = s5;
-          s2 = s6;
-          s3 = s7;
-          s4 = s8;
-          s5 = s9;
-          s6 = s10;
-          height -= 4;
-        } while (height > 0);
-        src += 8;
-        dst += 8;
-        w -= 8;
-      } while (w > 0);
-    }
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
+    highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
+                                   dst_stride, w, h, y_filter_4tap, bd);
+  } else {
+    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+    highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
+                                   dst_stride, w, h, y_filter_8tap, bd);
   }
 }
 
@@ -756,78 +570,89 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
     vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 =
+          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+      uint16x4_t d1 =
+          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+      uint16x4_t d2 =
+          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+      uint16x4_t d3 =
+          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   } else {
-    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
 
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3 * src_stride;
-
-    if (w == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23, t01, t23;
-
-      s0 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s1 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s2 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s3 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s4 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s5 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s6 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+      s += 7 * src_stride;
 
       do {
-        s7 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s8 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s9 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s10 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
-        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        t01 = vminq_u16(t01, max);
-        t23 = vminq_u16(t23, max);
-
-        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
-                           vld1_u16(dst + 1 * dst_stride));
-        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
-                           vld1_u16(dst + 3 * dst_stride));
-        d01 = vrhaddq_u16(d01, t01);
-        d23 = vrhaddq_u16(d23, t23);
-
-        vst1_u16(dst, vget_low_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_low_u16(d23));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d23));
-        dst += dst_stride;
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+        uint16x8_t d1 =
+            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+        uint16x8_t d2 =
+            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+        uint16x8_t d3 =
+            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -836,96 +661,592 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
         s4 = s8;
         s5 = s9;
         s6 = s10;
-        h -= 4;
-      } while (h > 0);
-    } else {
-      int height;
-      const uint16_t *s;
-      uint16_t *d;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
 
-      do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        s = src;
-        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        d = dst;
-        height = h;
-
-        do {
-          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-
-          __builtin_prefetch(d + 0 * dst_stride);
-          __builtin_prefetch(d + 1 * dst_stride);
-          __builtin_prefetch(d + 2 * dst_stride);
-          __builtin_prefetch(d + 3 * dst_stride);
-          __builtin_prefetch(s + 0 * src_stride);
-          __builtin_prefetch(s + 1 * src_stride);
-          __builtin_prefetch(s + 2 * src_stride);
-          __builtin_prefetch(s + 3 * src_stride);
-          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          t3 =
-              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
-          d0 = vld1q_u16(d + 0 * dst_stride);
-          d1 = vld1q_u16(d + 1 * dst_stride);
-          d2 = vld1q_u16(d + 2 * dst_stride);
-          d3 = vld1q_u16(d + 3 * dst_stride);
-          d0 = vrhaddq_u16(d0, t0);
-          d1 = vrhaddq_u16(d1, t1);
-          d2 = vrhaddq_u16(d2, t2);
-          d3 = vrhaddq_u16(d3, t3);
-
-          vst1q_u16(d, d0);
-          d += dst_stride;
-          vst1q_u16(d, d1);
-          d += dst_stride;
-          vst1q_u16(d, d2);
-          d += dst_stride;
-          vst1q_u16(d, d3);
-          d += dst_stride;
-
-          s0 = s4;
-          s1 = s5;
-          s2 = s6;
-          s3 = s7;
-          s4 = s8;
-          s5 = s9;
-          s6 = s10;
-          height -= 4;
-        } while (height > 0);
-        src += 8;
-        dst += 8;
-        w -= 8;
-      } while (w > 0);
-    }
+static INLINE void highbd_convolve_2d_4tap_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+    const int16x4_t y_filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[4], h_s1[4], h_s2[4];
+    load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+    load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+    load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+    int16x4_t v_s0 = vreinterpret_s16_u16(
+        highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+    int16x4_t v_s1 = vreinterpret_s16_u16(
+        highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+    int16x4_t v_s2 = vreinterpret_s16_u16(
+        highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+    s += 3 * src_stride;
+
+    do {
+      int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+                   &h_s3[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+                   &h_s4[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+                   &h_s5[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+                   &h_s6[3]);
+
+      int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4(
+          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+      int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4(
+          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+      int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4(
+          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+      int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4(
+          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+      uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+      uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+      uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+      uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+    return;
+  }
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t h_s0[4], h_s1[4], h_s2[4];
+    load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+    load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+    load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+    int16x8_t v_s0 = vreinterpretq_s16_u16(
+        highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+    int16x8_t v_s1 = vreinterpretq_s16_u16(
+        highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+    int16x8_t v_s2 = vreinterpretq_s16_u16(
+        highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+    s += 3 * src_stride;
+
+    do {
+      int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+                   &h_s3[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+                   &h_s4[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+                   &h_s5[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+                   &h_s6[3]);
+
+      int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8(
+          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+      int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8(
+          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+      int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8(
+          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+      int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8(
+          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+      uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+      uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+      uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+      uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+static INLINE void highbd_convolve_2d_8tap_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
+    const int16x8_t y_filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x4_t v_s0 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x4_t v_s1 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x4_t v_s2 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x4_t v_s3 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x4_t v_s4 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x4_t v_s5 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x4_t v_s6 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x4_t v_s7 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x4_t v_s8 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x4_t v_s9 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x4_t v_s10 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+    return;
+  }
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x8_t v_s0 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x8_t v_s1 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x8_t v_s2 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x8_t v_s3 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x8_t v_s4 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x8_t v_s5 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x8_t v_s6 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x8_t v_s7 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x8_t v_s8 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x8_t v_s9 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x8_t v_s10 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+                               uint16_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h, int bd) {
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+  // lines post both horizontally and vertically.
+  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+  if (x_filter_taps == 4 && y_filter_taps == 4) {
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
+
+    highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
+                                 dst, dst_stride, w, h, x_filter, y_filter, bd);
+    return;
+  }
+
+  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+  highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
+                               dst, dst_stride, w, h, x_filter, y_filter, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h, int bd) {
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
   }
+
+  // Averaging convolution always uses an 8-tap filter.
+  const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
+  const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
+  // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
+  // lines post both horizontally and vertically.
+  src = src - horiz_offset - vert_offset;
+
+  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x4_t v_s0 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x4_t v_s1 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x4_t v_s2 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x4_t v_s3 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x4_t v_s4 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x4_t v_s5 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x4_t v_s6 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x4_t v_s7 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x4_t v_s8 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x4_t v_s9 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x4_t v_s10 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+    return;
+  }
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x8_t v_s0 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x8_t v_s1 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x8_t v_s2 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x8_t v_s3 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x8_t v_s4 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x8_t v_s5 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x8_t v_s6 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x8_t v_s7 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x8_t v_s8 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x8_t v_s9 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x8_t v_s10 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+      d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+      d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+      d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
new file mode 100644
index 0000000000..7fc0a57c90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
+                                                               1, 3, 5, 7 };
+
+static INLINE uint16x4_t highbd_convolve4_4(const int16x4_t s[4],
+                                            const int16x8_t filter,
+                                            const uint16x4_t max) {
+  int16x8_t s01 = vcombine_s16(s[0], s[1]);
+  int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0);
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0);
+
+  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+  uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+  return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve4_8(const int16x8_t s[4],
+                                            const int16x8_t filter,
+                                            const uint16x8_t max,
+                                            uint16x8_t idx) {
+  int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
+  int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
+  int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
+  int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
+
+  int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+                                vqrshrun_n_s32(res1, FILTER_BITS));
+
+  res = vpx_tbl_u16(res, idx);
+
+  return vminq_u16(res, max);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4],
+                                            const int16x8_t filter,
+                                            const uint16x4_t max) {
+  int64x2_t sum[4];
+
+  sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+  sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+  sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+  sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+
+  sum[0] = vpaddq_s64(sum[0], sum[1]);
+  sum[2] = vpaddq_s64(sum[2], sum[3]);
+
+  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
+
+  uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+  return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8],
+                                            const int16x8_t filter,
+                                            const uint16x8_t max) {
+  int64x2_t sum[8];
+
+  sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+  sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+  sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+  sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+  sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter);
+  sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter);
+  sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter);
+  sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter);
+
+  int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
+  int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
+  int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
+  int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);
+
+  int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+                                vqrshrun_n_s32(res1, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve_4tap_horiz_sve(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) {
+  const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0));
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve4_4(s0, filter, max);
+      uint16x4_t d1 = highbd_convolve4_4(s1, filter, max);
+      uint16x4_t d2 = highbd_convolve4_4(s2, filter, max);
+      uint16x4_t d3 = highbd_convolve4_4(s3, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+    const uint16x8_t idx = vld1q_u16(kTblConv4_8);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[4], s1[4], s2[4], s3[4];
+        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+        uint16x8_t d0 = highbd_convolve4_8(s0, filter, max, idx);
+        uint16x8_t d1 = highbd_convolve4_8(s1, filter, max, idx);
+        uint16x8_t d2 = highbd_convolve4_8(s2, filter, max, idx);
+        uint16x8_t d3 = highbd_convolve4_8(s3, filter, max, idx);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_sve(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x8_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride,
+                                    uint16_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+    highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h,
+                                   x_filter_4tap, bd);
+  } else {
+    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+    highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h,
+                                   x_filter_8tap, bd);
+  }
+}
+
+void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src,
+                                        ptrdiff_t src_stride, uint16_t *dst,
+                                        ptrdiff_t dst_stride,
+                                        const InterpKernel *filter, int x0_q4,
+                                        int x_step_q4, int y0_q4, int y_step_q4,
+                                        int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                     bd);
+    return;
+  }
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x8_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
new file mode 100644
index 0000000000..4ed7718f7d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
@@ -0,0 +1,452 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
+
+// clang-format off
+DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
+  // Shift left and insert new last column in transposed 4x4 block.
+  1, 2, 3, 0, 5, 6, 7, 4,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2, 3, 0, 1, 6, 7, 4, 5,
+  // Shift left and insert three new columns in transposed 4x4 block.
+  3, 0, 1, 2, 7, 4, 5, 6,
+};
+// clang-format on
+
+static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1,
+                                        const int16x4_t s2, const int16x4_t s3,
+                                        int16x8_t res[2]) {
+  // Transpose 16-bit elements:
+  // s0: 00, 01, 02, 03
+  // s1: 10, 11, 12, 13
+  // s2: 20, 21, 22, 23
+  // s3: 30, 31, 32, 33
+  //
+  // res[0]: 00 10 20 30 01 11 21 31
+  // res[1]: 02 12 22 32 03 13 23 33
+
+  int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
+  int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
+  int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
+  int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
+
+  int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
+  int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
+
+  int32x4x2_t t0123 = vzipq_s32(s01, s23);
+
+  res[0] = vreinterpretq_s16_s32(t0123.val[0]);
+  res[1] = vreinterpretq_s16_s32(t0123.val[1]);
+}
+
+static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1,
+                                        const int16x8_t s2, const int16x8_t s3,
+                                        int16x8_t res[4]) {
+  // Transpose 16-bit elements:
+  // s0: 00, 01, 02, 03, 04, 05, 06, 07
+  // s1: 10, 11, 12, 13, 14, 15, 16, 17
+  // s2: 20, 21, 22, 23, 24, 25, 26, 27
+  // s3: 30, 31, 32, 33, 34, 35, 36, 37
+  //
+  // res[0]: 00 10 20 30 01 11 21 31
+  // res[1]: 02 12 22 32 03 13 23 33
+  // res[2]: 04 14 24 34 05 15 25 35
+  // res[3]: 06 16 26 36 07 17 27 37
+
+  int16x8x2_t s01 = vzipq_s16(s0, s1);
+  int16x8x2_t s23 = vzipq_s16(s2, s3);
+
+  int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]),
+                                   vreinterpretq_s32_s16(s23.val[0]));
+  int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]),
+                                   vreinterpretq_s32_s16(s23.val[1]));
+
+  res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]);
+  res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]);
+  res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]);
+  res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]);
+}
+
+static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4],
+                                  int16x8_t res[4], uint16x8_t idx) {
+  res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+  res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+  res[2] = vpx_tbl2_s16(s0[2], s1[2], idx);
+  res[3] = vpx_tbl2_s16(s0[3], s1[3], idx);
+}
+
+static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2],
+                                  int16x8_t res[2], uint16x8_t idx) {
+  res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+  res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2],
+                                              int16x8_t s_hi[2],
+                                              int16x8_t filter,
+                                              uint16x4_t max) {
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+  sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+  sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
+  return vmin_u16(res, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4],
+                                              const int16x8_t s_hi[4],
+                                              const int16x8_t filter,
+                                              const uint16x8_t max) {
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+  sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+  sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+  int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0);
+  sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1);
+
+  int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0);
+  sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1);
+
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
+                                vqrshrun_n_s32(sum4567, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve8_8tap_vert_sve2(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+  assert(w >= 4 && h >= 4);
+  uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+  // Correct indices by the size of vector length.
+  merge_tbl_idx.val[0] = vaddq_u16(
+      merge_tbl_idx.val[0],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+  merge_tbl_idx.val[1] = vaddq_u16(
+      merge_tbl_idx.val[1],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+  merge_tbl_idx.val[2] = vaddq_u16(
+      merge_tbl_idx.val[2],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+    transpose_concat_4x4(s0, s1, s2, s3, s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+    do {
+      int16x4_t s7, s8, s9, sA;
+
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+      transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+      vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+      vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+      vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max);
+      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max);
+      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max);
+      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0123[0] = s4567[0];
+      s0123[1] = s4567[1];
+      s1234[0] = s5678[0];
+      s1234[1] = s5678[1];
+      s2345[0] = s6789[0];
+      s2345[1] = s6789[1];
+      s3456[0] = s789A[0];
+      s3456[1] = s789A[1];
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+      transpose_concat_8x4(s0, s1, s2, s3, s0123);
+      transpose_concat_8x4(s1, s2, s3, s4, s1234);
+      transpose_concat_8x4(s2, s3, s4, s5, s2345);
+      transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+      do {
+        int16x8_t s7, s8, s9, sA;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+        int16x8_t s4567[4], s5678[5], s6789[4], s789A[4];
+        transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+        vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+        vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+        vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max);
+        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filter, max);
+        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max);
+        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0123[0] = s4567[0];
+        s0123[1] = s4567[1];
+        s0123[2] = s4567[2];
+        s0123[3] = s4567[3];
+        s1234[0] = s5678[0];
+        s1234[1] = s5678[1];
+        s1234[2] = s5678[2];
+        s1234[3] = s5678[3];
+        s2345[0] = s6789[0];
+        s2345[1] = s6789[1];
+        s2345[2] = s6789[2];
+        s2345[3] = s6789[3];
+        s3456[0] = s789A[0];
+        s3456[1] = s789A[1];
+        s3456[2] = s789A[2];
+        s3456[3] = s789A[3];
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                                    uint16_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter,
+                                   x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                   bd);
+  } else {
+    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+    highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst,
+                                    dst_stride, w, h, y_filter_8tap, bd);
+  }
+}
+
+void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src,
+                                        ptrdiff_t src_stride, uint16_t *dst,
+                                        ptrdiff_t dst_stride,
+                                        const InterpKernel *filter, int x0_q4,
+                                        int x_step_q4, int y0_q4, int y_step_q4,
+                                        int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                    bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+  src -= 3 * src_stride;
+
+  uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+  // Correct indices by the size of vector length.
+  merge_tbl_idx.val[0] = vaddq_u16(
+      merge_tbl_idx.val[0],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+  merge_tbl_idx.val[1] = vaddq_u16(
+      merge_tbl_idx.val[1],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+  merge_tbl_idx.val[2] = vaddq_u16(
+      merge_tbl_idx.val[2],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+    transpose_concat_4x4(s0, s1, s2, s3, s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+    do {
+      int16x4_t s7, s8, s9, sA;
+
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+      transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+      vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+      vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+      vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0123[0] = s4567[0];
+      s0123[1] = s4567[1];
+      s1234[0] = s5678[0];
+      s1234[1] = s5678[1];
+      s2345[0] = s6789[0];
+      s2345[1] = s6789[1];
+      s3456[0] = s789A[0];
+      s3456[1] = s789A[1];
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+      transpose_concat_8x4(s0, s1, s2, s3, s0123);
+      transpose_concat_8x4(s1, s2, s3, s4, s1234);
+      transpose_concat_8x4(s2, s3, s4, s5, s2345);
+      transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+      do {
+        int16x8_t s7, s8, s9, sA;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+        int16x8_t s4567[4], s5678[5], s6789[4], s789A[4];
+        transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+        vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+        vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+        vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max);
+
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0123[0] = s4567[0];
+        s0123[1] = s4567[1];
+        s0123[2] = s4567[2];
+        s0123[3] = s4567[3];
+        s1234[0] = s5678[0];
+        s1234[1] = s5678[1];
+        s1234[2] = s5678[2];
+        s1234[3] = s5678[3];
+        s2345[0] = s6789[0];
+        s2345[1] = s6789[1];
+        s2345[2] = s6789[2];
+        s2345[3] = s6789[3];
+        s3456[0] = s789A[0];
+        s3456[1] = s789A[1];
+        s3456[2] = s789A[2];
+        s3456[3] = s789A[3];
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
deleted file mode 100644
index 414ade3530..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
-                               uint16_t *dst, ptrdiff_t dst_stride,
-                               const InterpKernel *filter, int x0_q4,
-                               int x_step_q4, int y0_q4, int y_step_q4, int w,
-                               int h, int bd) {
-  // + 1 to make it divisible by 4
-  uint16_t temp[64 * 136];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  /* Filter starting 3 lines back. The neon implementation will ignore the given
-   * height and filter a multiple of 4 lines. Since this goes in to the temp
-   * buffer which has lots of extra room and is subsequently discarded this is
-   * safe if somewhat less than ideal.   */
-  vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
-                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
-                                  intermediate_height, bd);
-
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
-                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
-                                   uint16_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *filter, int x0_q4,
-                                   int x_step_q4, int y0_q4, int y_step_q4,
-                                   int w, int h, int bd) {
-  // + 1 to make it divisible by 4
-  uint16_t temp[64 * 136];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
-                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
-                                  intermediate_height, bd);
-  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
-                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
-                                     bd);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
index c54e588239..579096d78a 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_)  // flip_sign_16
 
 #define FUN_FLIP_SIGN_BACK(w, r)                                         \
   static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
-    const int8x##w##_t sign_bit = vdup##r##n_s8(0x80);                   \
+    const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80);           \
     return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit));             \
   }
 
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
index 38b0b6c1a9..268c4bd962 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -154,11 +154,10 @@ static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
 static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
                                           ptrdiff_t stride) {
   uint32_t a;
-  uint32x2_t a_u32;
-  if (stride == 4) return vld1_u8(buf);
+  uint32x2_t a_u32 = vdup_n_u32(0);
   memcpy(&a, buf, 4);
   buf += stride;
-  a_u32 = vdup_n_u32(a);
+  a_u32 = vset_lane_u32(a, a_u32, 0);
   memcpy(&a, buf, 4);
   a_u32 = vset_lane_u32(a, a_u32, 1);
   return vreinterpret_u8_u32(a_u32);
@@ -177,11 +176,10 @@ static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
 static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
                                              ptrdiff_t stride) {
   uint64_t a;
-  uint64x2_t a_u64;
-  if (stride == 4) return vld1q_u16(buf);
+  uint64x2_t a_u64 = vdupq_n_u64(0);
   memcpy(&a, buf, 8);
   buf += stride;
-  a_u64 = vdupq_n_u64(a);
+  a_u64 = vsetq_lane_u64(a, a_u64, 0);
   memcpy(&a, buf, 8);
   a_u64 = vsetq_lane_u64(a, a_u64, 1);
   return vreinterpretq_u16_u64(a_u64);
@@ -191,10 +189,6 @@ static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
 static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
                                       const uint8x8_t a) {
   const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
-  if (stride == 4) {
-    vst1_u8(buf, a);
-    return;
-  }
   uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
   buf += stride;
   uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
@@ -204,11 +198,10 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
 static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
                                             ptrdiff_t stride) {
   uint32_t a;
-  uint32x4_t a_u32;
-  if (stride == 4) return vld1q_u8(buf);
+  uint32x4_t a_u32 = vdupq_n_u32(0);
   memcpy(&a, buf, 4);
   buf += stride;
-  a_u32 = vdupq_n_u32(a);
+  a_u32 = vsetq_lane_u32(a, a_u32, 0);
   memcpy(&a, buf, 4);
   buf += stride;
   a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -225,10 +218,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
 static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
                                        const uint8x16_t a) {
   const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
-  if (stride == 4) {
-    vst1q_u8(buf, a);
-    return;
-  }
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
   buf += stride;
   uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
@@ -449,6 +438,142 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
   vst1q_u8(s, s7);
 }
 
+static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2) {
+  vst1_u16(s, s0);
+  s += p;
+  vst1_u16(s, s1);
+  s += p;
+  vst1_u16(s, s2);
+}
+
+static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                int16x4_t *s3) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2, const uint16x4_t s3) {
+  vst1_u16(s, s0);
+  s += p;
+  vst1_u16(s, s1);
+  s += p;
+  vst1_u16(s, s2);
+  s += p;
+  vst1_u16(s, s3);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+                                int16x4_t *s6) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                int16x8_t *s3) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3) {
+  vst1q_u16(s, s0);
+  s += p;
+  vst1q_u16(s, s1);
+  s += p;
+  vst1q_u16(s, s2);
+  s += p;
+  vst1q_u16(s, s3);
+}
+
+static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2) {
+  vst1q_u16(s, s0);
+  s += p;
+  vst1q_u16(s, s1);
+  s += p;
+  vst1q_u16(s, s2);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+                                int16x8_t *s6) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+}
+
 static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                 uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
@@ -470,4 +595,46 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
   *s7 = vld1q_u16(s);
 }
 
+static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+                                int16x4_t *s6, int16x4_t *s7) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+  s += p;
+  *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+                                int16x8_t *s6, int16x8_t *s7) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+}
+
 #endif  // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..a18cbbd736
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) {
+  if (size == 4) {
+    int16x4_t s[4];
+    int64x2_t sum = vdupq_n_s64(0);
+
+    s[0] = vld1_s16(src + 0 * stride);
+    s[1] = vld1_s16(src + 1 * stride);
+    s[2] = vld1_s16(src + 2 * stride);
+    s[3] = vld1_s16(src + 3 * stride);
+
+    int16x8_t s01 = vcombine_s16(s[0], s[1]);
+    int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+    sum = vpx_dotq_s16(sum, s01, s01);
+    sum = vpx_dotq_s16(sum, s23, s23);
+
+    return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum));
+  } else {
+    int rows = size;
+    int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                         vdupq_n_s64(0) };
+
+    do {
+      const int16_t *src_ptr = src;
+      int cols = size;
+
+      do {
+        int16x8_t s[8];
+        load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                     &s[6], &s[7]);
+
+        sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]);
+        sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]);
+        sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]);
+        sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]);
+        sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]);
+        sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]);
+        sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]);
+        sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]);
+
+        src_ptr += 8;
+        cols -= 8;
+      } while (cols);
+
+      src += 8 * stride;
+      rows -= 8;
+    } while (rows);
+
+    sum[0] = vaddq_s64(sum[0], sum[1]);
+    sum[2] = vaddq_s64(sum[2], sum[3]);
+    sum[0] = vaddq_s64(sum[0], sum[2]);
+
+    return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0]));
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
index 74f85a6bb6..c989a6721b 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -524,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
   *a7 = vreinterpretq_s32_s64(c3.val[1]);
 }
 
-// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
-// 'q' registers here to save some instructions.
 static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                     uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                     uint8x8_t *a6, uint8x8_t *a7) {
-  // Swap 8 bit elements. Goes from:
+  // Widen to 128-bit registers (usually a no-op once inlined.)
+  const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
+  const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
+  const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
+  const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
+  const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
+  const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
+  const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
+  const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));
+
+  // Zip 8 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
   // a2: 20 21 22 23 24 25 26 27
@@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
   // a6: 60 61 62 63 64 65 66 67
   // a7: 70 71 72 73 74 75 76 77
   // to:
-  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
-  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
-  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
-  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
-
-  const uint8x16x2_t b0 =
-      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
-  const uint8x16x2_t b1 =
-      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
-
-  // Swap 16 bit elements resulting in:
-  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
-  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
-  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
-  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
-
-  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
-                                    vreinterpretq_u16_u8(b1.val[0]));
-  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
-                                    vreinterpretq_u16_u8(b1.val[1]));
-
-  // Unzip 32 bit elements resulting in:
+  // b0: 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  // b1: 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  // b2: 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+  // b3: 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+  const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
+  const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
+  const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
+  const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];
+
+  // Zip 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  // c0.val[1]: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  // c1.val[0]: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  // c1.val[1]: 44 54 64 74 45 55 65 75  46 66 56 76 47 67 57 77
+  const uint16x8x2_t c0 =
+      vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
+  const uint16x8x2_t c1 =
+      vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));
+
+  // Zip 32 bit elements resulting in:
   // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d0.val[1]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d1.val[0]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
   // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+  const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                     vreinterpretq_u32_u16(c1.val[0]));
-  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+  const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                     vreinterpretq_u32_u16(c1.val[1]));
 
   *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
   *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
-  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
-  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
-  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
-  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
   *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
   *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 65fb67c984..037ea1142d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -20,44 +20,36 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
-// Note:
-// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
-// 2. After refactoring the shared code in kernel loops with inline functions,
-// the decoder speed dropped a lot when using gcc compiler. Therefore there is
-// no refactoring for those parts by now.
-// 3. For horizontal convolve, there is an alternative optimization that
-// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
-// samples in each are read from memory: src, (src+1), (src+2), (src+3),
-// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
-// instructions. This optimization is much faster in speed unit test, but slowed
-// down the whole decoder by 5%.
-
-static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
-                                                ptrdiff_t src_stride,
-                                                uint8_t *dst,
-                                                ptrdiff_t dst_stride, int w,
-                                                int h, const int16x4_t filter) {
+static INLINE void convolve_4tap_horiz_neon(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride, int w, int h,
+                                            const int16x8_t filter) {
+  // 4-tap and bilinear filter values are even, so halve them to reduce
+  // intermediate precision requirements.
+  const uint8x8_t x_filter =
+      vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2),
+                                     vdup_lane_u8(x_filter, 3),
+                                     vdup_lane_u8(x_filter, 4),
+                                     vdup_lane_u8(x_filter, 5) };
+
   if (w == 4) {
     do {
-      int16x4_t s0[4], s1[4];
-
-      int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src)));
-      s0[0] = vget_low_s16(vextq_s16(t0, t0, 0));
-      s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
-      s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
-      s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
+      uint8x8_t s01[4];
 
-      int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride)));
-      s1[0] = vget_low_s16(vextq_s16(t1, t1, 0));
-      s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
-      s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
-      s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
+      s01[0] = load_unaligned_u8(src + 0, src_stride);
+      s01[1] = load_unaligned_u8(src + 1, src_stride);
+      s01[2] = load_unaligned_u8(src + 2, src_stride);
+      s01[3] = load_unaligned_u8(src + 3, src_stride);
 
-      int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
-      int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps);
 
-      store_u8(dst, dst_stride, d01);
+      store_unaligned_u8(dst, dst_stride, d01);
 
       src += 2 * src_stride;
       dst += 2 * dst_stride;
@@ -70,25 +62,20 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
       int width = w;
 
       do {
-        int16x8_t t0[2], t1[2];
-        int16x8_t s0[4], s1[4];
-
-        t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8)));
-        s0[0] = vextq_s16(t0[0], t0[1], 0);
-        s0[1] = vextq_s16(t0[0], t0[1], 1);
-        s0[2] = vextq_s16(t0[0], t0[1], 2);
-        s0[3] = vextq_s16(t0[0], t0[1], 3);
-
-        t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride)));
-        t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8)));
-        s1[0] = vextq_s16(t1[0], t1[1], 0);
-        s1[1] = vextq_s16(t1[0], t1[1], 1);
-        s1[2] = vextq_s16(t1[0], t1[1], 2);
-        s1[3] = vextq_s16(t1[0], t1[1], 3);
-
-        uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
-        uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
+        uint8x8_t s0[4], s1[4];
+
+        s0[0] = vld1_u8(s + 0);
+        s0[1] = vld1_u8(s + 1);
+        s0[2] = vld1_u8(s + 2);
+        s0[3] = vld1_u8(s + 3);
+
+        s1[0] = vld1_u8(s + src_stride + 0);
+        s1[1] = vld1_u8(s + src_stride + 1);
+        s1[2] = vld1_u8(s + src_stride + 2);
+        s1[3] = vld1_u8(s + src_stride + 3);
+
+        uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps);
+        uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps);
 
         vst1_u8(d, d0);
         vst1_u8(d + dst_stride, d1);
@@ -103,47 +90,41 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
   }
 }
 
-static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
-                                                ptrdiff_t src_stride,
-                                                uint8_t *dst,
-                                                ptrdiff_t dst_stride, int w,
-                                                int h, const int16x8_t filter) {
-  uint8x8_t t0, t1, t2, t3;
-
+static INLINE void convolve_8tap_horiz_neon(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride, int w, int h,
+                                            const int16x8_t filter) {
   if (h == 4) {
-    uint8x8_t d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+    uint8x8_t t0, t1, t2, t3;
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-    __builtin_prefetch(dst + 0 * dst_stride);
-    __builtin_prefetch(dst + 1 * dst_stride);
-    __builtin_prefetch(dst + 2 * dst_stride);
-    __builtin_prefetch(dst + 3 * dst_stride);
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     src += 7;
 
     do {
-      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      transpose_u8_8x4(&t7, &t8, &t9, &t10);
+      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       transpose_u8_4x4(&d01, &d23);
 
@@ -162,52 +143,33 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
       w -= 4;
     } while (w != 0);
   } else {
-    int width;
-    const uint8_t *s;
-    uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
     if (w == 4) {
       do {
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
 
         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                     &t7);
-        src += 8 * src_stride;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+
         transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
-        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
-        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
-        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+        uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+        uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
 
         transpose_u8_8x4(&d04, &d15, &d26, &d37);
 
@@ -216,57 +178,53 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
         store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
         store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
 
+        src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
       } while (h > 0);
     } else {
-      uint8_t *d;
-      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
-      int16x8_t s11, s12, s13, s14;
-
       do {
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-        width = w;
-        s = src + 7;
-        d = dst;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        const uint8_t *s = src + 7;
+        uint8_t *d = dst;
+        int width = w;
 
         do {
-          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
-          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
-          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
-          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
-          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
-          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
-          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
-          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+          uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+          load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+                      &t15);
+
+          transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+          uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+          uint8x8_t d6 =
+              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+          uint8x8_t d7 =
+              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
 
           transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
@@ -304,17 +262,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   (void)y0_q4;
   (void)y_step_q4;
 
+  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+
   if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements.
-     */
-    const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1);
-    vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
-                                 x_filter_4tap);
+    convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+                             x_filter);
   } else {
-    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
-    vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
-                                 x_filter_8tap);
+    convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+                             x_filter);
   }
 }
 
@@ -324,7 +279,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
   const int16x8_t filters = vld1q_s16(filter[x0_q4]);
-  uint8x8_t t0, t1, t2, t3;
 
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
@@ -337,48 +291,41 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   src -= 3;
 
   if (h == 4) {
-    uint8x8_t d01, d23, dd01, dd23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
-    __builtin_prefetch(src + 0 * src_stride);
-    __builtin_prefetch(src + 1 * src_stride);
-    __builtin_prefetch(src + 2 * src_stride);
-    __builtin_prefetch(src + 3 * src_stride);
+    uint8x8_t t0, t1, t2, t3;
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-    __builtin_prefetch(dst + 0 * dst_stride);
-    __builtin_prefetch(dst + 1 * dst_stride);
-    __builtin_prefetch(dst + 2 * dst_stride);
-    __builtin_prefetch(dst + 3 * dst_stride);
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     src += 7;
 
     do {
-      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      transpose_u8_8x4(&t7, &t8, &t9, &t10);
+      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       transpose_u8_4x4(&d01, &d23);
 
-      dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
-      dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
 
       d01 = vrhadd_u8(d01, dd01);
       d23 = vrhadd_u8(d23, dd23);
@@ -398,61 +345,40 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
       w -= 4;
     } while (w != 0);
   } else {
-    int width;
-    const uint8_t *s;
-    uint8x8_t t4, t5, t6, t7;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
     if (w == 4) {
-      uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
-
       do {
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
 
         load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                     &t7);
-        src += 8 * src_stride;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+
         transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
         transpose_u8_8x4(&d04, &d15, &d26, &d37);
 
-        dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
-        dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
-        dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
-        dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
 
         d04 = vrhadd_u8(d04, dd04);
         d15 = vrhadd_u8(d15, dd15);
@@ -464,65 +390,54 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
         store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
         store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
 
+        src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
       } while (h != 0);
     } else {
-      uint8_t *d;
-      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
-      int16x8_t s11, s12, s13, s14;
-
       do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-        width = w;
-        s = src + 7;
-        d = dst;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        const uint8_t *s = src + 7;
+        uint8_t *d = dst;
+        int width = w;
 
         do {
-          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
-          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
-          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
-          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+          uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+          load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+                      &t15);
+
+          transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+          uint8x8_t d5 =
+              convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+          uint8x8_t d6 =
+              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+          uint8x8_t d7 =
+              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
 
           transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
@@ -556,152 +471,37 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src,
-                                               ptrdiff_t src_stride,
-                                               uint8_t *dst,
-                                               ptrdiff_t dst_stride, int w,
-                                               int h, const int16x4_t filter) {
-  if (w == 4) {
-    uint8x8_t t0, t1, t2, t3, d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3;
-
-    load_u8_8x3(src, src_stride, &t0, &t1, &t2);
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-
-    src += 3 * src_stride;
-
-    do {
-      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-
-      d0 = convolve4_4(s0, s1, s2, s3, filter);
-      d1 = convolve4_4(s1, s2, s3, s4, filter);
-      d2 = convolve4_4(s2, s3, s4, s5, filter);
-      d3 = convolve4_4(s3, s4, s5, s6, filter);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    int height;
-    const uint8_t *s;
-    uint8_t *d;
-    uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6;
-
-    do {
-      load_u8_8x3(src, src_stride, &t0, &t1, &t2);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-
-      s = src + 3 * src_stride;
-      d = dst;
-      height = h;
-
-      do {
-        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-        __builtin_prefetch(s + 0 * src_stride);
-        __builtin_prefetch(s + 1 * src_stride);
-        __builtin_prefetch(s + 2 * src_stride);
-        __builtin_prefetch(s + 3 * src_stride);
-
-        d0 = convolve4_8(s0, s1, s2, s3, filter);
-        d1 = convolve4_8(s1, s2, s3, s4, filter);
-        d2 = convolve4_8(s2, s3, s4, s5, filter);
-        d3 = convolve4_8(s3, s4, s5, s6, filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
-}
-
-static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
-                                               ptrdiff_t src_stride,
-                                               uint8_t *dst,
-                                               ptrdiff_t dst_stride, int w,
-                                               int h, const int16x8_t filter) {
+static INLINE void convolve_8tap_vert_neon(const uint8_t *src,
+                                           ptrdiff_t src_stride, uint8_t *dst,
+                                           ptrdiff_t dst_stride, int w, int h,
+                                           const int16x8_t filter) {
   if (w == 4) {
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
 
     src += 7 * src_stride;
 
     do {
-      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -718,54 +518,33 @@ static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
       h -= 4;
     } while (h != 0);
   } else {
-    int height;
-    const uint8_t *s;
-    uint8_t *d;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
     do {
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      __builtin_prefetch(src + 4 * src_stride);
-      __builtin_prefetch(src + 5 * src_stride);
-      __builtin_prefetch(src + 6 * src_stride);
-
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
       load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      s = src + 7 * src_stride;
-      d = dst;
-      height = h;
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      const uint8_t *s = src + 7 * src_stride;
+      uint8_t *d = dst;
+      int height = h;
 
       do {
-        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-        __builtin_prefetch(s + 0 * src_stride);
-        __builtin_prefetch(s + 1 * src_stride);
-        __builtin_prefetch(s + 2 * src_stride);
-        __builtin_prefetch(s + 3 * src_stride);
-
-        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
-        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
-        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
-        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -800,17 +579,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
   (void)x_step_q4;
   (void)y_step_q4;
 
+  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
   if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements.
-     */
-    const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1);
-    vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride,
-                                w, h, y_filter_4tap);
+    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+                            y_filter);
   } else {
-    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
-    vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
-                                dst_stride, w, h, y_filter_8tap);
+    convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride,
+                            w, h, y_filter);
   }
 }
 
@@ -832,45 +608,35 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
   src -= 3 * src_stride;
 
   if (w == 4) {
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
 
     src += 7 * src_stride;
 
     do {
-      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
-      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
       d01 = vrhadd_u8(d01, dd01);
       d23 = vrhadd_u8(d23, dd23);
@@ -890,54 +656,33 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       h -= 4;
     } while (h != 0);
   } else {
-    int height;
-    const uint8_t *s;
-    uint8_t *d;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
     do {
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      __builtin_prefetch(src + 4 * src_stride);
-      __builtin_prefetch(src + 5 * src_stride);
-      __builtin_prefetch(src + 6 * src_stride);
-
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
       load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      s = src + 7 * src_stride;
-      d = dst;
-      height = h;
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      const uint8_t *s = src + 7 * src_stride;
+      uint8_t *d = dst;
+      int height = h;
 
       do {
-        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-        __builtin_prefetch(s + 0 * src_stride);
-        __builtin_prefetch(s + 1 * src_stride);
-        __builtin_prefetch(s + 2 * src_stride);
-        __builtin_prefetch(s + 3 * src_stride);
-
-        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
         d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
         d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
index 4ecaee0f99..10cc761ccd 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -17,360 +17,6 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_filter.h"
 
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
-                                         ptrdiff_t src_stride, uint8_t *dst,
-                                         ptrdiff_t dst_stride,
-                                         const InterpKernel *filter, int x0_q4,
-                                         int x_step_q4, int y0_q4,
-                                         int y_step_q4, int w, int h);
-
-static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filters) {
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples,
-                                         const int8x8_t filters,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16_t permute_tbl) {
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  int8x16_t clamped_samples =
-      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo,
-                                                 const int8x16_t samples_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filters) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0);
-  /* Second 4 output values. */
-  int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0);
-
-  /* Narrow and re-pack. */
-  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  /* We halved the filter values so -1 from right shift. */
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples,
-                                         const int8x8_t filters,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  /* Second 4 output values. */
-  int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-
-  /* Narrow and re-pack. */
-  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  /* We halved the filter values so -1 from right shift. */
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
-                                                 const int8x16_t samples_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filters) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  int32x4_t sum;
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
-  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples,
-                                         const int8x8_t filters,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x2_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[2];
-  int32x4_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
-                                                 const int8x16_t samples0_hi,
-                                                 const int8x16_t samples1_lo,
-                                                 const int8x16_t samples1_hi,
-                                                 const int32x4_t correction,
-                                                 const int8x8_t filters) {
-  /* Sample range-clamping and permutation are performed by the caller. */
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
-  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
-  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples,
-                                         const int8x8_t filters,
-                                         const int32x4_t correction,
-                                         const uint8x16_t range_limit,
-                                         const uint8x16x3_t permute_tbl) {
-  int8x16_t clamped_samples, permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
-  /* Accumulate dot product into 'correction' to account for range clamp. */
-  /* First 4 output values. */
-  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
-  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *filter, int x0_q4,
-                                      int x_step_q4, int y0_q4, int y_step_q4,
-                                      int w, int h);
-
-static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples,
-                                                  const int8x8_t filters) {
-  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples,
-                                          const int8x8_t filters,
-                                          const uint8x16_t permute_tbl) {
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
-
-  int32x4_t sum =
-      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo,
-                                                  const uint8x16_t samples_hi,
-                                                  const int8x8_t filters) {
-  /* Sample permutation is performed by the caller. */
-  /* First 4 output values. */
-  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
-  /* Second 4 output values. */
-  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0);
-
-  /* Narrow and re-pack. */
-  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  /* We halved the filter values so -1 from right shift. */
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples,
-                                          const int8x8_t filters,
-                                          const uint8x16x2_t permute_tbl) {
-  uint8x16_t permuted_samples[2];
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  /* First 4 output values. */
-  int32x4_t sum0 =
-      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
-  /* Second 4 output values. */
-  int32x4_t sum1 =
-      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
-
-  /* Narrow and re-pack. */
-  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
-  /* We halved the filter values so -1 from right shift. */
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
-                                                  const uint8x16_t samples_hi,
-                                                  const int8x8_t filters) {
-  /* Sample permutation is performed by the caller. */
-  int32x4_t sum;
-
-  sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
-  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
-                                          const int8x8_t filters,
-                                          const uint8x16x2_t permute_tbl) {
-  uint8x16_t permuted_samples[2];
-  int32x4_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
-  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
-  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
-  /* Further narrowing and packing is performed by the caller. */
-  return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
-                                                  const uint8x16_t samples0_hi,
-                                                  const uint8x16_t samples1_lo,
-                                                  const uint8x16_t samples1_hi,
-                                                  const int8x8_t filters) {
-  /* Sample permutation is performed by the caller. */
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* First 4 output values. */
-  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
-  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
-  /* Second 4 output values. */
-  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
-  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
-                                          const int8x8_t filters,
-                                          const uint8x16x3_t permute_tbl) {
-  uint8x16_t permuted_samples[3];
-  int32x4_t sum0, sum1;
-  int16x8_t sum;
-
-  /* Permute samples ready for dot product. */
-  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
-  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
-  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
-  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
-  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
-  /* First 4 output values. */
-  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
-  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
-  /* Second 4 output values. */
-  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
-  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
-  /* Narrow and re-pack. */
-  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
-                                    const int16x4_t s2, const int16x4_t s3,
-                                    const int16x4_t filters) {
-  int16x4_t sum = vmul_lane_s16(s0, filters, 0);
-  sum = vmla_lane_s16(sum, s1, filters, 1);
-  sum = vmla_lane_s16(sum, s2, filters, 2);
-  sum = vmla_lane_s16(sum, s3, filters, 3);
-  return sum;
-}
-
-static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
-                                    const int16x8_t s2, const int16x8_t s3,
-                                    const int16x4_t filters) {
-  int16x8_t sum = vmulq_lane_s16(s0, filters, 0);
-  sum = vmlaq_lane_s16(sum, s1, filters, 1);
-  sum = vmlaq_lane_s16(sum, s2, filters, 2);
-  sum = vmlaq_lane_s16(sum, s3, filters, 3);
-  /* We halved the filter values so -1 from right shift. */
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
                                     const int16x4_t s4, const int16x4_t s5,
@@ -428,4 +74,99 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
                      filters);
 }
 
+// 2-tap (bilinear) filter values are always positive, but 4-tap filter values
+// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much
+// greater positive values to compensate. To use instructions that operate on
+// 8-bit types we also need the types to be unsigned. Subtracting the products
+// of taps 0 and 3 from the products of taps 1 and 2 always works given that
+// 2-tap filters are 0-padded.
+static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1,
+                                    const uint8x8_t s2, const uint8x8_t s3,
+                                    const uint8x8_t filter_taps[4]) {
+  uint16x8_t sum = vmull_u8(s1, filter_taps[1]);
+  sum = vmlal_u8(sum, s2, filter_taps[2]);
+  sum = vmlsl_u8(sum, s0, filter_taps[0]);
+  sum = vmlsl_u8(sum, s3, filter_taps[3]);
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
+}
+
+static INLINE void convolve_4tap_vert_neon(const uint8_t *src,
+                                           ptrdiff_t src_stride, uint8_t *dst,
+                                           ptrdiff_t dst_stride, int w, int h,
+                                           const int16x8_t filter) {
+  // 4-tap and bilinear filter values are even, so halve them to reduce
+  // intermediate precision requirements.
+  const uint8x8_t y_filter =
+      vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+                                     vdup_lane_u8(y_filter, 3),
+                                     vdup_lane_u8(y_filter, 4),
+                                     vdup_lane_u8(y_filter, 5) };
+
+  if (w == 4) {
+    uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+    uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+
+    src += 2 * src_stride;
+
+    do {
+      uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+      uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+      uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
+      uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
+
+      uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps);
+      uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps);
+
+      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      s01 = s45;
+      s12 = s56;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x8_t s0, s1, s2;
+      load_u8_8x3(s, src_stride, &s0, &s1, &s2);
+
+      s += 3 * src_stride;
+
+      do {
+        uint8x8_t s3, s4, s5, s6;
+        load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+        uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps);
+        uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps);
+        uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps);
+        uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
 #endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
index 00bac3b9cf..b05a49d3fe 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -20,270 +20,139 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
+// Filter values always sum to 128.
+#define FILTER_SUM 128
+
 DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
   0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
   4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
   8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 };
 
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
-  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
-  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
 DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
-  /* Shift left and insert new last column in transposed 4x4 block. */
+  // Shift left and insert new last column in transposed 4x4 block.
   1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
-  /* Shift left and insert two new columns in transposed 4x4 block. */
+  // Shift left and insert two new columns in transposed 4x4 block.
   2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
-  /* Shift left and insert three new columns in transposed 4x4 block. */
+  // Shift left and insert three new columns in transposed 4x4 block.
   3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
 };
 
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
-    const int32x4_t correction, const uint8x16_t range_limit) {
-  uint8x16_t s0, s1, s2, s3;
-
-  if (w == 4) {
-    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    do {
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-      d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-      d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-      d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
-    d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-    d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-    d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
-    store_u8(dst + 0 * dst_stride, dst_stride, d01);
-    store_u8_4x1(dst + 2 * dst_stride, d23);
-  } else {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-        d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-        d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-        d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    width = w;
-    s = src;
-    d = dst;
-    do {
-      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
 
-      d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-      d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-      d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
 
-      store_u8_8x3(d, dst_stride, d0, d1, d2);
+  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+  // by 2 since we halved the filter values.)
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+  int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
 
-      s += 8;
-      d += 8;
-      width -= 8;
-    } while (width != 0);
-  }
+  // Further narrowing and packing is performed by the caller.
+  return vmovn_s32(sum);
 }
 
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
-    const int32x4_t correction, const uint8x16_t range_limit) {
-  uint8x16_t s0, s1, s2, s3;
-
-  if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    do {
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-      d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-      d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-      d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
-    d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-    d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-    d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
-    store_u8(dst + 0 * dst_stride, dst_stride, d01);
-    store_u8_4x1(dst + 2 * dst_stride, d23);
-  } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width != 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    width = w;
-    s = src;
-    d = dst;
-    do {
-      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
-      d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-      d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-      d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-
-      store_u8_8x3(d, dst_stride, d0, d1, d2);
-
-      s += 8;
-      d += 8;
-      width -= 8;
-    } while (width != 0);
-  }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+  // by 2 since we halved the filter values.)
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+  // First 4 output values.
+  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+  // Second 4 output values.
+  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
-                                         ptrdiff_t src_stride, uint8_t *dst,
-                                         ptrdiff_t dst_stride,
-                                         const InterpKernel *filter, int x0_q4,
-                                         int x_step_q4, int y0_q4,
-                                         int y_step_q4, int w, int h) {
-  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-  const int32x4_t correction_8tap =
-      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y0_q4;
-  (void)y_step_q4;
-
-  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements. Also slide the filter values so the
-     * the 4 taps exist in the first 4 elements of the vector.
-     */
-    const int8x8_t x_filter_4tap =
-        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
-    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
-    vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst,
-                                            dst_stride, w, h, x_filter_4tap,
-                                            correction_4tap, range_limit);
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+  sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);
+
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
 
-  } else {
-    vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst,
-                                            dst_stride, w, h, x_filter_8tap,
-                                            correction_8tap, range_limit);
-  }
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x3_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  // First 4 output values.
+  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+  sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+  sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
+static INLINE void convolve_4tap_horiz_neon_dotprod(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
-    const int32x4_t correction, const uint8x16_t range_limit) {
-  uint8x16_t s0, s1, s2, s3;
-
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
   if (w == 4) {
-    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
 
+    do {
+      uint8x16_t s0, s1, s2, s3;
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-      t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-      t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-      t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -293,23 +162,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
 
     do {
-      width = w;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
       do {
+        uint8x16_t s0, s1, s2, s3;
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-        d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-        d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-        d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -324,26 +191,22 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
   }
 }
 
-static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
+static INLINE void convolve_8tap_horiz_neon_dotprod(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
-    const int32x4_t correction, const uint8x16_t range_limit) {
-  uint8x16_t s0, s1, s2, s3;
-
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
   if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
 
+    do {
+      uint8x16_t s0, s1, s2, s3;
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
-      t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
-      t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
-      t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+      int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -353,23 +216,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
 
     do {
-      width = w;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
       do {
+        uint8x16_t s0, s1, s2, s3;
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
-        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
-        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -389,11 +250,6 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                       const InterpKernel *filter, int x0_q4,
                                       int x_step_q4, int y0_q4, int y_step_q4,
                                       int w, int h) {
-  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-  const int32x4_t correction_8tap =
-      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
@@ -403,21 +259,21 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
 
   if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements. Also slide the filter values so the
-     * the 4 taps exist in the first 4 elements of the vector.
-     */
+    // Load 4-tap filter into first 4 elements of the vector.
+    // All 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
     const int8x8_t x_filter_4tap =
-        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
-    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
-    vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride,
-                                         w, h, x_filter_4tap, correction_4tap,
-                                         range_limit);
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+    convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
+                                     x_filter_4tap);
 
   } else {
-    vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride,
-                                         w, h, x_filter_8tap, correction_8tap,
-                                         range_limit);
+    const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+    convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
+                                     x_filter_8tap);
   }
 }
 
@@ -428,10 +284,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
                                           int x_step_q4, int y0_q4,
                                           int y_step_q4, int w, int h) {
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
-  uint8x16_t s0, s1, s2, s3;
 
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
@@ -444,22 +296,21 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
   src -= 3;
 
   if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23, dd01, dd23;
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
 
+    do {
+      uint8x16_t s0, s1, s2, s3;
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
-      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
-      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
-      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+      int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
 
-      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
-      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
       d01 = vrhadd_u8(d01, dd01);
       d23 = vrhadd_u8(d23, dd23);
@@ -472,24 +323,23 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
 
     do {
-      width = w;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
       do {
+        uint8x16_t s0, s1, s2, s3;
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
-        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
-        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
-        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+        uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
 
+        uint8x8_t dd0, dd1, dd2, dd3;
         load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
         d0 = vrhadd_u8(d0, dd0);
@@ -511,260 +361,142 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
 }
 
 static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
-                                        int8x8_t a3, int8x16_t *b,
-                                        const uint8x16_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, XX, XX, XX, XX
-   * a1: 10, 11, 12, 13, XX, XX, XX, XX
-   * a2: 20, 21, 22, 23, XX, XX, XX, XX
-   * a3: 30, 31, 32, 33, XX, XX, XX, XX
-   *
-   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
-  *b = vqtbl2q_s8(samples, permute_tbl);
+                                        int8x8_t a3, int8x16_t *b) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, XX, XX, XX, XX
+  // a1: 10, 11, 12, 13, XX, XX, XX, XX
+  // a2: 20, 21, 22, 23, XX, XX, XX, XX
+  // a3: 30, 31, 32, 33, XX, XX, XX, XX
+  //
+  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+  int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+  int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+  int16x8_t a0123 =
+      vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
+
+  *b = vreinterpretq_s8_s16(a0123);
 }
 
 static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                         int8x8_t a3, int8x16_t *b0,
-                                        int8x16_t *b1,
-                                        const uint8x16x2_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, 04, 05, 06, 07
-   * a1: 10, 11, 12, 13, 14, 15, 16, 17
-   * a2: 20, 21, 22, 23, 24, 25, 26, 27
-   * a3: 30, 31, 32, 33, 34, 35, 36, 37
-   *
-   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
-  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
-  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+                                        int8x16_t *b1) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, 04, 05, 06, 07
+  // a1: 10, 11, 12, 13, 14, 15, 16, 17
+  // a2: 20, 21, 22, 23, 24, 25, 26, 27
+  // a3: 30, 31, 32, 33, 34, 35, 36, 37
+  //
+  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+  int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+  int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+  int16x8x2_t a0123 =
+      vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
+
+  *b0 = vreinterpretq_s8_s16(a0123.val[0]);
+  *b1 = vreinterpretq_s8_s16(a0123.val[1]);
 }
 
-static INLINE void vpx_convolve_4tap_vert_neon_dotprod(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
-    const int32x4_t correction, const uint8x8_t range_limit) {
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  int8x16x2_t samples_LUT;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    int8x16_t s0123, s1234, s2345, s3456, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-    src += 7 * src_stride;
+static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
+                                      const int8x16_t samples_hi,
+                                      const int8x8_t filters) {
+  // The sample range transform and permutation are performed by the caller.
 
-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
 
-    do {
-      uint8x8_t t7, t8, t9, t10;
-      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
-      d0 = convolve4_4_sdot_partial(s0123, correction, filter);
-      d1 = convolve4_4_sdot_partial(s1234, correction, filter);
-      d2 = convolve4_4_sdot_partial(s2345, correction, filter);
-      d3 = convolve4_4_sdot_partial(s3456, correction, filter);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-      s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-      s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
-
-      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
-      s += 7 * src_stride;
-
-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-
-      do {
-        uint8x8_t t7, t8, t9, t10;
-        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter);
-        d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter);
-        d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter);
-        d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-        s3456_lo = s78910_lo;
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-        s3456_hi = s78910_hi;
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
 
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
+static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
+                                      const int8x16_t samples0_hi,
+                                      const int8x16_t samples1_lo,
+                                      const int8x16_t samples1_hi,
+                                      const int8x8_t filters) {
+  // The sample range transform and permutation are performed by the caller.
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  // First 4 output values.
+  int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
+static INLINE void convolve_8tap_vert_neon_dotprod(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
-    const int32x4_t correction, const uint8x8_t range_limit) {
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  int8x16x2_t samples_LUT;
 
   if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
     src += 7 * src_stride;
 
-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    // Transform sample range to [-128, 127] for 8-bit signed dot product.
+    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    int8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
 
     do {
       uint8x8_t t7, t8, t9, t10;
-
       load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
 
-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
 
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+      int8x16_t s78910;
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910);
 
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+      // Merge new data into block from previous iteration.
+      int8x16x2_t samples_LUT = { { s3456, s78910 } };
+      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
 
-      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
-      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
-      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
-      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -781,83 +513,70 @@ static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
     do {
-      height = h;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
 
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
       load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
       s += 7 * src_stride;
 
-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
+      // Transform sample range to [-128, 127] for 8-bit signed dot product.
+      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
 
       do {
         uint8x8_t t7, t8, t9, t10;
-
         load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
 
-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
 
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
+        int8x16_t s78910_lo, s78910_hi;
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
 
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+        // Merge new data into block from previous iteration.
+        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
 
         samples_LUT.val[0] = s3456_hi;
         samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                      correction, filter);
-        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                      correction, filter);
-        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                      correction, filter);
-        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                      correction, filter);
+        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
         s0123_lo = s4567_lo;
         s0123_hi = s4567_hi;
         s1234_lo = s5678_lo;
@@ -883,11 +602,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      const InterpKernel *filter, int x0_q4,
                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h) {
-  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
-  const int32x4_t correction_8tap =
-      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS)));
-  const uint8x8_t range_limit = vdup_n_u8(128);
-
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
@@ -897,20 +611,15 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
 
   if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements. Also slide the filter values so the
-     * the 4 taps exist in the first 4 elements of the vector.
-     */
-    const int8x8_t y_filter_4tap =
-        vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
-    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
-    vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst,
-                                        dst_stride, w, h, y_filter_4tap,
-                                        correction_4tap, range_limit);
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+                            y_filter);
   } else {
-    vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
-                                        dst_stride, w, h, y_filter_8tap,
-                                        correction_8tap, range_limit);
+    const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+    convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+                                    dst_stride, w, h, y_filter);
   }
 }
 
@@ -921,13 +630,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
                                          int x_step_q4, int y0_q4,
                                          int y_step_q4, int w, int h) {
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x8_t range_limit = vdup_n_u8(128);
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  int8x16x2_t samples_LUT;
 
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
@@ -940,59 +643,54 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
   src -= 3 * src_stride;
 
   if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23, dd01, dd23;
-
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
     src += 7 * src_stride;
 
-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    // Transform sample range to [-128, 127] for 8-bit signed dot product.
+    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    int8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
 
     do {
       uint8x8_t t7, t8, t9, t10;
-
       load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
 
-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
 
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+      int8x16_t s78910;
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910);
 
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+      // Merge new data into block from previous iteration.
+      int8x16x2_t samples_LUT = { { s3456, s78910 } };
+      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
 
-      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
-      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
-      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
-      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
-      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
-      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
       d01 = vrhadd_u8(d01, dd01);
       d23 = vrhadd_u8(d23, dd23);
@@ -1000,8 +698,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
       s0123 = s4567;
       s1234 = s5678;
       s2345 = s6789;
@@ -1012,79 +710,67 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
     do {
-      height = h;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
 
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
       load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
       s += 7 * src_stride;
 
-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
+      // Transform sample range to [-128, 127] for 8-bit signed dot product.
+      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
 
       do {
         uint8x8_t t7, t8, t9, t10;
-
         load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
 
-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
 
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
+        int8x16_t s78910_lo, s78910_hi;
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
 
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+        // Merge new data into block from previous iteration.
+        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
 
         samples_LUT.val[0] = s3456_hi;
         samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                      correction, filters);
-        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                      correction, filters);
-        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                      correction, filters);
-        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                      correction, filters);
-
+        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+        uint8x8_t dd0, dd1, dd2, dd3;
         load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
         d0 = vrhadd_u8(d0, dd0);
@@ -1094,8 +780,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
         s0123_lo = s4567_lo;
         s0123_hi = s4567_hi;
         s1234_lo = s5678_lo;
@@ -1115,3 +801,275 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
     } while (w != 0);
   }
 }
+
+static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t *dst,
+                                                 ptrdiff_t dst_stride, int w,
+                                                 int h, const int8x8_t x_filter,
+                                                 const uint8x8_t y_filter) {
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+                                       vdup_lane_u8(y_filter, 3),
+                                       vdup_lane_u8(y_filter, 4),
+                                       vdup_lane_u8(y_filter, 5) };
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+    uint8x16_t h_s0, h_s1, h_s2;
+    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+    int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+    int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+    int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+    // We halved the filter values so -1 from right shift.
+    uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+    uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+    src += 3 * src_stride;
+
+    do {
+      uint8x16_t h_s3, h_s4, h_s5, h_s6;
+      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+      int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+      int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+      int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+      int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+      uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+      uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+      uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+      uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+      uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      v_s01 = v_s45;
+      v_s12 = v_s56;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x16_t h_s0, h_s1, h_s2;
+      load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+      uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+      uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+      uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+      s += 3 * src_stride;
+
+      do {
+        uint8x16_t h_s3, h_s4, h_s5, h_s6;
+        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+        uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+        uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+        uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+        uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+        uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+        uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+        uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+        uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        v_s0 = v_s4;
+        v_s1 = v_s5;
+        v_s2 = v_s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    uint8x16_t s0, s1, s2;
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+    int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+    int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+    uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+    uint8x8_t d23 =
+        vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0, s1, s2;
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+      uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+      uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+  // lines post both horizontally and vertically.
+  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+  if (x_filter_taps == 4 && y_filter_taps == 4) {
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    // 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int8x8_t x_filter_4tap =
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+    const uint8x8_t y_filter_4tap =
+        vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+    convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
+                                  dst, dst_stride, w, h, x_filter_4tap,
+                                  y_filter_4tap);
+    return;
+  }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7).
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+  const int im_height = h + SUBPEL_TAPS - 1;
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+  convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
+                                      src_stride, im_block, im_stride, w,
+                                      im_height, x_filter_8tap);
+
+  convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
+                                  y_filter_8tap);
+}
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h) {
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+
+  // Averaging convolution always uses an 8-tap filter.
+  // Account for the vertical phase needing 3 lines prior and 4 lines post.
+  const int im_height = h + SUBPEL_TAPS - 1;
+  const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
+                                      src_stride, im_block, im_stride, w,
+                                      im_height, x_filter_8tap);
+
+  vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
+                                      dst, dst_stride, filter, x0_q4, x_step_q4,
+                                      y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
index bcad1dd121..e582004133 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -26,255 +26,112 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
   8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 };
 
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
-  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
-  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
 DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
-  /* Shift left and insert new last column in transposed 4x4 block. */
+  // Shift left and insert new last column in transposed 4x4 block.
   1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
-  /* Shift left and insert two new columns in transposed 4x4 block. */
+  // Shift left and insert two new columns in transposed 4x4 block.
   2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
-  /* Shift left and insert three new columns in transposed 4x4 block. */
+  // Shift left and insert three new columns in transposed 4x4 block.
   3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
 };
 
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
-  uint8x16_t s0, s1, s2, s3;
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
 
-  if (w == 4) {
-    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
+  int32x4_t sum =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
 
-    do {
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve4_4_usdot(s0, filter, perm_tbl);
-      d1 = convolve4_4_usdot(s1, filter, perm_tbl);
-      d2 = convolve4_4_usdot(s2, filter, perm_tbl);
-      d3 = convolve4_4_usdot(s3, filter, perm_tbl);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
-    d0 = convolve4_4_usdot(s0, filter, perm_tbl);
-    d1 = convolve4_4_usdot(s1, filter, perm_tbl);
-    d2 = convolve4_4_usdot(s2, filter, perm_tbl);
-    /* We halved the filter values so -1 from right shift. */
-    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
-    store_u8(dst + 0 * dst_stride, dst_stride, d01);
-    store_u8_4x1(dst + 2 * dst_stride, d23);
-  } else {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve4_8_usdot(s0, filter, perm_tbl);
-        d1 = convolve4_8_usdot(s1, filter, perm_tbl);
-        d2 = convolve4_8_usdot(s2, filter, perm_tbl);
-        d3 = convolve4_8_usdot(s3, filter, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    width = w;
-    s = src;
-    d = dst;
-    do {
-      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
-      d0 = convolve4_8_usdot(s0, filter, perm_tbl);
-      d1 = convolve4_8_usdot(s1, filter, perm_tbl);
-      d2 = convolve4_8_usdot(s2, filter, perm_tbl);
-
-      store_u8_8x3(d, dst_stride, d0, d1, d2);
-
-      s += 8;
-      d += 8;
-      width -= 8;
-    } while (width > 0);
-  }
+  // Further narrowing and packing is performed by the caller.
+  return vmovn_s32(sum);
 }
 
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
-  uint8x16_t s0, s1, s2, s3;
-
-  if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    do {
-      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-      d0 = convolve8_4_usdot(s0, filter, perm_tbl);
-      d1 = convolve8_4_usdot(s1, filter, perm_tbl);
-      d2 = convolve8_4_usdot(s2, filter, perm_tbl);
-      d3 = convolve8_4_usdot(s3, filter, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
-    d0 = convolve8_4_usdot(s0, filter, perm_tbl);
-    d1 = convolve8_4_usdot(s1, filter, perm_tbl);
-    d2 = convolve8_4_usdot(s2, filter, perm_tbl);
-    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
-    store_u8(dst + 0 * dst_stride, dst_stride, d01);
-    store_u8_4x1(dst + 2 * dst_stride, d23);
-  } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
-
-    do {
-      width = w;
-      s = src;
-      d = dst;
-      do {
-        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
-        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
-        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
-        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h > 3);
-
-    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
-     * further details on possible values of block height. */
-    width = w;
-    s = src;
-    d = dst;
-    do {
-      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
-      d0 = convolve8_8_usdot(s0, filter, perm_tbl);
-      d1 = convolve8_8_usdot(s1, filter, perm_tbl);
-      d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-
-      store_u8_8x3(d, dst_stride, d0, d1, d2);
-
-      s += 8;
-      d += 8;
-      width -= 8;
-    } while (width > 0);
-  }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+  // First 4 output values.
+  int32x4_t sum0 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  // Second 4 output values.
+  int32x4_t sum1 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *filter, int x0_q4,
-                                      int x_step_q4, int y0_q4, int y_step_q4,
-                                      int w, int h) {
-  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y0_q4;
-  (void)y_step_q4;
-
-  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements. Also slide the filter values so the
-     * the 4 taps exist in the first 4 elements of the vector.
-     */
-    const int8x8_t x_filter_4tap =
-        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
-    vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride,
-                                         w, h, x_filter_4tap);
-
-  } else {
-    vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride,
-                                         w, h, x_filter_8tap);
-  }
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+  int32x4_t sum =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
 }
 
-static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
-  uint8x16_t s0, s1, s2, s3;
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x3_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[1]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[2]) };
+
+  // First 4 output values.
+  int32x4_t sum0 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
 
+static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t *dst,
+                                                 ptrdiff_t dst_stride, int w,
+                                                 int h, const int8x8_t filter) {
   if (w == 4) {
-    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
 
+    do {
+      uint8x16_t s0, s1, s2, s3;
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve4_4_usdot(s0, filter, perm_tbl);
-      t1 = convolve4_4_usdot(s1, filter, perm_tbl);
-      t2 = convolve4_4_usdot(s2, filter, perm_tbl);
-      t3 = convolve4_4_usdot(s3, filter, perm_tbl);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -284,23 +141,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
 
     do {
-      width = w;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
       do {
+        uint8x16_t s0, s1, s2, s3;
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve4_8_usdot(s0, filter, perm_tbl);
-        d1 = convolve4_8_usdot(s1, filter, perm_tbl);
-        d2 = convolve4_8_usdot(s2, filter, perm_tbl);
-        d3 = convolve4_8_usdot(s3, filter, perm_tbl);
+        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -315,25 +170,24 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
   }
 }
 
-static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
-  uint8x16_t s0, s1, s2, s3;
-
+static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t *dst,
+                                                 ptrdiff_t dst_stride, int w,
+                                                 int h, const int8x8_t filter) {
   if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23;
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
 
+    do {
+      uint8x16_t s0, s1, s2, s3;
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_usdot(s0, filter, perm_tbl);
-      t1 = convolve8_4_usdot(s1, filter, perm_tbl);
-      t2 = convolve8_4_usdot(s2, filter, perm_tbl);
-      t3 = convolve8_4_usdot(s3, filter, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+      int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -343,23 +197,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3;
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
 
     do {
-      width = w;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
       do {
+        uint8x16_t s0, s1, s2, s3;
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
-        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
-        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
@@ -379,8 +231,6 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h) {
-  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
@@ -390,18 +240,21 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
 
   if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements. Also slide the filter values so the
-     * the 4 taps exist in the first 4 elements of the vector.
-     */
+    // Load 4-tap filter into first 4 elements of the vector.
+    // All 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
     const int8x8_t x_filter_4tap =
-        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
-    vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w,
-                                      h, x_filter_4tap);
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+    convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h,
+                                  x_filter_4tap);
 
   } else {
-    vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w,
-                                      h, x_filter_8tap);
+    const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+    convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h,
+                                  x_filter_8tap);
   }
 }
 
@@ -411,7 +264,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h) {
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
-  uint8x16_t s0, s1, s2, s3;
 
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
@@ -424,22 +276,21 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
   src -= 3;
 
   if (w == 4) {
-    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    do {
-      int16x4_t t0, t1, t2, t3;
-      uint8x8_t d01, d23, dd01, dd23;
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
 
+    do {
+      uint8x16_t s0, s1, s2, s3;
       load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
-      t0 = convolve8_4_usdot(s0, filters, perm_tbl);
-      t1 = convolve8_4_usdot(s1, filters, perm_tbl);
-      t2 = convolve8_4_usdot(s2, filters, perm_tbl);
-      t3 = convolve8_4_usdot(s3, filters, perm_tbl);
-      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+      int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
 
-      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
-      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
       d01 = vrhadd_u8(d01, dd01);
       d23 = vrhadd_u8(d23, dd23);
@@ -452,24 +303,23 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    const uint8_t *s;
-    uint8_t *d;
-    int width;
-    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
 
     do {
-      width = w;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
       do {
+        uint8x16_t s0, s1, s2, s3;
         load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
 
-        d0 = convolve8_8_usdot(s0, filters, perm_tbl);
-        d1 = convolve8_8_usdot(s1, filters, perm_tbl);
-        d2 = convolve8_8_usdot(s2, filters, perm_tbl);
-        d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+        uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
 
+        uint8x8_t dd0, dd1, dd2, dd3;
         load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
         d0 = vrhadd_u8(d0, dd0);
@@ -492,216 +342,130 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
 
 static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
                                         uint8x8_t a2, uint8x8_t a3,
-                                        uint8x16_t *b,
-                                        const uint8x16_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, XX, XX, XX, XX
-   * a1: 10, 11, 12, 13, XX, XX, XX, XX
-   * a2: 20, 21, 22, 23, XX, XX, XX, XX
-   * a3: 30, 31, 32, 33, XX, XX, XX, XX
-   *
-   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
-  *b = vqtbl2q_u8(samples, permute_tbl);
+                                        uint8x16_t *b) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, XX, XX, XX, XX
+  // a1: 10, 11, 12, 13, XX, XX, XX, XX
+  // a2: 20, 21, 22, 23, XX, XX, XX, XX
+  // a3: 30, 31, 32, 33, XX, XX, XX, XX
+  //
+  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+  uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+  uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+  uint16x8_t a0123 =
+      vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
+
+  *b = vreinterpretq_u8_u16(a0123);
 }
 
 static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
                                         uint8x8_t a2, uint8x8_t a3,
-                                        uint8x16_t *b0, uint8x16_t *b1,
-                                        const uint8x16x2_t permute_tbl) {
-  /* Transpose 8-bit elements and concatenate result rows as follows:
-   * a0: 00, 01, 02, 03, 04, 05, 06, 07
-   * a1: 10, 11, 12, 13, 14, 15, 16, 17
-   * a2: 20, 21, 22, 23, 24, 25, 26, 27
-   * a3: 30, 31, 32, 33, 34, 35, 36, 37
-   *
-   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
-   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
-   *
-   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
-   * as an argument is preferable to loading it directly from memory as this
-   * inline helper is called many times from the same parent function.
-   */
-
-  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
-  *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
-  *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+                                        uint8x16_t *b0, uint8x16_t *b1) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, 04, 05, 06, 07
+  // a1: 10, 11, 12, 13, 14, 15, 16, 17
+  // a2: 20, 21, 22, 23, 24, 25, 26, 27
+  // a3: 30, 31, 32, 33, 34, 35, 36, 37
+  //
+  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+  uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+  uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+  uint16x8x2_t a0123 =
+      vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
+
+  *b0 = vreinterpretq_u8_u16(a0123.val[0]);
+  *b1 = vreinterpretq_u8_u16(a0123.val[1]);
 }
 
-static INLINE void vpx_convolve_4tap_vert_neon_i8mm(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint8x16x2_t samples_LUT;
-
-  if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123, s1234, s2345, s3456, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
-    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-    src += 7 * src_stride;
-
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-
-    do {
-      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
-      d0 = convolve4_4_usdot_partial(s0123, filter);
-      d1 = convolve4_4_usdot_partial(s1234, filter);
-      d2 = convolve4_4_usdot_partial(s2345, filter);
-      d3 = convolve4_4_usdot_partial(s3456, filter);
-      /* We halved the filter values so -1 from right shift. */
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
-      store_u8(dst + 0 * dst_stride, dst_stride, d01);
-      store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-      s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-      s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-      s3456 = s78910;
-
-      src += 4 * src_stride;
-      dst += 4 * dst_stride;
-      h -= 4;
-    } while (h != 0);
-  } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
-    do {
-      height = h;
-      s = src;
-      d = dst;
+static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
+                                      const uint8x16_t samples_hi,
+                                      const int8x8_t filters) {
+  // Sample permutation is performed by the caller.
+  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
 
-      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
-
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
-
-      do {
-        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
-
-        d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter);
-        d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter);
-        d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter);
-        d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter);
-
-        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-        s3456_lo = s78910_lo;
-
-        samples_LUT.val[0] = s3456_hi;
-        samples_LUT.val[1] = s78910_hi;
-        s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-        s3456_hi = s78910_hi;
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
 
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        height -= 4;
-      } while (height != 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w != 0);
-  }
+static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
+                                      const uint8x16_t samples0_hi,
+                                      const uint8x16_t samples1_lo,
+                                      const uint8x16_t samples1_hi,
+                                      const int8x8_t filters) {
+  // Sample permutation is performed by the caller.
+
+  // First 4 output values.
+  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
-static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src,
+                                                ptrdiff_t src_stride,
+                                                uint8_t *dst,
+                                                ptrdiff_t dst_stride, int w,
+                                                int h, const int8x8_t filter) {
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint8x16x2_t samples_LUT;
-
   if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23;
-
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
     load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
     src += 7 * src_stride;
 
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
 
     do {
+      uint8x8_t s7, s8, s9, s10;
       load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
 
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+      uint8x16_t s78910;
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910);
 
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+      uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
 
-      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
-      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
-      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
-      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
       s0123 = s4567;
       s1234 = s5678;
       s2345 = s6789;
@@ -712,67 +476,56 @@ static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
     do {
-      height = h;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
 
+      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
       load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
       s += 7 * src_stride;
 
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
 
       do {
+        uint8x8_t s7, s8, s9, s10;
         load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
+        uint8x16_t s78910_lo, s78910_hi;
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
 
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+        // Merge new data into block from previous iteration.
+        uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
 
         samples_LUT.val[0] = s3456_hi;
         samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                       filter);
-        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                       filter);
-        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                       filter);
-        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                       filter);
+        uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
 
         store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        /* Prepare block for next iteration - re-using as much as possible. */
-        /* Shuffle everything up four rows. */
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
         s0123_lo = s4567_lo;
         s0123_hi = s4567_hi;
         s1234_lo = s5678_lo;
@@ -798,8 +551,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
-  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
-
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
@@ -809,17 +560,15 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
 
   if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
-    /* All 4-tap and bilinear filter values are even, so halve them to reduce
-     * intermediate precision requirements. Also slide the filter values so the
-     * the 4 taps exist in the first 4 elements of the vector.
-     */
-    const int8x8_t y_filter_4tap =
-        vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
-    vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst,
-                                     dst_stride, w, h, y_filter_4tap);
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+                            y_filter);
   } else {
-    vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
-                                     dst_stride, w, h, y_filter_8tap);
+    const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+    convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+                                 dst_stride, w, h, y_filter);
   }
 }
 
@@ -830,8 +579,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                       int w, int h) {
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
-  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-  uint8x16x2_t samples_LUT;
 
   assert((intptr_t)dst % 4 == 0);
   assert(dst_stride % 4 == 0);
@@ -844,43 +591,40 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
   src -= 3 * src_stride;
 
   if (w == 4) {
-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
-    int16x4_t d0, d1, d2, d3;
-    uint8x8_t d01, d23, dd01, dd23;
-
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
     load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
     src += 7 * src_stride;
 
-    /* This operation combines a conventional transpose and the sample permute
-     * (see horizontal case) required before computing the dot product.
-     */
-    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
-    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
-    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
-    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
 
     do {
+      uint8x8_t s7, s8, s9, s10;
       load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
 
-      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+      uint8x16_t s78910;
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910);
 
-      /* Merge new data into block from previous iteration. */
-      samples_LUT.val[0] = s3456;
-      samples_LUT.val[1] = s78910;
-      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+      uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
 
-      d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
-      d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
-      d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
-      d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
-      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
-      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
       d01 = vrhadd_u8(d01, dd01);
       d23 = vrhadd_u8(d23, dd23);
@@ -888,8 +632,8 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
       store_u8(dst + 0 * dst_stride, dst_stride, d01);
       store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
-      /* Prepare block for next iteration - re-using as much as possible. */
-      /* Shuffle everything up four rows. */
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
       s0123 = s4567;
       s1234 = s5678;
       s2345 = s6789;
@@ -900,63 +644,53 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
       h -= 4;
     } while (h != 0);
   } else {
-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
-    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
-        s6789_hi, s78910_lo, s78910_hi;
-    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
-
     do {
-      height = h;
-      s = src;
-      d = dst;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
 
+      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
       load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
       s += 7 * src_stride;
 
-      /* This operation combines a conventional transpose and the sample permute
-       * (see horizontal case) required before computing the dot product.
-       */
-      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
-                           tran_concat_tbl);
-      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
-                           tran_concat_tbl);
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
 
       do {
+        uint8x8_t s7, s8, s9, s10;
         load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
-                             tran_concat_tbl);
+        uint8x16_t s78910_lo, s78910_hi;
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
 
-        /* Merge new data into block from previous iteration. */
-        samples_LUT.val[0] = s3456_lo;
-        samples_LUT.val[1] = s78910_lo;
-        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+        // Merge new data into block from previous iteration.
+        uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
 
         samples_LUT.val[0] = s3456_hi;
         samples_LUT.val[1] = s78910_hi;
-        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
-        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
-                                       filters);
-        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
-                                       filters);
-        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
-                                       filters);
-        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
-                                       filters);
-
+        uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+        uint8x8_t dd0, dd1, dd2, dd3;
         load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
 
         d0 = vrhadd_u8(d0, dd0);
@@ -987,3 +721,275 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
     } while (w != 0);
   }
 }
+
+static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src,
+                                              ptrdiff_t src_stride,
+                                              uint8_t *dst,
+                                              ptrdiff_t dst_stride, int w,
+                                              int h, const int8x8_t x_filter,
+                                              const uint8x8_t y_filter) {
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+                                       vdup_lane_u8(y_filter, 3),
+                                       vdup_lane_u8(y_filter, 4),
+                                       vdup_lane_u8(y_filter, 5) };
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+    uint8x16_t h_s0, h_s1, h_s2;
+    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+    int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+    int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+    int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+    // We halved the filter values so -1 from right shift.
+    uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+    uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+    src += 3 * src_stride;
+
+    do {
+      uint8x16_t h_s3, h_s4, h_s5, h_s6;
+      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+      int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+      int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+      int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+      int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+      uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+      uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+      uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+      uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+      uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      v_s01 = v_s45;
+      v_s12 = v_s56;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x16_t h_s0, h_s1, h_s2;
+      load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+      uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+      uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+      uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+      s += 3 * src_stride;
+
+      do {
+        uint8x16_t h_s3, h_s4, h_s5, h_s6;
+        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+        uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+        uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+        uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+        uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+        uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+        uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+        uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+        uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        v_s0 = v_s4;
+        v_s1 = v_s5;
+        v_s2 = v_s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void convolve_8tap_2d_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    uint8x16_t s0, s1, s2;
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+    int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+    int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+    uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+    uint8x8_t d23 =
+        vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0, s1, s2;
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+      uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+      uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+  // lines post both horizontally and vertically.
+  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+  if (x_filter_taps == 4 && y_filter_taps == 4) {
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    // 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int8x8_t x_filter_4tap =
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+    const uint8x8_t y_filter_4tap =
+        vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+    convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+                               dst, dst_stride, w, h, x_filter_4tap,
+                               y_filter_4tap);
+    return;
+  }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7).
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+  const int im_height = h + SUBPEL_TAPS - 1;
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+  convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+                                   im_block, im_stride, w, im_height,
+                                   x_filter_8tap);
+
+  convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h,
+                               y_filter_8tap);
+}
+
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+
+  // Averaging convolution always uses an 8-tap filter.
+  // Account for the vertical phase needing 3 lines prior and 4 lines post.
+  const int im_height = h + SUBPEL_TAPS - 1;
+  const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride,
+                                   src_stride, im_block, im_stride, w,
+                                   im_height, x_filter_8tap);
+
+  vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride,
+                                   dst, dst_stride, filter, x0_q4, x_step_q4,
+                                   y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 57772ea668..de5fa29471 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -19,31 +19,32 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
-   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
-   */
-  uint8_t temp[64 * 72];
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7) (+1 row to make it divisible by 4).
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+  const int im_stride = 64;
 
   const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
-  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
-   * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */
-  const int intermediate_height = h + vert_filter_taps;
+  // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+  // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.)
+  const int im_height = h + vert_filter_taps;
   const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* Filter starting border_offset lines back. The Neon implementation will
-   * ignore the given height and filter a multiple of 4 lines. Since this goes
-   * in to the temp buffer which has lots of extra room and is subsequently
-   * discarded this is safe if somewhat less than ideal.   */
-  vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp,
-                           w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
-                           intermediate_height);
+  // Filter starting border_offset rows back. The Neon implementation will
+  // ignore the given height and filter a multiple of 4 lines. Since this goes
+  // into the temporary buffer which has lots of extra room and is subsequently
+  // discarded this is safe if somewhat less than ideal.
+  vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+                           im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+                           y_step_q4, w, im_height);
 
-  /* Step into the temp buffer border_offset lines to get actual frame data. */
-  vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter,
-                          x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+  // Step into the temporary buffer border_offset rows to get actual frame data.
+  vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst,
+                          dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+                          y_step_q4, w, h);
 }
 
 void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
@@ -51,18 +52,21 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
-  uint8_t temp[64 * 72];
-  const int intermediate_height = h + 8;
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+  const int im_stride = 64;
+  const int im_height = h + SUBPEL_TAPS;
+  const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
-                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,
-                           intermediate_height);
-  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
-                              x_step_q4, y0_q4, y_step_q4, w, h);
+  // This implementation has the same issues as above. In addition, we only want
+  // to average the values after both passes.
+  vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+                           im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+                           y_step_q4, w, im_height);
+
+  vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride,
+                              dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+                              y_step_q4, w, h);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
deleted file mode 100644
index 9d754fde17..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const InterpKernel *filter, int x0_q4,
-                                int x_step_q4, int y0_q4, int y_step_q4, int w,
-                                int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
-   * maximum buffer size to 64 * (64 + 7). */
-  uint8_t temp[64 * 71];
-
-  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
-  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
-   * and vert_filter_taps / 2 lines post. */
-  const int intermediate_height = h + vert_filter_taps - 1;
-  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  vpx_convolve8_2d_horiz_neon_dotprod(
-      src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4,
-      x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
-
-  vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride,
-                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
-                                  h);
-}
-
-void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *filter, int x0_q4,
-                                    int x_step_q4, int y0_q4, int y_step_q4,
-                                    int w, int h) {
-  uint8_t temp[64 * 71];
-
-  /* Averaging convolution always uses an 8-tap filter. */
-  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
-  const int intermediate_height = h + 7;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
-                                      filter, x0_q4, x_step_q4, y0_q4,
-                                      y_step_q4, w, intermediate_height);
-
-  vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
-                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
deleted file mode 100644
index d7cbb09ea6..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *filter, int x0_q4,
-                             int x_step_q4, int y0_q4, int y_step_q4, int w,
-                             int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
-   * maximum buffer size to 64 * (64 + 7). */
-  uint8_t temp[64 * 71];
-
-  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
-  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
-   * and vert_filter_taps / 2 lines post. */
-  const int intermediate_height = h + vert_filter_taps - 1;
-  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride,
-                                   temp, w, filter, x0_q4, x_step_q4, y0_q4,
-                                   y_step_q4, w, intermediate_height);
-
-  vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride,
-                               filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
-                               h);
-}
-
-void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const InterpKernel *filter, int x0_q4,
-                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
-                                 int h) {
-  uint8_t temp[64 * 71];
-
-  /* Averaging convolution always uses an 8-tap filter. */
-  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
-  const int intermediate_height = h + 7;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w,
-                                   filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
-                                   w, intermediate_height);
-
-  vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter,
-                                   x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
new file mode 100644
index 0000000000..bf9f18c7e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Some very useful instructions are exclusive to the SVE2 instruction set.
+// However, we can access these instructions from a predominantly Neon context
+// by making use of the Neon-SVE bridge intrinsics to reinterpret Neon vectors
+// as SVE vectors - with the high part of the SVE vector (if it's longer than
+// 128 bits) being "don't care".
+
+static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1,
+                                     uint16x8_t tbl) {
+  svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+                                      svset_neonq_s16(svundef_s16(), s1));
+  return svget_neonq_s16(
+      svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif  // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
new file mode 100644
index 0000000000..48534fb70e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Dot product instructions operating on 16-bit input elements are exclusive to
+// the SVE instruction set. However, we can access these instructions from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+
+static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x,
+                                      uint16x8_t y) {
+  return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+                                   svset_neonq_u16(svundef_u16(), x),
+                                   svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+  return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+                                   svset_neonq_s16(svundef_s16(), x),
+                                   svset_neonq_s16(svundef_s16(), y)));
+}
+
+#define vpx_dotq_lane_s16(acc, x, y, lane)                            \
+  svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \
+                                 svset_neonq_s16(svundef_s16(), x),   \
+                                 svset_neonq_s16(svundef_s16(), y), lane))
+
+static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) {
+  return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data),
+                                   svset_neonq_u16(svundef_u16(), indices)));
+}
+
+#endif  // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index b8e3c5e540..9bd5ec285c 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -20,263 +20,271 @@
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
 #include "vpx_ports/mem.h"
 
-static INLINE void scaledconvolve_horiz_w4(
+static INLINE void scaledconvolve_horiz_neon(
     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-    const int x0_q4, const int x_step_q4, const int w, const int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
-  int x, y, z;
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filter,
+    const int x0_q4, const int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
 
   src -= SUBPEL_TAPS / 2 - 1;
 
-  y = h;
-  do {
-    int x_q4 = x0_q4;
-    x = 0;
+  if (w == 4) {
     do {
-      // process 4 src_x steps
-      for (z = 0; z < 4; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      int x_q4 = x0_q4;
+
+      // Process a 4x4 tile.
+      for (int r = 0; r < 4; ++r) {
+        const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
+
         if (x_q4 & SUBPEL_MASK) {
-          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
-          uint8x8_t s[8], d;
-          int16x8_t ss[4];
-          int16x4_t t[8], tt;
-
-          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
-          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
-
-          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
-          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
-          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
-          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
-          t[0] = vget_low_s16(ss[0]);
-          t[1] = vget_low_s16(ss[1]);
-          t[2] = vget_low_s16(ss[2]);
-          t[3] = vget_low_s16(ss[3]);
-          t[4] = vget_high_s16(ss[0]);
-          t[5] = vget_high_s16(ss[1]);
-          t[6] = vget_high_s16(ss[2]);
-          t[7] = vget_high_s16(ss[3]);
-
-          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
-                           filters);
-          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
-          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+          const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+          uint8x8_t t0, t1, t2, t3;
+          load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+          transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+          int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+          int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+          int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+          int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+          int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+          int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+          int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+          int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+          int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+          uint8x8_t d0 =
+              vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+          store_u8_4x1(&temp[4 * r], d0);
         } else {
-          int i;
-          for (i = 0; i < 4; ++i) {
-            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          // Memcpy for non-subpel locations.
+          s += SUBPEL_TAPS / 2 - 1;
+
+          for (int c = 0; c < 4; ++c) {
+            temp[r * 4 + c] = s[c * src_stride];
           }
         }
         x_q4 += x_step_q4;
       }
 
-      // transpose the 4x4 filters values back to dst
-      {
-        const uint8x8x4_t d4 = vld4_u8(temp);
-        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
-                      vreinterpret_u32_u8(d4.val[0]), 0);
-        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
-                      vreinterpret_u32_u8(d4.val[1]), 0);
-        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
-                      vreinterpret_u32_u8(d4.val[2]), 0);
-        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
-                      vreinterpret_u32_u8(d4.val[3]), 0);
-      }
-      x += 4;
-    } while (x < w);
+      // Transpose the 4x4 result tile and store.
+      uint8x8_t d01 = vld1_u8(temp + 0);
+      uint8x8_t d23 = vld1_u8(temp + 8);
 
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-    y -= 4;
-  } while (y > 0);
-}
+      transpose_u8_4x4(&d01, &d23);
 
-static INLINE void scaledconvolve_horiz_w8(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-    const int x0_q4, const int x_step_q4, const int w, const int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
+      store_u8_4x1(dst + 0 * dst_stride, d01);
+      store_u8_4x1(dst + 1 * dst_stride, d23);
+      store_u8_4x1_high(dst + 2 * dst_stride, d01);
+      store_u8_4x1_high(dst + 3 * dst_stride, d23);
 
-  // This function processes 8x8 areas. The intermediate height is not always
-  // a multiple of 8, so force it to be a multiple of 8 here.
-  y = (h + 7) & ~7;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+    return;
+  }
 
   do {
     int x_q4 = x0_q4;
-    x = 0;
+    uint8_t *d = dst;
+    int width = w;
+
     do {
-      uint8x8_t d[8];
-      // process 8 src_x steps
-      for (z = 0; z < 8; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      // Process an 8x8 tile.
+      for (int r = 0; r < 8; ++r) {
+        const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
 
         if (x_q4 & SUBPEL_MASK) {
-          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
-          uint8x8_t s[8];
-          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
-                      &s[5], &s[6], &s[7]);
-          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                           &s[7]);
-          d[0] = scale_filter_8(s, filters);
-          vst1_u8(&temp[8 * z], d[0]);
+          const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+          uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+          int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+          int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+          int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+          int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+          int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+          int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+          vst1_u8(&temp[r * 8], d0);
         } else {
-          int i;
-          for (i = 0; i < 8; ++i) {
-            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          // Memcpy for non-subpel locations.
+          s += SUBPEL_TAPS / 2 - 1;
+
+          for (int c = 0; c < 8; ++c) {
+            temp[r * 8 + c] = s[c * src_stride];
           }
         }
         x_q4 += x_step_q4;
       }
 
-      // transpose the 8x8 filters values back to dst
-      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
-                  &d[7]);
-      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
-      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
-      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
-      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
-      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
-      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
-      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
-      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
-      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
-      x += 8;
-    } while (x < w);
-
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-  } while (y -= 8);
-}
+      // Transpose the 8x8 result tile and store.
+      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+      load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
-static INLINE void scaledconvolve_vert_w4(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
-    const int y0_q4, const int y_step_q4, const int w, const int h) {
-  int y;
-  int y_q4 = y0_q4;
+      transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  y = h;
-  do {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
-    if (y_q4 & SUBPEL_MASK) {
-      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-      uint8x8_t s[8], d;
-      int16x4_t t[8], tt;
-
-      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                  &s[6], &s[7]);
-      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
-      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
-      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
-      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
-      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
-      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
-      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
-      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
-
-      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
-      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
-    } else {
-      memcpy(dst, &src_y[3 * src_stride], w);
-    }
+      d += 8;
+      width -= 8;
+    } while (width != 0);
 
-    dst += dst_stride;
-    y_q4 += y_step_q4;
-  } while (--y);
+    src += 8 * src_stride;
+    dst += 8 * dst_stride;
+    h -= 8;
+  } while (h > 0);
 }
 
-static INLINE void scaledconvolve_vert_w8(
+static INLINE void scaledconvolve_vert_neon(
     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
-    const int y0_q4, const int y_step_q4, const int w, const int h) {
-  int y;
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filter,
+    const int y0_q4, const int y_step_q4, int w, int h) {
   int y_q4 = y0_q4;
 
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  y = h;
-  do {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    if (y_q4 & SUBPEL_MASK) {
-      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-      uint8x8_t s[8], d;
-      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                  &s[6], &s[7]);
-      d = scale_filter_8(s, filters);
-      vst1_u8(dst, d);
-    } else {
-      memcpy(dst, &src_y[3 * src_stride], w);
-    }
-    dst += dst_stride;
-    y_q4 += y_step_q4;
-  } while (--y);
-}
+  if (w == 4) {
+    do {
+      const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
 
-static INLINE void scaledconvolve_vert_w16(
-    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
-    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
-    const int y0_q4, const int y_step_q4, const int w, const int h) {
-  int x, y;
-  int y_q4 = y0_q4;
+      if (y_q4 & SUBPEL_MASK) {
+        const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+        int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+        int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+        int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+
+        int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        uint8x8_t d0 =
+            vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+        store_u8_4x1(dst, d0);
+      } else {
+        // Memcpy for non-subpel locations.
+        memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4);
+      }
+
+      y_q4 += y_step_q4;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  if (w == 8) {
+    do {
+      const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+      if (y_q4 & SUBPEL_MASK) {
+        const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+        vst1_u8(dst, d0);
+      } else {
+        // Memcpy for non-subpel locations.
+        memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8);
+      }
+
+      y_q4 += y_step_q4;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
 
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  y = h;
   do {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    uint8_t *d = dst;
+    int width = w;
+
     if (y_q4 & SUBPEL_MASK) {
-      x = 0;
       do {
-        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-        uint8x16_t ss[8];
-        uint8x8_t s[8], d[2];
-        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
-                     &ss[5], &ss[6], &ss[7]);
-        s[0] = vget_low_u8(ss[0]);
-        s[1] = vget_low_u8(ss[1]);
-        s[2] = vget_low_u8(ss[2]);
-        s[3] = vget_low_u8(ss[3]);
-        s[4] = vget_low_u8(ss[4]);
-        s[5] = vget_low_u8(ss[5]);
-        s[6] = vget_low_u8(ss[6]);
-        s[7] = vget_low_u8(ss[7]);
-        d[0] = scale_filter_8(s, filters);
-
-        s[0] = vget_high_u8(ss[0]);
-        s[1] = vget_high_u8(ss[1]);
-        s[2] = vget_high_u8(ss[2]);
-        s[3] = vget_high_u8(ss[3]);
-        s[4] = vget_high_u8(ss[4]);
-        s[5] = vget_high_u8(ss[5]);
-        s[6] = vget_high_u8(ss[6]);
-        s[7] = vget_high_u8(ss[7]);
-        d[1] = scale_filter_8(s, filters);
-        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
-        src_y += 16;
-        x += 16;
-      } while (x < w);
+        const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+        uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+        s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+        s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+        s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+        s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+        s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4)));
+        s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5)));
+        s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6)));
+        s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7)));
+
+        s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+        s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+        s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+        s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+        s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4)));
+        s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5)));
+        s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6)));
+        s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7)));
+
+        uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
+                                   s6[0], s7[0], filter);
+        uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
+                                   s6[1], s7[1], filter);
+
+        vst1q_u8(d, vcombine_u8(d0, d1));
+
+        s += 16;
+        d += 16;
+        width -= 16;
+      } while (width != 0);
     } else {
-      memcpy(dst, &src_y[3 * src_stride], w);
+      // Memcpy for non-subpel locations.
+      s += (SUBPEL_TAPS / 2 - 1) * src_stride;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s);
+        vst1q_u8(d, s0);
+        s += 16;
+        d += 16;
+        width -= 16;
+      } while (width != 0);
     }
-    dst += dst_stride;
+
     y_q4 += y_step_q4;
-  } while (--y);
+    dst += dst_stride;
+  } while (--h != 0);
 }
 
 void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // Fixed size intermediate buffer, im_block, places limits on parameters.
   // 2d filtering proceeds in 2 steps:
   //   (1) Interpolate horizontally into an intermediate buffer, temp.
   //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
+  // Deriving the maximum number of rows in the im_block buffer (135):
   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
   // --Largest block size is 64x64 pixels.
   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
@@ -288,33 +296,20 @@ void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   // When calling in frame scaling function, the smallest scaling factor is x1/4
   // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
   // big enough.
-  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
-  const int intermediate_height =
+  DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]);
+  const int im_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+  const ptrdiff_t im_stride = 64;
 
   assert(w <= 64);
   assert(h <= 64);
   assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
   assert(x_step_q4 <= 64);
 
-  if (w >= 8) {
-    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
-                            intermediate_height);
-  } else {
-    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
-                            intermediate_height);
-  }
+  scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, im_block, im_stride, filter, x0_q4,
+                            x_step_q4, w, im_height);
 
-  if (w >= 16) {
-    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, filter, y0_q4, y_step_q4, w, h);
-  } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, filter, y0_q4, y_step_q4, w, h);
-  } else {
-    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, filter, y0_q4, y_step_q4, w, h);
-  }
+  scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4,
+                           y_step_q4, w, h);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
index 2bee91f449..916dc62cef 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -112,7 +112,8 @@ DSP_SRCS-$(HAVE_AVX2)  += x86/highbd_convolve_avx2.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve_copy_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve_avg_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve8_neon.c
-DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_SVE)   += arm/highbd_vpx_convolve8_sve.c
+DSP_SRCS-$(HAVE_SVE2)  += arm/highbd_vpx_convolve8_sve2.c
 endif
 
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
@@ -139,9 +140,7 @@ DSP_SRCS-yes += arm/vpx_convolve8_neon.c
 DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
 DSP_SRCS-yes += arm/vpx_convolve_neon.c
 DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c
-DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c
-DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
@@ -374,6 +373,7 @@ DSP_SRCS-yes            += sad.c
 DSP_SRCS-yes            += subtract.c
 DSP_SRCS-yes            += sum_squares.c
 DSP_SRCS-$(HAVE_NEON)   += arm/sum_squares_neon.c
+DSP_SRCS-$(HAVE_SVE)    += arm/sum_squares_sve.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sum_squares_msa.c
 
@@ -454,6 +454,8 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_sse_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD)   += arm/highbd_variance_neon_dotprod.c
+DSP_SRCS-$(HAVE_SVE)    += arm/highbd_variance_sve.c
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
index 030c456d39..2b8c656afb 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -12,4 +12,4 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
+void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 18087e25d9..f40f85c036 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -427,19 +427,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
 
   add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
-  specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
+  specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64";
 
   add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
-  specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
+  specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64";
 
   add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
 
   add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
-  specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
+  specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64";
 
   add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
-  specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
+  specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64";
 }  # CONFIG_VP9_HIGHBITDEPTH
 
 if (vpx_config("CONFIG_VP9") eq "yes") {
@@ -1009,7 +1009,7 @@ add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride,
 specialize qw/vpx_sad_skip_4x4x4d neon/;
 
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
+specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/;
 
 #
 # Structured Similarity (SSIM)
@@ -1411,163 +1411,163 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance32x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
+  specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x4 neon/;
+  specialize qw/vpx_highbd_12_variance8x4 neon sve/;
   add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance4x8 neon/;
+  specialize qw/vpx_highbd_12_variance4x8 neon sve/;
   add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance4x4 neon/;
+  specialize qw/vpx_highbd_12_variance4x4 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
+  specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x4 neon/;
+  specialize qw/vpx_highbd_10_variance8x4 neon sve/;
   add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance4x8 neon/;
+  specialize qw/vpx_highbd_10_variance4x8 neon sve/;
   add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance4x4 neon/;
+  specialize qw/vpx_highbd_10_variance4x4 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
+  specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x4 neon/;
+  specialize qw/vpx_highbd_8_variance8x4 neon sve/;
   add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance4x8 neon/;
+  specialize qw/vpx_highbd_8_variance4x8 neon sve/;
   add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance4x4 neon/;
+  specialize qw/vpx_highbd_8_variance4x4 neon sve/;
 
   add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
+  specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/;
 
   add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
+  specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/;
 
   add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
+  specialize qw/vpx_highbd_10_get16x16var sse2 neon sve/;
 
   add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
+  specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/;
 
   add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
+  specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/;
 
   add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
+  specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x8 neon/;
+  specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
   add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x16 neon/;
+  specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
   add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+  specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
 
   add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
+  specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse16x8 neon/;
+  specialize qw/vpx_highbd_10_mse16x8 neon sve/;
   add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse8x16 neon/;
+  specialize qw/vpx_highbd_10_mse8x16 neon sve/;
   add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
+  specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
+  specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/;
 
   add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse16x8 neon/;
+  specialize qw/vpx_highbd_12_mse16x8 neon sve/;
   add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse8x16 neon/;
+  specialize qw/vpx_highbd_12_mse8x16 neon sve/;
   add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
+  specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/;
 
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
   specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
index 0cddcb6991..eb8ff06cd7 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
@@ -28,7 +28,6 @@ extern "C" {
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
 static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
-  assert(filter[3] != 128);
   if (filter[0] | filter[7]) {
     return 8;
   }
diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
index 539d09bb39..eba12d312a 100644
--- a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
+++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
@@ -15,7 +15,7 @@
 #include <sys/sysctl.h>
 #endif
 
-#if !CONFIG_RUNTIME_CPU_DETECT
+#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
 
 static int arm_get_cpu_caps(void) {
   // This function should actually be a no-op. There is no way to adjust any of
@@ -28,7 +28,7 @@ static int arm_get_cpu_caps(void) {
   return flags;
 }
 
-#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT
+#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
 
 // sysctlbyname() parameter documentation for instruction set characteristics:
 // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
@@ -99,14 +99,17 @@ static int arm_get_cpu_caps(void) {
 // hwcap values are not defined should not prevent features from being enabled.
 #define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
 #define VPX_AARCH64_HWCAP_SVE (1 << 22)
+#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)
 #define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
 
 static int arm_get_cpu_caps(void) {
   int flags = 0;
+#if HAVE_NEON_DOTPROD || HAVE_SVE
   unsigned long hwcap = getauxval(AT_HWCAP);
-#if HAVE_NEON_I8MM
+#endif  // HAVE_NEON_DOTPROD || HAVE_SVE
+#if HAVE_NEON_I8MM || HAVE_SVE2
   unsigned long hwcap2 = getauxval(AT_HWCAP2);
-#endif  // HAVE_NEON_I8MM
+#endif  // HAVE_NEON_I8MM || HAVE_SVE2
 #if HAVE_NEON
   flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
 #endif  // HAVE_NEON
@@ -125,6 +128,11 @@ static int arm_get_cpu_caps(void) {
     flags |= HAS_SVE;
   }
 #endif  // HAVE_SVE
+#if HAVE_SVE2
+  if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) {
+    flags |= HAS_SVE2;
+  }
+#endif  // HAVE_SVE2
   return flags;
 }
 
@@ -195,5 +203,10 @@ int arm_cpu_caps(void) {
     flags &= ~HAS_SVE;
   }
 
+  // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available.
+  if (!(flags & HAS_SVE)) {
+    flags &= ~HAS_SVE2;
+  }
+
   return flags;
 }
diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h
index 39365d18ee..814c3cc408 100644
--- a/media/libvpx/libvpx/vpx_ports/arm.h
+++ b/media/libvpx/libvpx/vpx_ports/arm.h
@@ -25,6 +25,8 @@ extern "C" {
 #define HAS_NEON_I8MM (1 << 2)
 // Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
 #define HAS_SVE (1 << 3)
+// Armv9.0-A SVE2 instructions.
+#define HAS_SVE2 (1 << 4)
 
 int arm_cpu_caps(void);
 
diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
index f1036b98ed..79b98a75f1 100644
--- a/media/libvpx/libvpx/vpx_ports/emms_mmx.c
+++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
@@ -12,4 +12,4 @@
 
 #include "vpx_ports/system_state.h"
 
-void vpx_clear_system_state() { _mm_empty(); }
+void vpx_clear_system_state(void) { _mm_empty(); }
diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h
index 5eccfe8f50..ee9e095633 100644
--- a/media/libvpx/libvpx/vpx_ports/mem.h
+++ b/media/libvpx/libvpx/vpx_ports/mem.h
@@ -23,7 +23,13 @@
 #define DECLARE_ALIGNED(n, typ, val) typ val
 #endif
 
-#if HAVE_NEON && defined(_MSC_VER)
+#if defined(__has_builtin)
+#define VPX_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define VPX_HAS_BUILTIN(x) 0
+#endif
+
+#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__)
 #define __builtin_prefetch(x)
 #endif
 
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h
index d8a8ed89fe..d33eff4397 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_once.h
+++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h
@@ -91,29 +91,6 @@ static void once(void (*func)(void)) {
   return;
 }
 
-#elif CONFIG_MULTITHREAD && defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>
-static void once(void (*func)(void)) {
-  static volatile int done;
-
-  /* If the initialization is complete, return early. */
-  if (done) return;
-
-  /* Causes all other threads in the process to block themselves
-   * and give up their time slice.
-   */
-  DosEnterCritSec();
-
-  if (!done) {
-    func();
-    done = 1;
-  }
-
-  /* Restores normal thread dispatching for the current process. */
-  DosExitCritSec();
-}
-
 #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
 #include <pthread.h>
 static void once(void (*func)(void)) {
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
index dc4d9593a8..706b0770c8 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -12,4 +12,4 @@
 #include "./vpx_scale_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vpx_scale_rtcd() { once(setup_rtcd_internal); }
+void vpx_scale_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
new file mode 100644
index 0000000000..cdd18d0f30
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
@@ -0,0 +1,157 @@
+// Copyright 2024 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// pthread.h wrapper
+
+#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_
+#define VPX_VPX_UTIL_VPX_PTHREAD_H_
+
+#include "./vpx_config.h"
+
+#if CONFIG_MULTITHREAD
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+// Prevent leaking max/min macros.
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <errno.h>    // NOLINT
+#include <process.h>  // NOLINT
+#include <stddef.h>   // NOLINT
+#include <windows.h>  // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#if defined(__GNUC__) && \
+    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREADFN unsigned int __stdcall
+#endif
+#define THREAD_EXIT_SUCCESS 0
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+                                 unsigned int(__stdcall *start)(void *),
+                                 void *arg) {
+  (void)attr;
+#ifdef USE_CREATE_THREAD
+  *thread = CreateThread(NULL,          /* lpThreadAttributes */
+                         0,             /* dwStackSize */
+                         start, arg, 0, /* dwStackSize */
+                         NULL);         /* lpThreadId */
+#else
+  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
+                                      0,             /* unsigned stack_size */
+                                      start, arg, 0, /* unsigned initflag */
+                                      NULL);         /* unsigned *thrdaddr */
+#endif
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+              WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+  return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  (void)condition;
+  return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void *cond_attr) {
+  (void)cond_attr;
+  InitializeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  WakeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  WakeAllConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;
+  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+  return !ok;
+}
+#else                 // _WIN32
+#include <pthread.h>  // NOLINT
+#define THREADFN void *
+#define THREAD_EXIT_SUCCESS NULL
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // CONFIG_MULTITHREAD
+
+#endif  // VPX_VPX_UTIL_VPX_PTHREAD_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c
index 04c5fb6f26..0d0e2f5766 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.c
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c
@@ -12,10 +12,18 @@
 // Original source:
 //  https://chromium.googlesource.com/webm/libwebp
 
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include <assert.h>
 #include <string.h>  // for memset()
+#include "./vpx_config.h"
 #include "./vpx_thread.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
 
 #if CONFIG_MULTITHREAD
 
@@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker);  // Forward declaration.
 
 static THREADFN thread_loop(void *ptr) {
   VPxWorker *const worker = (VPxWorker *)ptr;
-  int done = 0;
-  while (!done) {
-    pthread_mutex_lock(&worker->impl_->mutex_);
-    while (worker->status_ == OK) {  // wait in idling mode
+#ifdef __APPLE__
+  if (worker->thread_name != NULL) {
+    // Apple's version of pthread_setname_np takes one argument and operates on
+    // the current thread only. The maximum size of the thread_name buffer was
+    // noted in the Chromium source code and was confirmed by experiments. If
+    // thread_name is too long, pthread_setname_np returns -1 with errno
+    // ENAMETOOLONG (63).
+    char thread_name[64];
+    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+    thread_name[sizeof(thread_name) - 1] = '\0';
+    pthread_setname_np(thread_name);
+  }
+#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
+  if (worker->thread_name != NULL) {
+    // Linux and Android require names (with nul) fit in 16 chars, otherwise
+    // pthread_setname_np() returns ERANGE (34).
+    char thread_name[16];
+    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+    thread_name[sizeof(thread_name) - 1] = '\0';
+    pthread_setname_np(pthread_self(), thread_name);
+  }
+#endif
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  for (;;) {
+    while (worker->status_ == VPX_WORKER_STATUS_OK) {  // wait in idling mode
       pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
-    if (worker->status_ == WORK) {
+    if (worker->status_ == VPX_WORKER_STATUS_WORKING) {
+      // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread
+      // doesn't change worker->status_ and will wait until the worker changes
+      // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the
+      // worker can safely call execute() without holding worker->impl_->mutex_.
+      // When the worker reacquires worker->impl_->mutex_, worker->status_ must
+      // still be VPX_WORKER_STATUS_WORKING.
+      pthread_mutex_unlock(&worker->impl_->mutex_);
       execute(worker);
-      worker->status_ = OK;
-    } else if (worker->status_ == NOT_OK) {  // finish the worker
-      done = 1;
+      pthread_mutex_lock(&worker->impl_->mutex_);
+      assert(worker->status_ == VPX_WORKER_STATUS_WORKING);
+      worker->status_ = VPX_WORKER_STATUS_OK;
+      // signal to the main thread that we're done (for sync())
+      pthread_cond_signal(&worker->impl_->condition_);
+    } else {
+      assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);  // finish the worker
+      break;
     }
-    // signal to the main thread that we're done (for sync())
-    pthread_cond_signal(&worker->impl_->condition_);
-    pthread_mutex_unlock(&worker->impl_->mutex_);
   }
-  return THREAD_RETURN(NULL);  // Thread is finished
+  pthread_mutex_unlock(&worker->impl_->mutex_);
+  return THREAD_EXIT_SUCCESS;  // Thread is finished
 }
 
 // main thread state control
@@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
   if (worker->impl_ == NULL) return;
 
   pthread_mutex_lock(&worker->impl_->mutex_);
-  if (worker->status_ >= OK) {
+  if (worker->status_ >= VPX_WORKER_STATUS_OK) {
     // wait for the worker to finish
-    while (worker->status_ != OK) {
+    while (worker->status_ != VPX_WORKER_STATUS_OK) {
       pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
     // assign new status and release the working thread if needed
-    if (new_status != OK) {
+    if (new_status != VPX_WORKER_STATUS_OK) {
       worker->status_ = new_status;
       pthread_cond_signal(&worker->impl_->condition_);
     }
@@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
 
 static void init(VPxWorker *const worker) {
   memset(worker, 0, sizeof(*worker));
-  worker->status_ = NOT_OK;
+  worker->status_ = VPX_WORKER_STATUS_NOT_OK;
 }
 
 static int sync(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  change_state(worker, OK);
+  change_state(worker, VPX_WORKER_STATUS_OK);
 #endif
-  assert(worker->status_ <= OK);
+  assert(worker->status_ <= VPX_WORKER_STATUS_OK);
   return !worker->had_error;
 }
 
 static int reset(VPxWorker *const worker) {
   int ok = 1;
   worker->had_error = 0;
-  if (worker->status_ < OK) {
+  if (worker->status_ < VPX_WORKER_STATUS_OK) {
 #if CONFIG_MULTITHREAD
     worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
     if (worker->impl_ == NULL) {
@@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) {
     }
     pthread_mutex_lock(&worker->impl_->mutex_);
     ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
-    if (ok) worker->status_ = OK;
+    if (ok) worker->status_ = VPX_WORKER_STATUS_OK;
     pthread_mutex_unlock(&worker->impl_->mutex_);
     if (!ok) {
       pthread_mutex_destroy(&worker->impl_->mutex_);
@@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) {
       return 0;
     }
 #else
-    worker->status_ = OK;
+    worker->status_ = VPX_WORKER_STATUS_OK;
 #endif
-  } else if (worker->status_ > OK) {
+  } else if (worker->status_ > VPX_WORKER_STATUS_OK) {
     ok = sync(worker);
   }
-  assert(!ok || (worker->status_ == OK));
+  assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK));
   return ok;
 }
 
@@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) {
 
 static void launch(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  change_state(worker, WORK);
+  change_state(worker, VPX_WORKER_STATUS_WORKING);
 #else
   execute(worker);
 #endif
@@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) {
 static void end(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
   if (worker->impl_ != NULL) {
-    change_state(worker, NOT_OK);
+    change_state(worker, VPX_WORKER_STATUS_NOT_OK);
     pthread_join(worker->impl_->thread_, NULL);
     pthread_mutex_destroy(&worker->impl_->mutex_);
     pthread_cond_destroy(&worker->impl_->condition_);
@@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) {
     worker->impl_ = NULL;
   }
 #else
-  worker->status_ = NOT_OK;
+  worker->status_ = VPX_WORKER_STATUS_NOT_OK;
   assert(worker->impl_ == NULL);
 #endif
-  assert(worker->status_ == NOT_OK);
+  assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);
 }
 
 //------------------------------------------------------------------------------
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
index 6d308e949b..11a1d74387 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.h
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -15,370 +15,22 @@
 #ifndef VPX_VPX_UTIL_VPX_THREAD_H_
 #define VPX_VPX_UTIL_VPX_THREAD_H_
 
-#include "./vpx_config.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
-
-#if CONFIG_MULTITHREAD
-
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-#include <errno.h>    // NOLINT
-#include <process.h>  // NOLINT
-#include <windows.h>  // NOLINT
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
-typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
-  HANDLE waiting_sem_;
-  HANDLE received_sem_;
-  HANDLE signal_event_;
-} pthread_cond_t;
-#endif  // _WIN32_WINNT >= 0x600
-
-#ifndef WINAPI_FAMILY_PARTITION
-#define WINAPI_PARTITION_DESKTOP 1
-#define WINAPI_FAMILY_PARTITION(x) x
-#endif
-
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define USE_CREATE_THREAD
-#endif
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-// _beginthreadex requires __stdcall
-#if defined(__GNUC__) && \
-    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREADFN unsigned int __stdcall
-#endif
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-#if _WIN32_WINNT >= 0x0501  // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
-  WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
-                                 unsigned int(__stdcall *start)(void *),
-                                 void *arg) {
-  (void)attr;
-#ifdef USE_CREATE_THREAD
-  *thread = CreateThread(NULL,          /* lpThreadAttributes */
-                         0,             /* dwStackSize */
-                         start, arg, 0, /* dwStackSize */
-                         NULL);         /* lpThreadId */
-#else
-  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
-                                      0,             /* unsigned stack_size */
-                                      start, arg, 0, /* unsigned initflag */
-                                      NULL);         /* unsigned *thrdaddr */
-#endif
-  if (*thread == NULL) return 1;
-  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
-  return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
-  (void)value_ptr;
-  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
-          CloseHandle(thread) == 0);
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
-                                     void *mutexattr) {
-  (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
-  InitializeCriticalSection(mutex);
-#endif
-  return 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
-  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
-  EnterCriticalSection(mutex);
-  return 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
-  LeaveCriticalSection(mutex);
-  return 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
-  DeleteCriticalSection(mutex);
-  return 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  (void)condition;
-#else
-  ok &= (CloseHandle(condition->waiting_sem_) != 0);
-  ok &= (CloseHandle(condition->received_sem_) != 0);
-  ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
-                                    void *cond_attr) {
-  (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  InitializeConditionVariable(condition);
-#else
-  condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
-  if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
-      condition->signal_event_ == NULL) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-#endif
-  return 0;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  WakeAllConditionVariable(condition);
-#else
-  while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok &= SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  WakeConditionVariable(condition);
-#else
-  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok = SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
-                                    pthread_mutex_t *const mutex) {
-  int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
-  // note that there is a consumer available so the signal isn't dropped in
-  // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
-  // now unlock the mutex so pthread_cond_signal may be issued
-  pthread_mutex_unlock(mutex);
-  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
-        WAIT_OBJECT_0);
-  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
-  pthread_mutex_lock(mutex);
-#endif
-  return !ok;
-}
-
-#elif defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>  // NOLINT
-
-#include <errno.h>        // NOLINT
-#include <stdlib.h>       // NOLINT
-#include <sys/builtin.h>  // NOLINT
-
-#if defined(__STRICT_ANSI__)
-// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here.
-int _beginthread(void (*)(void *), void *, unsigned, void *);
-#endif
-
-#define pthread_t TID
-#define pthread_mutex_t HMTX
-
-typedef struct {
-  HEV event_sem_;
-  HEV ack_sem_;
-  volatile unsigned wait_count_;
-} pthread_cond_t;
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#define THREADFN void *
-#define THREAD_RETURN(val) (val)
-
-typedef struct {
-  void *(*start_)(void *);
-  void *arg_;
-} thread_arg;
-
-static void thread_start(void *arg) {
-  thread_arg targ = *(thread_arg *)arg;
-  free(arg);
-
-  targ.start_(targ.arg_);
-}
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
-                                 void *(*start)(void *), void *arg) {
-  int tid;
-  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
-  if (targ == NULL) return 1;
-
-  (void)attr;
-
-  targ->start_ = start;
-  targ->arg_ = arg;
-  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
-  if (tid == -1) {
-    free(targ);
-    return 1;
-  }
-
-  *thread = tid;
-  return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
-  (void)value_ptr;
-  return DosWaitThread(&thread, DCWW_WAIT) != 0;
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
-                                     void *mutexattr) {
-  (void)mutexattr;
-  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
-  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
-  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
-  return DosReleaseMutexSem(*mutex) != 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
-  return DosCloseMutexSem(*mutex) != 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-  ok &= DosCloseEventSem(condition->event_sem_) == 0;
-  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
-  return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
-                                    void *cond_attr) {
-  int ok = 1;
-  (void)cond_attr;
-
-  ok &=
-      DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
-  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
-  if (!ok) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-  condition->wait_count_ = 0;
-  return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-
-  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
-    ok &= DosPostEventSem(condition->event_sem_) == 0;
-    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
-  }
-
-  return !ok;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
-  int ok = 1;
-
-  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
-    ok &= pthread_cond_signal(condition) == 0;
-
-  return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
-                                    pthread_mutex_t *const mutex) {
-  int ok = 1;
-
-  __atomic_increment(&condition->wait_count_);
-
-  ok &= pthread_mutex_unlock(mutex) == 0;
-
-  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
-
-  __atomic_decrement(&condition->wait_count_);
-
-  ok &= DosPostEventSem(condition->ack_sem_) == 0;
-
-  pthread_mutex_lock(mutex);
-
-  return !ok;
-}
-#else                 // _WIN32
-#include <pthread.h>  // NOLINT
-#define THREADFN void *
-#define THREAD_RETURN(val) val
-#endif
-
-#endif  // CONFIG_MULTITHREAD
+#define MAX_NUM_THREADS 64
 
 // State of the worker thread object
 typedef enum {
-  NOT_OK = 0,  // object is unusable
-  OK,          // ready to work
-  WORK         // busy finishing the current task
+  VPX_WORKER_STATUS_NOT_OK = 0,  // object is unusable
+  VPX_WORKER_STATUS_OK,          // ready to work
+  VPX_WORKER_STATUS_WORKING      // busy finishing the current task
 } VPxWorkerStatus;
 
 // Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
 typedef int (*VPxWorkerHook)(void *, void *);
 
 // Platform-dependent implementation details for the worker.
@@ -388,10 +40,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl;
 typedef struct {
   VPxWorkerImpl *impl_;
   VPxWorkerStatus status_;
+  // Thread name for the debugger. If not NULL, must point to a string that
+  // outlives the worker thread. For portability, use a name <= 15 characters
+  // long (not including the terminating NUL character).
+  const char *thread_name;
   VPxWorkerHook hook;  // hook to call
   void *data1;         // first argument passed to 'hook'
   void *data2;         // second argument passed to 'hook'
-  int had_error;       // return value of the last call to 'hook'
+  int had_error;       // true if a call to 'hook' returned false
 } VPxWorker;
 
 // The interface for all thread-worker related functions. All these functions
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
index 1162714956..948e6d6f89 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_util.mk
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -10,6 +10,7 @@
 
 UTIL_SRCS-yes += vpx_atomics.h
 UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_pthread.h
 UTIL_SRCS-yes += vpx_thread.c
 UTIL_SRCS-yes += vpx_thread.h
 UTIL_SRCS-yes += endian_inl.h
diff --git a/media/libvpx/missing_header.patch b/media/libvpx/missing_header.patch
new file mode 100644
index 0000000000..02b77170ee
--- /dev/null
+++ b/media/libvpx/missing_header.patch
@@ -0,0 +1,12 @@
+Add missing header for EBUSY
+
+--- a/vpx_util/vpx_pthread.h
++++ b/vpx_util/vpx_pthread.h
+@@ -26,6 +26,7 @@ extern "C" {
+ #define NOMINMAX
+ #undef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
++#include <errno.h>    // NOLINT
+ #include <process.h>  // NOLINT
+ #include <stddef.h>   // NOLINT
+ #include <windows.h>  // NOLINT
diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
index 582bc6fd5d..635b5d0fdd 100644
--- a/media/libvpx/moz.build
+++ b/media/libvpx/moz.build
@@ -72,7 +72,10 @@ elif CONFIG['TARGET_CPU'] == 'arm':
         ]
 elif CONFIG['TARGET_CPU'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT':
     EXPORTS.vpx += files['ARM64_EXPORTS']
-    SOURCES += files['ARM64_SOURCES']
+    # Bug 1885585: clang on win/aarch64 cannot compile SVInt8_t type for now.
+    SOURCES += [
+            f for f in files['ARM64_SOURCES'] if not f.endswith('_sve.c')
+        ]
     ASFLAGS += [ '-I%s/media/libvpx/config/win/aarch64/' % TOPSRCDIR ]
     LOCAL_INCLUDES += [ '/media/libvpx/config/win/aarch64/' ]
     SOURCES += [ '/media/libvpx/config/win/aarch64/vpx_config.c' ]
@@ -125,6 +128,10 @@ for f in SOURCES:
             SOURCES[f].flags += ['-march=armv8.2-a+dotprod']
         if 'neon_i8mm.c' in f:
             SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm']
+        if 'sve.c' in f:
+            SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm+sve']
+        if 'sve2.c' in f:
+            SOURCES[f].flags += ['-march=armv9-a+sve2']
 
 # Suppress warnings in third-party code.
 CFLAGS += [
diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml
index 17704a1905..0b3ec52482 100644
--- a/media/libvpx/moz.yaml
+++ b/media/libvpx/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d (Tue Jan 02 20:08:06 2024).
+  release: 7fb8ceccf92c35cd5131b05c0502916715ebc76b (Fri Mar 15 01:11:50 2024).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d
+  revision: 7fb8ceccf92c35cd5131b05c0502916715ebc76b
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -53,8 +53,10 @@ vendoring:
     - tools/
 
   patches:
+    - arm_cpu_runtime_detection_code_on_openbsd.patch
     - input_frame_validation.patch
     - input_frame_validation_vp9.patch
+    - missing_header.patch
 
   update-actions:
     - action: move-file
diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild
index 2960dee255..1ad5d4447c 100644
--- a/media/libvpx/sources.mozbuild
+++ b/media/libvpx/sources.mozbuild
@@ -934,6 +934,7 @@ files = {
     'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_error_sve.c',
     'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c',
     'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
@@ -1006,6 +1007,7 @@ files = {
     'libvpx/vpx_dsp/arm/subpel_variance_neon.c',
     'libvpx/vpx_dsp/arm/subtract_neon.c',
     'libvpx/vpx_dsp/arm/sum_squares_neon.c',
+    'libvpx/vpx_dsp/arm/sum_squares_sve.c',
     'libvpx/vpx_dsp/arm/variance_neon.c',
     'libvpx/vpx_dsp/arm/variance_neon_dotprod.c',
     'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c',
@@ -1014,8 +1016,6 @@ files = {
     'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c',
     'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c',
     'libvpx/vpx_dsp/arm/vpx_convolve_neon.c',
-    'libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c',
-    'libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c',
     'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c',
     'libvpx/vpx_dsp/avg.c',
     'libvpx/vpx_dsp/bitreader.c',